In [1]:
from __future__ import division
import pymongo
import json

In [2]:
# Names of the MongoDB collections examined in this notebook; the cells that
# follow look each one up in the database by name.
data_files = [
    'assigned_to',
    'bug_status',
    'cc',
    'component',
    'op_sys',
    'priority',
    'product',
    'reports',
    'resolution',
    'severity',
    'short_desc',
    'version',
]

In [3]:
# Open a client with the driver's default connection settings and select the
# 'bugs' database; the later cells reach every collection through `db`.
mc = pymongo.MongoClient()
db = mc.get_database('bugs')

How many records (and unique bug ids) are in different tables

In [31]:
for json_file in data_files:
    coll = db[json_file]
    print '{:12s} {:9,.0f} ({:9,.0f} unique ids)'.format(json_file, coll.count(), len(coll.distinct('id'))) 

assigned_to    582,668 (  394,878 unique ids)
bug_status   1,026,480 (  394,878 unique ids)
cc           1,648,136 (  394,878 unique ids)
component      509,746 (  394,878 unique ids)
op_sys         442,897 (  394,878 unique ids)
priority       436,623 (  394,878 unique ids)
product        435,628 (  394,878 unique ids)
reports        394,878 (  394,878 unique ids)
resolution     800,080 (  394,878 unique ids)
severity       434,078 (  394,878 unique ids)
short_desc     491,657 (  394,878 unique ids)
version        469,073 (  394,878 unique ids)


What kind of information is stored in different tables

In [19]:
for json_file in data_files:
    coll = db[json_file]
    first_line = eval(str(coll.find_one({}, {'_id': False}))) #filter out _id assigned by mongodb
    print json_file
    for key, val in first_line.iteritems():
        print '\t{:19s} {:18s} {:12s}'.format(key, type(val), str(val)) 
    print

assigned_to
	what                <type 'NoneType'>  None        
	who                 <type 'int'>       111157      
	when                <type 'int'>       1111425133  
	id                  <type 'unicode'>   287144      

bug_status
	what                <type 'unicode'>   UNCONFIRMED 
	who                 <type 'int'>       111157      
	when                <type 'int'>       1111425133  
	id                  <type 'unicode'>   287144      

cc
	what                <type 'unicode'>               
	who                 <type 'int'>       111157      
	when                <type 'int'>       1111425133  
	id                  <type 'unicode'>   287144      

component
	what                <type 'unicode'>   General     
	who                 <type 'int'>       111157      
	when                <type 'int'>       1111425133  
	id                  <type 'unicode'>   287144      

op_sys
	what                <type 'unicode'>   Linux       
	who                 <type 'int'>       111157      

How many records are in each table for a single bug

In [24]:
sample_id = '287144'
for json_file in data_files:
    coll = db[json_file]
    count = coll.find({'id': sample_id}, {'_id': False}).count()
    print '{:19s} {:18s}'.format(json_file, str(count))

	assigned_to         1                 
	bug_status          4                 
	cc                  3                 
	component           1                 
	op_sys              1                 
	priority            1                 
	product             1                 
	reports             1                 
	resolution          4                 
	severity            1                 
	short_desc          2                 
	version             1                 


In [36]:
sample_id = '287144'
#sample_when = 1111425133
for json_file in data_files:
    coll = db[json_file]
    lines = coll.find({'id': sample_id}, {'_id': False})
    print json_file
    for line in lines:
        print line
    #for key, val in first_line.iteritems():
    #    print '\t{:19s} {:18s} {:12s}'.format(key, type(val), str(val)) 
    print

assigned_to
{u'what': None, u'who': 111157, u'when': 1111425133, u'id': u'287144'}

bug_status
{u'what': u'UNCONFIRMED', u'who': 111157, u'when': 1111425133, u'id': u'287144'}
{u'what': u'RESOLVED', u'who': 111157, u'when': 1136823088, u'id': u'287144'}
{u'what': u'UNCONFIRMED', u'who': 188489, u'when': 1136823115, u'id': u'287144'}
{u'what': u'RESOLVED', u'who': 188489, u'when': 1136823131, u'id': u'287144'}

cc
{u'what': u'', u'who': 111157, u'when': 1111425133, u'id': u'287144'}
{u'what': u'bugzilla@spray.se', u'who': 25585, u'when': 1111446775, u'id': u'287144'}
{u'what': u'rflint@dslr.net', u'who': 188489, u'when': 1136823131, u'id': u'287144'}

component
{u'what': u'General', u'who': 111157, u'when': 1111425133, u'id': u'287144'}

op_sys
{u'what': u'Linux', u'who': 111157, u'when': 1111425133, u'id': u'287144'}

priority
{u'what': None, u'who': 111157, u'when': 1111425133, u'id': u'287144'}

product
{u'what': u'Firefox', u'who': 111157, u'when': 1111425133, u'id': u'287144'}

rep

The 'what' key is the important thing in each table:
- **assigned_to** - mostly (always?) None
- **bug_status** - mostly (always?) 'UNCONFIRMED'
- **cc** - mostly (always?) reporter
- **component** - 'General' ...
- **op_sys** - 'Linux' ...
- **priority** - None ...
- **product** - 'Firefox' ...
- <strike>reports - what is not a valid key in this file</strike>
- **resolution** - mostly (always?) '' (empty)
- **severity** - 'normal' ...
- **short_desc** - text
- **version** - 'unspecified' ...

In [4]:
coll_reports = db['reports']
for report in coll_reports.find():
    print report['id'], report['opening'], report['reporter']
    break

287144 1111425133 111157


In [18]:
# For every report, look up the assigned_to record whose 'when' equals the
# report's 'opening' value, collect the distinct 'what' values and count how
# many of them are None.
coll_assigned = db['assigned_to']
all_assigned = set()
not_asigned = 0       # reports whose matching record has 'what' set to None
missing_assigned = 0  # reports with no matching assigned_to record at all

# Only 'id' and 'opening' are read below, so nothing else is fetched.
# no_cursor_timeout=True means the cursor has to be closed explicitly.
cursor = coll_reports.find({}, {'_id': False, 'id': True, 'opening': True},
                           no_cursor_timeout=True)
try:
    for report in cursor:
        # One query per report. NOTE(review): this is presumably slow unless
        # assigned_to has an index covering ('id', 'when') -- confirm.
        assigned = coll_assigned.find_one(
            {'id': report['id'], 'when': report['opening']},
            {'_id': False, 'what': True})
        if assigned is None:
            # find_one() returns None when nothing matches; count and skip
            # rather than fail on assigned['what'].
            missing_assigned += 1
            continue
        all_assigned.add(assigned['what'])
        if assigned['what'] is None:
            not_asigned += 1
finally:
    # Runs even when the loop raises or is interrupted from the keyboard, so
    # the no-timeout cursor is always closed.
    cursor.close()

KeyboardInterrupt: 