### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load the credentials from a file that is not shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.



In [1]:
import pandas

import util as ut

from util import ESConnection

# Connection object handed to every ut.create_search() call in this notebook.
# Per the introduction above, credentials are read from a local '.settings'
# file (see 'settings.sample').
es_conn = ESConnection()

In [2]:
# Lower bound ('gte') of the date range filters used by the "over time" queries.
initial_date = '2010-01-01'

# List of Projects

## List of projects: Git

In [3]:
# Commits per project, broken down by the author's organization (git index).
s = ut.create_search(es_conn, 'git')

# Nested aggregation: project -> organization -> distinct commit hashes.
(
    s.aggs
    .bucket('projects', 'terms', field='project', size=100000)
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('commits', 'cardinality', field='hash', precision_threshold=1000000)
)
result = s.execute()

# Turn the aggregation result into a table keyed by project and organization.
projects_df = ut.stack_by(
    result=result,
    group_field='projects',
    subgroup_field='organizations',
    value_field='commits',
    group_column='Project',
    subgroup_column='Org',
    value_column='# Commits',
)

ut.print_table(projects_df, filename='github-projects-table.html')


# Authors by Project

## Authors by Project: Git

In [4]:
# Authors per project, broken down by the author's organization (git index).
s = ut.create_search(es_conn, 'git')

# Nested aggregation: project -> organization -> distinct author uuids.
(
    s.aggs
    .bucket('projects', 'terms', field='project', size=100000)
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('authors', 'cardinality', field='author_uuid', precision_threshold=1000000)
)
result = s.execute()

# Turn the aggregation result into a table keyed by project and organization.
projects_df = ut.stack_by(
    result=result,
    group_field='projects',
    subgroup_field='organizations',
    value_field='authors',
    group_column='Project',
    subgroup_column='Org',
    value_column='Authors',
)

ut.print_table(projects_df, 'github-authors-table.html')


**Table above: Git Authors by Project using Spreadsheet Data**

## Authors by Project: Bugzilla

We use Product and Component to assign project name to Bugzilla entries.

### TODO: enrich index with project information

In [11]:
######
# Bugzilla authors by project, split into employees and non-employees.
#
# Trying to get everything in a single query results in a 502 error, so the
# Mozilla staff and the remaining authors are retrieved with two separate
# queries and the partial results are merged afterwards.
#
# NOTE(review): `projects` is not defined in any cell visible in this notebook.
# This cell needs projects['Bugzilla'] to be a DataFrame with 'Product',
# 'Component' and 'Project' columns (presumably the spreadsheet data mentioned
# in the table captions) -- confirm where it is loaded before running.
######


def _bugzilla_contributors_df(result):
    """Flatten a product > component > contributors aggregation result.

    Returns a DataFrame with the columns 'Product', 'Component' and 'uuid',
    holding one row per contributor bucket. Rows are gathered in a list and
    the frame is built once at the end: growing a DataFrame one row at a time
    is very slow for the hundreds of thousands of rows handled here.

    A running row count is printed every 10000 rows as a progress indicator.
    """
    rows = []
    for product in result.to_dict()['aggregations']['product']['buckets']:
        for component in product['component']['buckets']:
            for author in component['contributors']['buckets']:
                rows.append((product['key'], component['key'], author['key']))
                if len(rows) % 10000 == 0:
                    print(len(rows), end=', ')
    return pandas.DataFrame(rows, columns=['Product', 'Component', 'uuid'])


######
# First retrieve Mozilla employees
######
s = ut.create_search(es_conn, 'bugzilla')

s = s.filter('terms', author_org_name=['Mozilla Staff'])

# Contributors by product and component
s.aggs.bucket('product', 'terms', field='product', size=100000)\
    .bucket('component', 'terms', field='component', size=100000)\
    .bucket('contributors', 'terms', field='author_uuid', size=100000)
result = s.execute()

print('q1')

moz_df = _bugzilla_contributors_df(result)

print('Moz: ', len(moz_df))

# Map each (Product, Component) pair to its project; the left join keeps
# contributors whose product/component has no project assigned.
moz_merged_df = moz_df.merge(projects['Bugzilla'], on=['Product', 'Component'], how='left')
moz_merged_df['Org'] = 'Employees'

print('Moz merged: ', len(moz_merged_df))


####
# Second get results for non-employees
###
s = ut.create_search(es_conn, 'bugzilla')

s = s.exclude('terms', author_org_name=['Mozilla Staff'])

# Contributors by product and component
s.aggs.bucket('product', 'terms', field='product', size=100000)\
    .bucket('component', 'terms', field='component', size=100000)\
    .bucket('contributors', 'terms', field='author_uuid', size=100000)
result = s.execute()

print('q2')

non_moz_df = _bugzilla_contributors_df(result)

print('Non-moz: ', len(non_moz_df))

# Same project mapping as for the employees
non_moz_merged_df = non_moz_df.merge(projects['Bugzilla'], on=['Product', 'Component'], how='left')
non_moz_merged_df['Org'] = 'Non-Employees'

print('Non-moz merged: ', len(non_moz_merged_df))

###
# Concat both data frames into a single one
###
merged_df = pandas.concat([moz_merged_df, non_moz_merged_df])

print('After concat')


# Group by project and org, counting distinct contributors, products and components
projects_df = merged_df.groupby(['Project', 'Org']).agg({'uuid': 'nunique',
                                                         'Product': 'nunique',
                                                         'Component': 'nunique'})
projects_df = projects_df.sort_values(by='uuid', ascending=False)

ut.print_table(projects_df.reset_index(), filename='bugzilla-projects-contributors-table.html')

q1
10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, Moz:  86562
Moz merged:  86562
q2
10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000, 320000, 330000, 340000, 350000, 360000, Non-moz:  365549
Non-moz merged:  365549
After concat


**Table above: Bugzilla Authors by Project using Spreadsheet Data**

# Number of Authors by project over time
## Authors by project over time: Git


In [7]:
# Distinct git authors per project and quarter, from initial_date up to
# (but excluding) whatever 'now/y' resolves to in Elasticsearch date math.
s = ut.create_search(es_conn, 'git').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# Nested aggregation: project -> quarter -> distinct author uuids.
(
    s.aggs
    .bucket('projects', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('authors', 'cardinality', field='author_uuid', precision_threshold=1000000)
)
result = s.execute()

projects_evo_df = ut.stack_by(
    result=result,
    group_field='projects',
    subgroup_field='time',
    value_field='authors',
    group_column='Project',
    subgroup_column='Time',
    value_column='Authors',
)

# Keep only rows that belong to a known project.
projects_evo_df = projects_evo_df[projects_evo_df['Project'].ne('Unknown')]

# Stacked bar chart: one bar per quarter, one segment per project.
ut.print_stacked_bar(
    df=projects_evo_df,
    time_column='Time',
    value_column='Authors',
    group_column='Project',
)

**Git authors over time using Spreadsheet Data**

## Authors by Project over Time: Bugzilla

### TODO: enrich index with project information

In [23]:
# Bugzilla authors by project and quarter.
#
# NOTE(review): `projects` is not defined in any cell visible in this notebook.
# This cell needs projects['Bugzilla'] to be a DataFrame with 'Product',
# 'Component' and 'Project' columns -- confirm where it is loaded before running.
s = ut.create_search(es_conn, 'bugzilla')

s = s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})

# Contributors by product, component and quarter
s.aggs.bucket('product', 'terms', field='product', size=100000)\
    .bucket('component', 'terms', field='component', size=100000)\
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')\
    .bucket('contributors', 'terms', field='author_uuid', size=100000)
result = s.execute()

print('q1')

# Flatten the nested buckets into rows. The rows are gathered in a list and the
# DataFrame is built once at the end: growing a DataFrame one row at a time is
# very slow for the hundreds of thousands of rows handled here.
evo_rows = []
for product in result.to_dict()['aggregations']['product']['buckets']:
    for component in product['component']['buckets']:
        for quarter in component['time']['buckets']:
            for author in quarter['contributors']['buckets']:
                evo_rows.append((product['key'], component['key'],
                                 quarter['key_as_string'], author['key']))
                # Progress indicator
                if len(evo_rows) % 10000 == 0:
                    print(len(evo_rows), end=', ')

repos_evo_df = pandas.DataFrame(evo_rows, columns=['Product', 'Component', 'Time', 'uuid'])

# Map each (Product, Component) pair to its project; the left join keeps
# contributors whose product/component has no project assigned.
merged_evo_df = repos_evo_df.merge(projects['Bugzilla'], on=['Product', 'Component'], how='left')

# Group by project and quarter, counting distinct contributors, products and components
projects_evo_df = merged_evo_df.groupby(['Project', 'Time']).agg({'uuid': 'nunique',
                                                                  'Product': 'nunique',
                                                                  'Component': 'nunique'})
projects_evo_df = projects_evo_df.sort_values(by='uuid', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_evo_df.reset_index(), time_column='Time', value_column='uuid',
                  group_column='Project')

q1
10000, 20000, 30000, 40000, 50000, 60000, 70000, 80000, 90000, 100000, 110000, 120000, 130000, 140000, 150000, 160000, 170000, 180000, 190000, 200000, 210000, 220000, 230000, 240000, 250000, 260000, 270000, 280000, 290000, 300000, 310000, 

# List of organizations
## List of Organizations: Git


In [12]:
# Distinct commits per author organization (git index).
s = ut.create_search(es_conn, 'git')

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('commits', 'cardinality', field='hash', precision_threshold=100000)
)
result = s.execute()

In [13]:
# Show the commits-per-organization aggregation as a two-column table.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='commits',
    group_column='Organization',
    value_column='# Commits',
)

Unnamed: 0,Organization,# Commits
0,Mozilla Staff,1140468
1,Community,438800
2,Code Sheriff,45219


# Contributors by organization
## Contributors by Organization: Git

In [14]:
# Distinct contributors (author uuids) per author organization (git index).
s = ut.create_search(es_conn, 'git')

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

In [15]:
# Show the contributors-per-organization aggregation as a two-column table.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='contributors',
    group_column='Organization',
    value_column='# Contributors',
)

Unnamed: 0,Organization,# Contributors
0,Mozilla Staff,2010
1,Community,13027
2,Code Sheriff,8


## Contributors by Organization: Bugzilla

In [16]:
# Distinct contributors (author uuids) per author organization (bugzilla index).
# ut.add_bot_filter presumably removes bot accounts from the search -- see util.
s = ut.add_bot_filter(ut.create_search(es_conn, 'bugzilla'))

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

In [17]:
# Show the contributors-per-organization aggregation as a two-column table.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='contributors',
    group_column='Organization',
    value_column='# Contributors',
)

Unnamed: 0,Organization,# Contributors
0,Community,182365
1,Mozilla Staff,2801
2,Code Sheriff,8
3,Mozilla Reps,3


# Number of contributors by organization over time
## Contributors by Org over Time: Git

In [7]:
# Distinct git contributors per organization and quarter, from initial_date up
# to (but excluding) whatever 'now/y' resolves to in Elasticsearch date math.
s = ut.create_search(es_conn, 'git').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# Nested aggregation: organization -> quarter -> distinct author uuids.
(
    s.aggs
    .bucket('org', 'terms', field='author_org_name', size=10)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

# The organizations named in staff_org_names are passed as staff and labelled
# with staff_org; how the values are accumulated is defined in ut.stack_by_cusum.
authors_org_df = ut.stack_by_cusum(
    result=result,
    group_field='org',
    subgroup_field='time',
    metric_field='contributors',
    group_column='Organization',
    subgroup_column='Time',
    value_column='# Contributors',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)


Mozilla Staff -> Employees
Community -> Non-Employees
Code Sheriff -> Employees


In [8]:
# Stacked bars: contributors per quarter, one segment per organization.
ut.print_stacked_bar(
    df=authors_org_df,
    time_column='Time',
    value_column='# Contributors',
    group_column='Organization',
)

In [9]:
# Grouped bars: same data as the stacked chart, organizations side by side.
ut.print_grouped_bar(
    df=authors_org_df,
    time_column='Time',
    value_column='# Contributors',
    group_column='Organization',
)

## Contributors by Org over Time: Bugzilla

In [10]:
# Distinct Bugzilla contributors per organization and quarter.
# ut.add_bot_filter presumably removes bot accounts from the search -- see util.
s = ut.add_bot_filter(ut.create_search(es_conn, 'bugzilla'))

# Restrict to entries created from initial_date up to (but excluding) whatever
# 'now/y' resolves to in Elasticsearch date math.
s = s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})

# Nested aggregation: organization -> quarter -> distinct author uuids.
(
    s.aggs
    .bucket('org', 'terms', field='author_org_name', size=10)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

# The organizations named in staff_org_names are passed as staff and labelled
# with staff_org; how the values are accumulated is defined in ut.stack_by_cusum.
authors_org_df = ut.stack_by_cusum(
    result=result,
    group_field='org',
    subgroup_field='time',
    metric_field='contributors',
    group_column='Organization',
    subgroup_column='Time',
    value_column='# Contributors',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


In [11]:
# Stacked bars: contributors per quarter, one segment per organization.
ut.print_stacked_bar(
    df=authors_org_df,
    time_column='Time',
    value_column='# Contributors',
    group_column='Organization',
)