### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load credentials from a file that will not be shared. If you want to run this notebook, use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.

In [1]:
import pandas
import plotly as plotly
import plotly.graph_objs as go
from elasticsearch_dsl import Q

import util as ut
from util import ESConnection

# Connection object handed to ut.create_search() by the query cells below.
# Per the introduction, credentials are loaded from a local '.settings' file.
es_conn = ESConnection()

In [2]:
# Inclusive lower bound ('gte') of the date range filter applied by the queries below.
initial_date = '2010-01-01'

# Git Activity
## Git: Total Number of commits authored
Commits are contributions in terms of Git. Looking at them we can measure not only global activity of projects and organizations, but also how these projects and organizations evolve through time.

In [6]:
# Commits per project and quarter, counted as distinct values of the 'hash' field.
s = ut.create_search(es_conn, 'git').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# project -> quarter -> cardinality(hash)
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Discard rows of the 'Unknown' project, then order by descending commit count.
projects_df = (
    projects_df
    .loc[projects_df['Project'] != 'Unknown']
    .sort_values(by='Commits', ascending=False)
)

# Stacked bar chart: one bar per quarter, one series per project.
ut.print_stacked_bar(
    df=projects_df,
    time_column='Time',
    value_column='Commits',
    group_column='Project',
)

## Git: Number of commits authored by Non-Employees

In [8]:
# Commits per project and quarter, restricted to authors whose organization
# is 'Community' (the notebook's "Non-Employees" group).
s = (
    ut.create_search(es_conn, 'git')
    .filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .filter('term', author_org_name='Community')
)

# project -> quarter -> cardinality(hash)
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Discard rows of the 'Unknown' project, then order by descending commit count.
projects_df = (
    projects_df
    .loc[projects_df['Project'] != 'Unknown']
    .sort_values(by='Commits', ascending=False)
)

# Stacked bar chart: one bar per quarter, one series per project.
ut.print_stacked_bar(
    df=projects_df,
    time_column='Time',
    value_column='Commits',
    group_column='Project',
)

## Git: Number of commits authored by Employees

In [9]:
# Commits per project and quarter for everything that is NOT authored by the
# 'Community' organization (the notebook's "Employees" group).
# `Q` comes from elasticsearch_dsl and is imported in the notebook's import cell.
#
# NOTE(review): the Non-Employees cell selects 'Community' with a 'term' filter,
# while this cell excludes it with a 'match' query; whether both behave the same
# depends on the field mapping -- confirm. Also, "Employees" here means "any
# organization other than Community", whereas the by-organization cells define
# it as ['Mozilla Staff', 'Code Sheriff'] -- confirm which definition is intended.
filter_community = Q('match', author_org_name='Community')

s = (
    ut.create_search(es_conn, 'git')
    .filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .exclude(filter_community)
)

# project -> quarter -> cardinality(hash)
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Discard rows of the 'Unknown' project, then order by descending commit count.
projects_df = (
    projects_df
    .loc[projects_df['Project'] != 'Unknown']
    .sort_values(by='Commits', ascending=False)
)

# Stacked bar chart: one bar per quarter, one series per project.
ut.print_stacked_bar(
    df=projects_df,
    time_column='Time',
    value_column='Commits',
    group_column='Project',
)

**Figure above: commits by project, using the spreadsheet-based project grouping**

## Git: Number of commits authored by Organization 

In [12]:
# Commits (distinct 'hash' values) per author organization and quarter.
# The terms aggregation requests at most 10 organization buckets (size=10).
s = ut.create_search(es_conn, 'git').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# organization -> quarter -> cardinality(hash)
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=10)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

# Judging by the mapping printed below the cell, stack_by_cusum appears to relabel
# the organizations in staff_org_names as staff_org ('Employees') and the rest as
# 'Non-Employees' -- TODO confirm against util.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', subgroup_column='Time', value_column='# Contributions',
    group_field='organizations', subgroup_field='time', metric_field='contributions',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'], staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Code Sheriff -> Employees


In [13]:
# Stacked bar chart of '# Contributions' over time, one series per organization group.
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='# Contributions',
    group_column='Organization',
)

## GitHub: Issues and Pull Requests by status

The next table shows the number of open and closed issues and pull requests for each **Project**.

**TODO**: provide plots like:
  * PRs: https://analytics.mozilla.community:443/goto/99a2cf4d0e06986fe5886ccafa01c88f
  * Issues: https://analytics.mozilla.community:443/goto/db1d0243582548c8f8a0469f3e099677
  

In [64]:
# NOTE(review): create_search(source=...) and add_bot_filter are called as bare
# names, but no visible cell defines or imports them; the other query cells use
# ut.create_search(es_conn, <source>). This cell presumably predates the move to
# the util module and may raise NameError on a fresh kernel -- confirm and port.

# Open & closed pull requests by project (at most 100 project buckets).
s_prs = add_bot_filter(create_search(source='github_issues'))
s_prs = s_prs.filter('terms', pull_request=['true'])
# project ('project_1' field) -> state
(
    s_prs.aggs
    .bucket('projects', 'terms', field='project_1', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_prs = s_prs.execute()

# Open & closed issues by project (at most 100 project buckets).
s_iss = add_bot_filter(create_search(source='github_issues'))
s_iss = s_iss.filter('terms', pull_request=['false'])
# project ('project_1' field) -> state
(
    s_iss.aggs
    .bucket('projects', 'terms', field='project_1', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_iss = s_iss.execute()

In [74]:
# NOTE(review): stack_by_terms is called as a bare name that no visible cell
# defines or imports (other cells go through the `ut` module) -- confirm.

# One row per (Project, Status) with the bucket document count as the value.
prs_df = stack_by_terms(
    result=result_prs,
    group_column='Project', subgroup_column='Status', value_column='# Pull Requests',
    group_field='projects', subgroup_field='status', value_field='doc_count',
)
iss_df = stack_by_terms(
    result=result_iss,
    group_column='Project', subgroup_column='Status', value_column='# Issues',
    group_field='projects', subgroup_field='status', value_field='doc_count',
)

# Outer join keeps (Project, Status) pairs present on only one side;
# their missing counts are filled with 0.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Project', 'Status']).fillna(0)
joined_df

Unnamed: 0,Project,Status,# Pull Requests,# Issues
0,mozilla,closed,25801.0,19346.0
1,mozilla,open,1113.0,6329.0
2,mozilla-services,closed,7493.0,4773.0
3,mozilla-services,open,154.0,1115.0
4,rust-lang,closed,4100.0,2925.0
5,rust-lang,open,242.0,2954.0
6,servo,closed,3177.0,1250.0
7,servo,open,153.0,1085.0
8,mozilla-mobile,closed,3099.0,538.0
9,mozilla-mobile,open,52.0,185.0


### GitHub: Issues and Pull Requests by Organization

Below we show the number of open and closed pull requests and issues **by Organization**:


In [75]:
# NOTE(review): create_search(source=...) is called as a bare name that no visible
# cell defines or imports; the other query cells use ut.create_search(es_conn,
# <source>) -- confirm and port. Unlike the by-project cell, no add_bot_filter is
# applied here -- confirm that this is intended.

# Open & closed pull requests by organization (at most 100 organization buckets).
s_prs = create_search(source='github_issues').filter('terms', pull_request=['true'])
# organization -> state
(
    s_prs.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_prs = s_prs.execute()

# Open & closed issues by organization (at most 100 organization buckets).
s_iss = create_search(source='github_issues').filter('terms', pull_request=['false'])
# organization -> state
(
    s_iss.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_iss = s_iss.execute()

In [76]:
# NOTE(review): stack_by_terms is called as a bare name that no visible cell
# defines or imports (other cells go through the `ut` module) -- confirm.

# One row per (Organization, Status) with the bucket document count as the value.
prs_df = stack_by_terms(
    result=result_prs,
    group_column='Organization', subgroup_column='Status', value_column='# Pull Requests',
    group_field='organizations', subgroup_field='status', value_field='doc_count',
)
iss_df = stack_by_terms(
    result=result_iss,
    group_column='Organization', subgroup_column='Status', value_column='# Issues',
    group_field='organizations', subgroup_field='status', value_field='doc_count',
)

# Outer join keeps (Organization, Status) pairs present on only one side;
# their missing counts are filled with 0.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Organization', 'Status']).fillna(0)
joined_df

Unnamed: 0,Organization,Status,# Pull Requests,# Issues
0,Community,closed,146388.0,74177.0
1,Community,open,2127.0,18764.0
2,Mozilla Corporation,closed,11827.0,7943.0
3,Mozilla Corporation,open,165.0,2787.0
4,Mozilla Staff,closed,10935.0,8308.0
5,Mozilla Staff,open,418.0,2801.0
6,Unknown,closed,6174.0,7268.0
7,Unknown,open,256.0,2460.0
8,Catalyst,closed,37.0,27.0
9,"Adobe Systems, Inc.",closed,35.0,31.0


### GitHub: Issues and Pull Requests made by people hired by Mozilla

To compare contributors **hired by Mozilla** with the rest of the contributors, we first show the list of organizations and whether each one is considered 'Mozilla Staff' or 'Other'. A table with aggregated numbers is then shown to compare both contributor groups.

In [79]:
# NOTE(review): this cell issues the same two queries as the previous
# "by Organization" section, and create_search(source=...) is a bare name that no
# visible cell defines or imports (other cells use ut.create_search(es_conn,
# <source>)) -- confirm, port, and consider reusing the earlier results.

# Open & closed pull requests by organization (at most 100 organization buckets).
s_prs = create_search(source='github_issues').filter('terms', pull_request=['true'])
# organization -> state
(
    s_prs.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_prs = s_prs.execute()

# Open & closed issues by organization (at most 100 organization buckets).
s_iss = create_search(source='github_issues').filter('terms', pull_request=['false'])
# organization -> state
(
    s_iss.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('status', 'terms', field='state', size=100)
)
result_iss = s_iss.execute()

In [86]:
# NOTE(review): stack_by_terms_cusum is called as a bare name that no visible cell
# defines or imports (other cells go through the `ut` module) -- confirm.
# Judging by the recorded output, each call prints an "<organization> -> <group>"
# line per organization, so the headers are printed right before each call.

print("\nPRS\n")
prs_df = stack_by_terms_cusum(
    result=result_prs,
    group_column='Organization', subgroup_column='Status', value_column='# Pull Requests',
    group_field='organizations', subgroup_field='status', value_field='doc_count',
    staff_org_names=['Mozilla Staff'], staff_org='Mozilla Staff',
)

print("\nISSUES\n")
iss_df = stack_by_terms_cusum(
    result=result_iss,
    group_column='Organization', subgroup_column='Status', value_column='# Issues',
    group_field='organizations', subgroup_field='status', value_field='doc_count',
    staff_org_names=['Mozilla Staff'], staff_org='Mozilla Staff',
)

# Outer join keeps (Organization, Status) pairs present on only one side;
# their missing counts are filled with 0.
joined_df = pandas.merge(prs_df, iss_df, how='outer', on=['Organization', 'Status']).fillna(0)
joined_df


PRS

Community -> Other
Mozilla Corporation -> Other
Mozilla Staff -> Mozilla Staff
Unknown -> Other
Catalyst -> Other
Adobe Systems, Inc. -> Other
Mozilla Reps -> Other
MIT -> Other
Apple -> Other
Canonical, Ltd. -> Other
Debian GNU/Linux -> Other
Cloudscaling -> Other
Collabora -> Other
Chef -> Other
Oracle -> Other
University of North Carolina at Chapel Hill -> Other

ISSUES

Community -> Other
Mozilla Staff -> Mozilla Staff
Mozilla Corporation -> Other
Unknown -> Other
Mozilla Reps -> Other
MIT -> Other
Adobe Systems, Inc. -> Other
Catalyst -> Other
Debian GNU/Linux -> Other
Cloudscaling -> Other
Canonical, Ltd. -> Other
Collabora -> Other
Google, Inc. -> Other
Bitergia -> Other
Apple -> Other
Aptana, Inc. -> Other
Capital One -> Other
Carnegie Mellon University -> Other
CodeSourcery -> Other
Intel -> Other
Oracle -> Other
The Apache Software Foundation -> Other


Unnamed: 0,Organization,Status,# Pull Requests,# Issues
0,Other,closed,164542.0,89663.0
1,Other,open,2559.0,24219.0
2,Mozilla Staff,closed,10935.0,8308.0
3,Mozilla Staff,open,418.0,2801.0


## Bugzilla: Bugs by Project
### TODO: enrich index with project information
  

In [47]:
# Bugs per project and quarter. The Bugzilla index carries no project field (see
# the TODO above), so bugs are counted per product/component/quarter and then
# mapped to projects through the (Product, Component) table in projects['Bugzilla'].
#
# The search and plot helpers are reached through `ut`, as in the other cells.
s = ut.create_search(es_conn, 'bugzilla').filter(
    'range',
    creation_ts={'gte': initial_date, 'lt': 'now/y'},
)

# product -> component -> quarter; each quarter bucket's doc_count is the bug count.
(
    s.aggs
    .bucket('product', 'terms', field='product', size=100000)
    .bucket('component', 'terms', field='component', size=100000)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
)
result = s.execute()

# Progress marker: the query has returned.
print('q1')

# Flatten the nested buckets into one record per (product, component, quarter) and
# build the frame in a single step, instead of appending row by row.
bug_records = [
    (product['key'], component['key'], quarter['key_as_string'], quarter['doc_count'])
    for product in result.to_dict()['aggregations']['product']['buckets']
    for component in product['component']['buckets']
    for quarter in component['time']['buckets']
]
bugzilla_evo_df = pandas.DataFrame(bug_records, columns=['Product', 'Component', 'Time', 'Bugs'])

# Load the project mapping here so the cell does not depend on a later cell having
# run first. NOTE(review): assumes ut.get_projects() is the source of the
# 'Bugzilla' mapping, as it is for 'Mailing lists' -- confirm.
projects = ut.get_projects()

# Group by project: a left join keeps bugs whose product/component has no mapping.
bugzilla_evo_merged_df = bugzilla_evo_df.merge(projects['Bugzilla'], on=['Product', 'Component'], how='left')

bugzilla_evo_projects_df = (
    bugzilla_evo_merged_df
    .groupby(['Project', 'Time'])
    .agg({'Bugs': 'sum',
          'Product': pandas.Series.nunique,
          'Component': pandas.Series.nunique})
    .sort_values(by='Bugs', ascending=False)
)

# Stacked bar chart: one bar per quarter, one series per project.
ut.print_stacked_bar(
    df=bugzilla_evo_projects_df.reset_index(),
    time_column='Time',
    value_column='Bugs',
    group_column='Project',
)


q1


**Above: Bugzilla bugs over time by Project**

## Bugzilla: Bugs by Organization

In [16]:
# Bugs per author organization and quarter (at most 100 organization buckets).
s = ut.create_search(es_conn, 'bugzilla').filter(
    'range',
    creation_ts={'gte': initial_date, 'lt': 'now/y'},
)

# organization -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
)

result = s.execute()

# Judging by the mapping printed below the cell, organizations listed in
# staff_org_names are relabelled 'Employees' and the rest 'Non-Employees'
# -- TODO confirm against util.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', subgroup_column='Time', value_column='Bugs',
    group_field='organizations', subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'], staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


In [17]:
# Stacked bar chart of 'Bugs' over time, one series per organization group.
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='Bugs',
    group_column='Organization',
)

**Above: Bugzilla bugs over time by organization**

# Mailing lists:
## Number of e-mails sent by project
### TODO: enrich index with project information


In [3]:
# E-mails per project and quarter. Messages are counted per mailing list
# ('origin' field) and then mapped to projects through projects['Mailing lists'].
s = ut.create_search(es_conn, 'mbox').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# list (origin) -> quarter
(
    s.aggs
    .bucket('origin', 'terms', field='origin', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

lists_df = ut.stack_by(
    result=result,
    group_column='List', subgroup_column='Time', value_column='E-mails',
    group_field='origin', subgroup_field='time',
)

# Group by project: a left join keeps lists that have no project mapping.
projects = ut.get_projects()
merged_df = lists_df.merge(projects['Mailing lists'], on='List', how='left')

# Sum the e-mails and count the lists per (Project, Time); largest totals first.
projects_df = (
    merged_df
    .groupby(['Project', 'Time'])
    .agg({'E-mails': 'sum', 'List': 'count'})
    .sort_values(by='E-mails', ascending=False)
)

# Stacked bar chart: one bar per quarter, one series per project.
ut.print_stacked_bar(
    df=projects_df.reset_index(),
    time_column='Time',
    value_column='E-mails',
    group_column='Project',
)

## Number of e-mails sent by organization

In [10]:
# E-mails per author organization and quarter (at most 100 organization buckets).
s = ut.create_search(es_conn, 'mbox').filter(
    'range',
    grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'},
)

# organization -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# Judging by the mapping printed below the cell, 'Mozilla Staff' is relabelled
# 'Employees' and the other organizations 'Non-Employees' -- TODO confirm against util.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization', subgroup_column='Time', value_column='E-Mails',
    group_field='organizations', subgroup_field='time',
    staff_org_names=['Mozilla Staff'], staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Mozilla Reps -> Non-Employees


In [11]:
# Stacked bar chart of 'E-Mails' over time, one series per organization group.
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='E-Mails',
    group_column='Organization',
)

**Discourse**:
  * Number of threads initiated
    * https://analytics.mozilla.community:443/goto/71771202d68a10cc422c6bda86c7cf3e
  * Number of comments posted
    * https://analytics.mozilla.community:443/goto/73c76412902180d14e0418d03fb30884
  
These metrics will be computed for the specified contributor groups, over time.