### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load the credentials from a file that is not shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.

In [1]:
import pandas

import plotly as plotly
import plotly.graph_objs as go

import util as ut

from util import ESConnection

# Connection object handed to every ut.create_search() call below.
# Per the introduction above, credentials come from a local '.settings' file.
es_conn = ESConnection()

In [2]:
# Inclusive lower bound ('gte') of the date range filter applied by every query below
initial_date = '2010-01-01'

# Git Activity
## Git: Total Number of commits authored
Commits are contributions in terms of Git. Looking at them we can measure not only global activity of projects and organizations, but also how these projects and organizations evolve through time.

In [3]:
# Distinct commits (counted by hash) per project and per quarter.
# The date range starts at `initial_date` and is bounded above by 'now/y'
# (Elasticsearch date math rounded to the year).
s = ut.create_search(es_conn, 'git')
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter -> cardinality of the commit hash
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits', group_column='Project')

## Git: Number of commits authored by Non-Employees

In [4]:
# Distinct commits (counted by hash) per project and per quarter, NON-EMPLOYEES ONLY:
# commits whose author organization is 'Mozilla Staff' or 'Code Sheriff' are excluded.
s = ut.create_search(es_conn, 'git')
s = (
    s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)

# project -> quarter -> cardinality of the commit hash
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits',
                     group_column='Project')

## Git: Number of commits authored by Employees

In [5]:
# Distinct commits (counted by hash) per project and per quarter, EMPLOYEES ONLY:
# only commits whose author organization is 'Mozilla Staff' or 'Code Sheriff' are kept.
s = ut.create_search(es_conn, 'git')
s = (
    s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .filter('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)

# project -> quarter -> cardinality of the commit hash
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Commits',
    group_field='project', subgroup_field='time', value_field='contributions',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Commits', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Commits',
                     group_column='Project')

**Figure above: commits by project using Spreadsheet based project grouping**

## Git: Number of commits authored by Organization 

In [6]:
# Distinct commits (counted by hash) per author organization and per quarter.
s = ut.create_search(es_conn, 'git')
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# organization (terms aggregation limited to 10 buckets) -> quarter -> cardinality of the hash
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=10)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
    .metric('contributions', 'cardinality', field='hash', precision_threshold=100000)
)

result = s.execute()

# The organizations listed in `staff_org_names` are reported under the `staff_org`
# label, judging by the mapping this call prints -- confirm in util.stack_by_cusum.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='# Contributions',
    group_field='organizations',
    subgroup_field='time',
    metric_field='contributions',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

# Plot it
ut.print_stacked_bar(
    df=df, time_column='Time', value_column='# Contributions', group_column='Organization'
)

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


# GitHub
## GitHub: Pull Requests by Organization

In [32]:
# GitHub pull requests per author organization and per quarter.
s = ut.create_search(es_conn, 'github_issues')

# Keep only the documents flagged as pull requests
s = s.filter('terms', pull_request=['true'])
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# organization (up to 100 buckets) -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='PRs',
    group_field='organizations',
    subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

# Plot it
ut.print_stacked_bar(df=df, time_column='Time', value_column='PRs', group_column='Organization')

Mozilla Staff -> Employees
Community -> Non-Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


## GitHub: Pull Requests by Project

In [37]:
# GitHub pull requests per project and per quarter.
s = ut.create_search(es_conn, 'github_issues')

# Get only PRs
s = s.filter('terms', pull_request=['true'])
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='PRs',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='PRs', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='PRs', group_column='Project')

## GitHub: Pull Requests by Project Non-Employees Only

In [43]:
# GitHub pull requests per project and per quarter, NON-EMPLOYEES ONLY.
s = ut.create_search(es_conn, 'github_issues')

# Get only PRs whose author organization is neither 'Mozilla Staff' nor 'Code Sheriff'
s = (
    s.filter('terms', pull_request=['true'])
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='PRs',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='PRs', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='PRs', group_column='Project')

## GitHub: Issues by Organization

In [33]:
# GitHub issues (pull requests left out) per author organization and per quarter.
s = ut.create_search(es_conn, 'github_issues')

# Keep only the documents that are not pull requests
s = s.filter('terms', pull_request=['false'])
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# organization (up to 100 buckets) -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='Issues',
    group_field='organizations',
    subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

# Plot it
ut.print_stacked_bar(df=df, time_column='Time', value_column='Issues', group_column='Organization')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Community -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


## GitHub: Issues by Project

In [40]:
# GitHub issues (pull requests left out) per project and per quarter.
s = ut.create_search(es_conn, 'github_issues')

# Get only Issues
s = s.filter('terms', pull_request=['false'])
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Issues',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Issues', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Issues', group_column='Project')

## GitHub: Issues by Project Non-Employees only

In [44]:
# GitHub issues (pull requests left out) per project and per quarter, NON-EMPLOYEES ONLY.
s = ut.create_search(es_conn, 'github_issues')

# Get only Issues whose author organization is neither 'Mozilla Staff' nor 'Code Sheriff'
s = (
    s.filter('terms', pull_request=['false'])
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Issues',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Issues', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Issues', group_column='Project')

# Bugzilla
## Bugzilla: Bugs by Project 

In [10]:
# Bugzilla bugs per project and per quarter, dated by `creation_ts`.
s = ut.create_search(es_conn, 'bugzilla')
s = s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Bugs',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Bugs', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Bugs', group_column='Project')

## Bugzilla: Bugs by Project Non-employees only

In [45]:
# Bugzilla bugs per project and per quarter, NON-EMPLOYEES ONLY:
# bugs whose author organization is 'Mozilla Staff' or 'Code Sheriff' are excluded.
s = ut.create_search(es_conn, 'bugzilla')
s = (
    s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Bugs',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Bugs', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Bugs', group_column='Project')

**Above: Bugzilla bugs over time by Project**

## Bugzilla: Bugs by Organization

In [8]:
# Bugzilla bugs per author organization and per quarter, dated by `creation_ts`.
s = ut.create_search(es_conn, 'bugzilla')
s = s.filter('range', creation_ts={'gte': initial_date, 'lt': 'now/y'})

# organization (up to 100 buckets) -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='creation_ts', interval='quarter')
)

result = s.execute()

# `df` is plotted by the following cell, so the name is part of this cell's contract
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='Bugs',
    group_field='organizations',
    subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


In [9]:
# Stacked bar chart of the per-organization bug frame `df` built in the previous cell
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='Bugs',
    group_column='Organization',
)

**Above: Bugzilla bugs over time by organization**

# Mailing lists:
## Number of e-mails sent by project



In [18]:
# Mailing-list messages (index 'mbox') per project and per quarter.
s = ut.create_search(es_conn, 'mbox')
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='E-Mails',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='E-Mails', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='E-Mails', group_column='Project')

## Number of e-mails sent by project Non-employees only

In [46]:
# Mailing-list messages (index 'mbox') per project and per quarter, NON-EMPLOYEES ONLY:
# messages whose author organization is 'Mozilla Staff' or 'Code Sheriff' are excluded.
s = ut.create_search(es_conn, 'mbox')
s = (
    s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='E-Mails',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='E-Mails', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='E-Mails', group_column='Project')

## Number of e-mails sent by organization

In [11]:
# Mailing-list messages (index 'mbox') per author organization and per quarter.
s = ut.create_search(es_conn, 'mbox')

# Count of E-mails by Org
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# organization (up to 100 buckets) -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# 'Code Sheriff' is listed as a staff organization here as well, matching the
# other organization charts and the non-employee e-mail query, which both treat
# it as staff. The list previously held 'Mozilla Staff' only.
# `df` is plotted by the following cell, so the name is part of this cell's contract.
df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='E-Mails',
    group_field='organizations',
    subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

Mozilla Staff -> Employees
Community -> Non-Employees
Mozilla Reps -> Non-Employees


In [12]:
# Stacked bar chart of the per-organization e-mail frame `df` built in the previous cell
ut.print_stacked_bar(
    df=df,
    time_column='Time',
    value_column='E-Mails',
    group_column='Organization',
)

# Discourse
## Discourse: Messages by organization


In [47]:
# Discourse messages per author organization and per quarter.
s = ut.create_search(es_conn, 'discourse')
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# organization (up to 100 buckets) -> quarter
(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

df = ut.stack_by_cusum(
    result=result,
    group_column='Organization',
    subgroup_column='Time',
    value_column='Messages',
    group_field='organizations',
    subgroup_field='time',
    staff_org_names=['Mozilla Staff', 'Code Sheriff'],
    staff_org='Employees',
)

# Plot it
ut.print_stacked_bar(df=df, time_column='Time', value_column='Messages', group_column='Organization')

Community -> Non-Employees
Mozilla Staff -> Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees


## Discourse: Messages by Project

In [28]:
# Discourse messages per project and per quarter.
s = ut.create_search(es_conn, 'discourse')
s = s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Messages',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Messages', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Messages', group_column='Project')

## Discourse: Messages by project Non-employees only

In [49]:
# Discourse messages per project and per quarter, NON-EMPLOYEES ONLY:
# messages whose author organization is 'Mozilla Staff' or 'Code Sheriff' are excluded.
s = ut.create_search(es_conn, 'discourse')
s = (
    s.filter('range', grimoire_creation_date={'gte': initial_date, 'lt': 'now/y'})
    .exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])
)

# project -> quarter; no metric is attached to the buckets
(
    s.aggs
    .bucket('project', 'terms', field='project', size=100000)
    .bucket('time', 'date_histogram', field='grimoire_creation_date', interval='quarter')
)

result = s.execute()

# No value_field is passed -- presumably stack_by falls back to the bucket
# document count; confirm in util.stack_by.
projects_df = ut.stack_by(
    result=result,
    group_column='Project', subgroup_column='Time', value_column='Messages',
    group_field='project', subgroup_field='time',
)

# Remove 'Unknown' project entries
projects_df = projects_df.loc[projects_df['Project'] != 'Unknown']

# Descending order; `ascending` takes a boolean, so use False rather than 0
projects_df = projects_df.sort_values(by='Messages', ascending=False)

# Plot it
ut.print_stacked_bar(df=projects_df, time_column='Time', value_column='Messages', group_column='Project')