### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load credentials from a file that will not be shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.



In [1]:
import pandas

import util as ut

from util import ESConnection

# Shared connection object that every query in this notebook hands to ut.create_search().
es_conn = ESConnection()

In [2]:
def authors_by_project_table(source):
    """Print a table of unique author counts per organization and project.

    Runs a nested terms aggregation on ``source``: organizations
    (``author_org_name``, up to 100 buckets) -> projects (``project``, up to
    100000 buckets), with a cardinality metric on ``author_uuid`` in each
    project bucket. The response is reshaped by ``ut.stack_by_cusum`` with
    'Mozilla Staff' and 'Code Sheriff' passed as the staff organizations and
    'Employees' as the staff label (presumably those orgs are folded into
    'Employees' -- confirm in util).

    :source: name forwarded to ``ut.create_search``; also used as the prefix
             of the '<source>-authors-table.html' argument given to
             ``ut.print_table``
    """
    search = ut.create_search(es_conn, source)

    # Organization -> project -> distinct author_uuid count.
    # NOTE(review): Elasticsearch documents 40000 as the maximum supported
    # precision_threshold; larger values are presumably capped -- confirm
    # against the deployed version.
    org_agg = search.aggs.bucket('organizations', 'terms', field='author_org_name', size=100)
    project_agg = org_agg.bucket('projects', 'terms', field='project', size=100000)
    project_agg.metric('authors', 'cardinality', field='author_uuid', precision_threshold=1000000)

    response = search.execute()

    authors_df = ut.stack_by_cusum(
        result=response,
        group_column='Org',
        subgroup_column='Project',
        value_column='Authors',
        group_field='organizations',
        subgroup_field='projects',
        metric_field='authors',
        staff_org_names=['Mozilla Staff', 'Code Sheriff'],
        staff_org='Employees',
    )

    ut.print_table(authors_df, source + '-authors-table.html')

def authors_by_project_evo(source, date_field, filter_prs=None, filter_issues=None, filter_employees=None):
    """Plot unique authors per project, per quarter, as a stacked bar chart.

    Documents are restricted to ``initial_date <= date_field < 'now/y'``
    (``initial_date`` is a notebook-level global; 'now/y' is an Elasticsearch
    date-math expression). Authors are counted with a cardinality metric on
    ``author_uuid`` inside a quarterly date histogram, nested in a terms
    aggregation on ``project`` (up to 100000 buckets). Rows whose project is
    'Unknown' are dropped before plotting.

    :source: name forwarded to ``ut.create_search``
    :date_field: date field used for both the range filter and the histogram
    :filter_prs: retrieve PRs only (``pull_request`` is 'true')
    :filter_issues: retrieve issues only (``pull_request`` is 'false')
    :filter_employees: when truthy, exclude documents whose
                       ``author_org_name`` is 'Mozilla Staff' or
                       'Code Sheriff'
    """
    search = ut.create_search(es_conn, source)

    # Optional PR / issue restriction; both flags are applied if both are set.
    for enabled, flag_value in ((filter_prs, 'true'), (filter_issues, 'false')):
        if enabled:
            search = search.filter('terms', pull_request=[flag_value])

    if filter_employees:
        search = search.exclude('terms', author_org_name=['Mozilla Staff', 'Code Sheriff'])

    # Restrict to the analysed time window.
    date_range = {date_field: {'gte': initial_date, 'lt': 'now/y'}}
    search = search.filter('range', **date_range)

    # Project -> quarter -> distinct author_uuid count.
    project_agg = search.aggs.bucket('projects', 'terms', field='project', size=100000)
    quarter_agg = project_agg.bucket('time', 'date_histogram', field=date_field, interval='quarter')
    quarter_agg.metric('authors', 'cardinality', field='author_uuid', precision_threshold=1000000)

    response = search.execute()

    evo_df = ut.stack_by(
        result=response,
        group_column='Project',
        subgroup_column='Time',
        value_column='Authors',
        group_field='projects',
        subgroup_field='time',
        value_field='authors',
    )

    # Keep only rows that belong to a named project.
    is_known_project = evo_df['Project'] != 'Unknown'
    evo_df = evo_df.loc[is_known_project]

    ut.print_stacked_bar(df=evo_df, time_column='Time', value_column='Authors', group_column='Project')
    
def authors_by_org_evo(source, date_field, filter_prs=None, filter_issues=None):
    """Plot unique contributors per organization, per quarter.

    Documents are restricted to ``initial_date <= date_field < 'now/y'``
    (``initial_date`` is a notebook-level global). Contributors are counted
    with a cardinality metric on ``author_uuid`` inside a quarterly date
    histogram, nested in a terms aggregation on ``author_org_name`` (up to 10
    buckets). The response is reshaped by ``ut.stack_by_cusum`` with
    'Mozilla Staff' and 'Code Sheriff' passed as the staff organizations and
    'Employees' as the staff label, then drawn both as a stacked and as a
    grouped bar chart.

    :source: name forwarded to ``ut.create_search``
    :date_field: date field used for both the range filter and the histogram
    :filter_prs: retrieve PRs only (``pull_request`` is 'true')
    :filter_issues: retrieve issues only (``pull_request`` is 'false')
    """
    search = ut.create_search(es_conn, source)

    # Optional PR / issue restriction; both flags are applied if both are set.
    for enabled, flag_value in ((filter_prs, 'true'), (filter_issues, 'false')):
        if enabled:
            search = search.filter('terms', pull_request=[flag_value])

    # Restrict to the analysed time window.
    date_range = {date_field: {'gte': initial_date, 'lt': 'now/y'}}
    search = search.filter('range', **date_range)

    # Organization -> quarter -> distinct author_uuid count.
    org_agg = search.aggs.bucket('org', 'terms', field='author_org_name', size=10)
    quarter_agg = org_agg.bucket('time', 'date_histogram', field=date_field, interval='quarter')
    quarter_agg.metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)

    response = search.execute()

    contributors_df = ut.stack_by_cusum(
        result=response,
        group_column='Organization',
        subgroup_column='Time',
        value_column='# Contributors',
        group_field='org',
        subgroup_field='time',
        metric_field='contributors',
        staff_org_names=['Mozilla Staff', 'Code Sheriff'],
        staff_org='Employees',
    )

    # Both charts are drawn from the same frame with the same column mapping.
    plot_kwargs = dict(
        df=contributors_df,
        time_column='Time',
        value_column='# Contributors',
        group_column='Organization',
    )
    ut.print_stacked_bar(**plot_kwargs)
    ut.print_grouped_bar(**plot_kwargs)

In [3]:
# Inclusive lower bound ('gte') of the date range filter applied by the *_evo functions.
initial_date = '2010-01-01'

# List of Projects

## List of projects: Git

In [4]:
# Distinct commit hashes per project, broken down by author organization.
s = ut.create_search(es_conn, 'git')

(
    s.aggs
    .bucket('projects', 'terms', field='project', size=100000)
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('commits', 'cardinality', field='hash', precision_threshold=1000000)
)
result = s.execute()

projects_df = ut.stack_by(
    result=result,
    group_column='Project',
    subgroup_column='Org',
    value_column='# Commits',
    group_field='projects',
    subgroup_field='organizations',
    value_field='commits',
)

# NOTE(review): the query runs against 'git' but the file is named
# 'github-projects-table.html' -- confirm this is intended.
ut.print_table(projects_df, filename='github-projects-table.html')


# Authors by Project

## Authors by Project: Git

In [5]:
authors_by_project_table('git')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


**Table above: Git Authors by Project**

## Authors by Project: GitHub


In [6]:
authors_by_project_table('github_issues')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees
Community -> Non-Employees


## Authors by Project: Bugzilla

We use Product and Component to assign a project name to Bugzilla entries.

In [7]:
authors_by_project_table('bugzilla')

Unknown -> Non-Employees
Mozilla Staff -> Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


**Table above: Bugzilla Authors by Project**

## Authors by Project: Mailing Lists


In [8]:
authors_by_project_table('mbox')

Unknown -> Non-Employees
Mozilla Staff -> Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees


## Authors by Project: Discourse


In [9]:
authors_by_project_table('discourse')

Community -> Non-Employees
Mozilla Staff -> Employees
Unknown -> Non-Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees


# Number of Authors by project over time
## Authors by Project over Time: Git


In [10]:
authors_by_project_evo('git', 'grimoire_creation_date')

## Non-employees by Project over Time: Git

In [11]:
authors_by_project_evo('git', 'grimoire_creation_date', filter_employees=True)

**Git authors over time**

## Authors by Project over Time: Bugzilla

In [12]:
authors_by_project_evo('bugzilla', 'creation_ts')

## Non-employees by Project over Time: Bugzilla

In [13]:
authors_by_project_evo('bugzilla', 'creation_ts', filter_employees=True)

## Authors by Project over Time: GitHub Pull Requests

In [14]:
authors_by_project_evo('github_issues', 'grimoire_creation_date', filter_prs=True)

## Non-employees by Project over Time: GitHub Pull Requests

In [15]:
authors_by_project_evo('github_issues', 'grimoire_creation_date', filter_prs=True, filter_employees=True)

## Authors by Project over Time: GitHub Issues

In [16]:
authors_by_project_evo('github_issues', 'grimoire_creation_date', filter_issues=True)

## Non-employees by Project over Time: GitHub Issues

In [17]:
authors_by_project_evo('github_issues', 'grimoire_creation_date', filter_issues=True, filter_employees=True)

## Authors by Project over Time: Mailing Lists

In [18]:
authors_by_project_evo('mbox', 'grimoire_creation_date')

## Non-employees by Project over Time: Mailing Lists

In [19]:
authors_by_project_evo('mbox', 'grimoire_creation_date', filter_employees=True)

## Authors by Project over Time: Discourse

In [20]:
authors_by_project_evo('discourse', 'grimoire_creation_date')

## Non-employees by Project over Time: Discourse

In [21]:
authors_by_project_evo('discourse', 'grimoire_creation_date', filter_employees=True)

# List of organizations
## List of Organizations: Git


In [22]:
# Distinct commit hashes per author organization (up to 100 organizations).
s = ut.create_search(es_conn, 'git')

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('commits', 'cardinality', field='hash', precision_threshold=100000)
)
result = s.execute()

In [23]:
# Flatten the 'organizations' buckets of the previous query into a two-column frame.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='commits',
    group_column='Organization',
    value_column='# Commits',
)

Unnamed: 0,Organization,# Commits
0,Mozilla Staff,1003249
1,Unknown,562402
2,Code Sheriff,45094
3,Mozilla Reps,5544


# Contributors by organization
## Contributors by Organization: Git

In [24]:
# Distinct author_uuid values per author organization (up to 100 organizations).
s = ut.create_search(es_conn, 'git')

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

In [25]:
# Flatten the 'organizations' buckets of the previous query into a two-column frame.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='contributors',
    group_column='Organization',
    value_column='# Contributors',
)

Unnamed: 0,Organization,# Contributors
0,Mozilla Staff,2053
1,Unknown,13269
2,Code Sheriff,8
3,Mozilla Reps,129


## Contributors by Organization: Bugzilla

In [26]:
# Distinct author_uuid values per author organization (up to 100 organizations),
# after applying the filter added by ut.add_bot_filter.
s = ut.create_search(es_conn, 'bugzilla')
s = ut.add_bot_filter(s)

(
    s.aggs
    .bucket('organizations', 'terms', field='author_org_name', size=100)
    .metric('contributors', 'cardinality', field='author_uuid', precision_threshold=100000)
)
result = s.execute()

In [27]:
# Flatten the 'organizations' buckets of the previous query into a two-column frame.
ut.to_simple_df(
    result=result,
    group_field='organizations',
    value_field='contributors',
    group_column='Organization',
    value_column='# Contributors',
)

Unnamed: 0,Organization,# Contributors
0,Unknown,183118
1,Mozilla Staff,2733
2,Code Sheriff,8
3,Mozilla Reps,300


# Number of contributors by organization over time
## Contributors by Org over Time: Git

In [28]:
authors_by_org_evo('git', 'grimoire_creation_date')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


## Contributors by Org over Time: Bugzilla

In [29]:
authors_by_org_evo('bugzilla', 'creation_ts')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees


## Contributors by Org over Time: GitHub Pull Requests

In [30]:
authors_by_org_evo('github_issues', 'grimoire_creation_date', filter_prs=True)

Mozilla Staff -> Employees
Unknown -> Non-Employees
Code Sheriff -> Employees
Mozilla Reps -> Non-Employees
Community -> Non-Employees


## Contributors by Org over Time: GitHub Issues

In [31]:
authors_by_org_evo('github_issues', 'grimoire_creation_date', filter_issues=True)

Mozilla Staff -> Employees
Unknown -> Non-Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees


## Contributors by Org over Time: Mailing Lists

In [32]:
authors_by_org_evo('mbox', 'grimoire_creation_date')

Mozilla Staff -> Employees
Unknown -> Non-Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees


## Contributors by Org over Time: Discourse

In [33]:
authors_by_org_evo('discourse', 'grimoire_creation_date')

Community -> Non-Employees
Mozilla Staff -> Employees
Unknown -> Non-Employees
Mozilla Reps -> Non-Employees
Code Sheriff -> Employees
