In [11]:
from datetime import datetime
import pandas

import plotly as plotly
import plotly.graph_objs as go

import util as ut

from util import ESConnection
from elasticsearch_dsl import Search

# Shared Elasticsearch connection: every Search built in this notebook is
# created with `using=es_conn`.
es_conn = ESConnection()

In [12]:
def create_search(source):
    """Build an unfiltered search over one index.

    :param source: name of the Elasticsearch index to query
    :return: a new ``Search`` bound to the shared ``es_conn`` connection
    """
    # TODO: Add bot and merges filtering.
    search = Search(using=es_conn, index=source)
    return search

In [35]:
# Columns of the per-author frames, in the order the records are built.
_AUTHOR_COLUMNS = ['first_commit', 'last_commit', 'author', 'org', 'Repo']


def _iter_author_records(result, author_bucket_field):
    """Yield one record (dict) per author bucket of an aggregation result."""
    for bucket_author in result['aggregations'][author_bucket_field]['buckets']:
        # 'first' holds the single earliest hit of the author; its sort value
        # and the 'last_commit' metric are epoch milliseconds.
        first = bucket_author['first']['hits']['hits'][0]
        yield {
            'first_commit': pandas.to_datetime(first['sort'][0], unit='ms'),
            'last_commit': pandas.to_datetime(bucket_author['last_commit']['value'],
                                              unit='ms'),
            'author': bucket_author['key'],
            'org': first['_source']['author_org_name'],
            'Repo': first['_source']['repo_name'],
        }


def _authors_frame(records):
    """Turn author records into a dataframe sorted by newest first commit."""
    records = list(records)
    if not records:
        # Without any record there would be no 'first_commit' column to sort
        # by; return an empty frame that still has the expected columns.
        empty_df = pandas.DataFrame(columns=_AUTHOR_COLUMNS)
        return empty_df.astype({'first_commit': 'datetime64[ns]',
                                'last_commit': 'datetime64[ns]'})
    authors_df = pandas.DataFrame.from_records(records)
    return authors_df.sort_values(by='first_commit', ascending=False)


def get_authors_df(result, author_bucket_field):
    """Returns a dataframe with first and last commit of every author.

    :param result: search response (or equivalent nested mapping) whose
        ``aggregations`` contain one bucket per author, each with a ``first``
        top-hit and a ``last_commit`` metric
    :param author_bucket_field: name of the author aggregation in ``result``
    :return: dataframe with columns ``first_commit``, ``last_commit``,
        ``author``, ``org`` and ``Repo``, sorted by ``first_commit``
        descending; empty (but with those columns) when there are no buckets
    """
    return _authors_frame(_iter_author_records(result, author_bucket_field))


def get_active_authors_df(result, author_bucket_field, year):
    """Returns a dataframe with first and last commit of those authors
    whose last commit was made within a given year.

    :param result: search response, same layout as for ``get_authors_df``
    :param author_bucket_field: name of the author aggregation in ``result``
    :param year: calendar year (int) the last commit must fall in
    :return: dataframe with the same columns and ordering as
        ``get_authors_df``; empty (but with those columns) when no author
        qualifies
    """
    return _authors_frame(
        record
        for record in _iter_author_records(result, author_bucket_field)
        if record['last_commit'].year == year
    )

In [36]:
def print_horizontal_bar_chart(df, experience_field, title, min_range = 0):
    """Plot how many rows of ``df`` fall on each integer value of a column.

    One horizontal bar is drawn for every integer from ``min_range`` up to the
    column maximum (inclusive); the bar length is the number of rows whose
    value equals that integer.

    :param df: dataframe holding the column to summarise
    :param experience_field: name of the numeric column (e.g. years)
    :param title: chart title
    :param min_range: first integer value shown on the Y axis
    """
    plotly.offline.init_notebook_mode(connected=True)

    values = df[experience_field]
    highest = values.max()

    # max() is NaN for an empty or all-NaN column and int(NaN) raises, so an
    # empty selection is drawn as an empty chart instead of failing.
    if pandas.isnull(highest):
        experience = []
    else:
        experience = list(range(min_range, int(highest) + 1))

    people_count = [int((values == exp).sum()) for exp in experience]

    data = [go.Bar(
            x=people_count,
            y=experience,
            orientation = 'h'
    )]

    layout = go.Layout(
        barmode='group',
        title= title
    )

    fig = go.Figure(data=data, layout=layout)
    plotly.offline.iplot(fig, filename='horizontal-bar')

In [37]:
def add_general_date_filters(s):
    """Restrict a search to documents created after the global start date.

    :param s: search object exposing ``filter``
    :return: whatever ``s.filter`` returns for the added range filter
    """
    # Start date 01/01/1998. As epoch milliseconds the literal below is
    # 1997-12-31T23:00:00Z, i.e. midnight of 01/01/1998 at UTC+1.
    lower_bound = {'gt': '883609200000'}
    return s.filter('range', grimoire_creation_date=lower_bound)

def add_bot_filter(s):
    """Restrict a search to documents whose ``author_bot`` term is 'false'.

    :param s: search object exposing ``filter``
    :return: whatever ``s.filter`` returns for the added term filter
    """
    not_a_bot = {'author_bot': 'false'}
    return s.filter('term', **not_a_bot)

def add_merges_filter(s):
    """Restrict a search to documents with a ``files`` value greater than 0.

    :param s: search object exposing ``filter``
    :return: whatever ``s.filter`` returns for the added range filter
    """
    # Presumably merge commits are the ones reporting no touched files;
    # verify against the index mapping.
    touches_files = {'gt': 0}
    return s.filter('range', files=touches_files)

# Let's load projects from the REVIEWED SPREADSHEET.
# `projects['Github']` is later merged into the author dataframes on the
# 'Repo' column, and the merged frames are filtered by a 'Project' column.
projects = ut.read_projects("data/Contributors and Communities Analysis - Project grouping.xlsx")

In [38]:
# Length of a mean Gregorian year (365.2425 days) in seconds, used to turn a
# commit span into a whole number of years.
SECONDS_PER_YEAR = 365.2425 * 24 * 60 * 60

results = []
for i in range(0,2):

    # Buckets by author name, finding first commit for each of them
    s = Search(using=es_conn, index='git')
    # params() returns a modified copy, so the result must be kept for the
    # setting to apply; request_timeout is the client-side timeout in seconds.
    s = s.params(request_timeout=30)

    # General filters
    s = add_general_date_filters(s)
    s = add_bot_filter(s)
    s = add_merges_filter(s)

    # Retrieve commits before given year: 'now-<i>y/y' with 'lt' means
    # strictly before Jan 1st of (current year - i)
    s = s.filter('range', grimoire_creation_date={'lt': 'now-' + str(i) + 'y/y'})

    # Bucketize by uuid and get first and last commit
    s.aggs.bucket('authors', 'terms', field='author_uuid', size=100000) \
        .metric('first', 'top_hits', _source=['author_date', 'author_org_name', 'author_uuid', 'repo_name'],
                size=1, sort=[{"author_date": {"order": "asc"}}]) \
        .metric('last_commit', 'max', field='author_date')
    s = s.sort("author_date")
    results.append(s.execute())

authors_dfs = []
for result in results:
    authors_df = get_authors_df(result, author_bucket_field='authors')
    # Whole years between first and last commit, truncated, as a float.
    # (astype('timedelta64[Y]') is rejected by pandas >= 2.0.)
    commit_span = authors_df.last_commit - authors_df.first_commit
    authors_df['active_years'] = commit_span.dt.total_seconds() // SECONDS_PER_YEAR
    authors_dfs.append(authors_df)

authors_dfs

[                                                    Repo  \
 11577              https://github.com/rust-lang/rust.git   
 13024               https://github.com/mozilla/gecko.git   
 14349  https://github.com/mozilla/donate.mozilla.org.git   
 12669               https://github.com/mozilla/gecko.git   
 13425      https://github.com/servo/core-graphics-rs.git   
 12596              https://github.com/rust-lang/rust.git   
 10727             https://github.com/aframevr/aframe.git   
 13554    https://github.com/aframevr/aframe-registry.git   
 11875              https://github.com/rust-lang/rust.git   
 9905               https://github.com/rust-lang/rust.git   
 9612                 https://github.com/mozilla/sops.git   
 13171  https://github.com/mozilla-services/lua_sandbo...   
 13884              https://github.com/rust-lang/rust.git   
 11900             https://github.com/servo/webrender.git   
 13834          https://github.com/mozilla/kumascript.git   
 11699               htt

In [39]:
# Attach project information to every author row (left join on 'Repo', so
# authors of repos missing from the spreadsheet are kept)
all_projects_dfs = [
    authors_df.merge(projects['Github'], on='Repo', how='left')
    for authors_df in authors_dfs
]
all_projects_dfs

[                                                    Repo  \
 0                  https://github.com/rust-lang/rust.git   
 1                   https://github.com/mozilla/gecko.git   
 2      https://github.com/mozilla/donate.mozilla.org.git   
 3                   https://github.com/mozilla/gecko.git   
 4          https://github.com/servo/core-graphics-rs.git   
 5                  https://github.com/rust-lang/rust.git   
 6                 https://github.com/aframevr/aframe.git   
 7        https://github.com/aframevr/aframe-registry.git   
 8                  https://github.com/rust-lang/rust.git   
 9                  https://github.com/rust-lang/rust.git   
 10                   https://github.com/mozilla/sops.git   
 11     https://github.com/mozilla-services/lua_sandbo...   
 12                 https://github.com/rust-lang/rust.git   
 13                https://github.com/servo/webrender.git   
 14             https://github.com/mozilla/kumascript.git   
 15                  htt

In [40]:
# Keep only the rows that belong to a single project
# TODO read env variable to get project name
project_name = 'Gecko'
project_dfs = [
    project_df[project_df['Project'] == project_name]
    for project_df in all_projects_dfs
]

project_dfs

[                                                Repo  \
 1               https://github.com/mozilla/gecko.git   
 3               https://github.com/mozilla/gecko.git   
 15              https://github.com/mozilla/gecko.git   
 17              https://github.com/mozilla/gecko.git   
 18              https://github.com/mozilla/gecko.git   
 20              https://github.com/mozilla/gecko.git   
 25              https://github.com/mozilla/gecko.git   
 29              https://github.com/mozilla/gecko.git   
 34              https://github.com/mozilla/gecko.git   
 35              https://github.com/mozilla/gecko.git   
 41              https://github.com/mozilla/gecko.git   
 43              https://github.com/mozilla/gecko.git   
 44              https://github.com/mozilla/gecko.git   
 53              https://github.com/mozilla/gecko.git   
 56              https://github.com/mozilla/gecko.git   
 59              https://github.com/mozilla/gecko.git   
 63              https://github

## Time from first to last contrib for authors who made a commit before a given year 

The next plot shows the number of authors grouped by the time from their first to their last contribution. This gives us an idea of how long contributors stay around the community. This chart gives no clue about their activity during that period; it is just a quick, approximate glance at how long they remain around the community.

**Long bars in the group of 0 years of experience mean that many people made their first and last contributions within the same year, accumulated over the whole period**. That is, the accumulated sum of people who made all their contributions within the same year, from 1998 onwards.

* Y axis corresponds to the difference in years from first to last contributions.
* X axis corresponds to the number of contributors in the given group.
* Each plot shows a snapshot of this information from the specified year to the past (1998 was chosen as the oldest date to get results from). 

In [41]:
# Plot bar charts for each dataframe.
# The i-th dataframe comes from a query for commits before Jan 1st of
# (current year - i), so the last year it covers is (current year - 1 - i).
# Deriving the title from the clock keeps it in sync with those 'now'-relative
# queries instead of assuming the notebook is run in 2017.
latest_full_year = pandas.Timestamp.now(tz='UTC').year - 1
for i, authors_df in enumerate(project_dfs):
    print_horizontal_bar_chart(authors_df, 'active_years', title=str(latest_full_year - i))

## Time from first to last commit for authors active in a given year

We define an author as **active** iff she made at least one commit within a given year. E.g. an author would be considered active in 2017 if she made a commit after Jan. 1st, 2017 and before Dec. 31st 2017. 

In other words, the difference from the previous plots is that we take into account only contributors who made their last contribution in the year we are visualizing data from.

* Y axis corresponds to the difference in years from first to last contributions.
* X axis corresponds to the number of contributors in the given group.
* Each plot shows a snapshot of this information from the specified year to the past (1998 was chosen as the oldest date to get results from). 

In [42]:
# Length of a mean Gregorian year (365.2425 days) in seconds, used to turn a
# commit span into a whole number of years.
SECONDS_PER_YEAR = 365.2425 * 24 * 60 * 60

active_authors_dfs = []
# results[k] holds commits before Jan 1st of (current year - k), so the last
# year it covers is (current year - 1 - k). Deriving the year from the clock
# keeps it in sync with those 'now'-relative queries.
year = pandas.Timestamp.now(tz='UTC').year - 1
for result in results:
    authors_df = get_active_authors_df(result, author_bucket_field='authors', year=year)
    # Whole years between first and last commit, truncated, as a float.
    # (astype('timedelta64[Y]') is rejected by pandas >= 2.0.)
    commit_span = authors_df.last_commit - authors_df.first_commit
    authors_df['active_years'] = commit_span.dt.total_seconds() // SECONDS_PER_YEAR
    active_authors_dfs.append(authors_df)
    year -= 1

active_authors_dfs

[                                                   Repo  \
 3526              https://github.com/rust-lang/rust.git   
 3908               https://github.com/mozilla/gecko.git   
 4265  https://github.com/mozilla/donate.mozilla.org.git   
 3815               https://github.com/mozilla/gecko.git   
 4018      https://github.com/servo/core-graphics-rs.git   
 3796              https://github.com/rust-lang/rust.git   
 3281             https://github.com/aframevr/aframe.git   
 4049    https://github.com/aframevr/aframe-registry.git   
 3598              https://github.com/rust-lang/rust.git   
 3033              https://github.com/rust-lang/rust.git   
 2967                https://github.com/mozilla/sops.git   
 3946  https://github.com/mozilla-services/lua_sandbo...   
 4140              https://github.com/rust-lang/rust.git   
 3607             https://github.com/servo/webrender.git   
 4127          https://github.com/mozilla/kumascript.git   
 3550               https://github.com/m

In [43]:
# Attach project information to every active-author row (left join on
# 'Repo', so authors of repos missing from the spreadsheet are kept)
all_projects_dfs = [
    authors_df.merge(projects['Github'], on='Repo', how='left')
    for authors_df in active_authors_dfs
]
all_projects_dfs

[                                                   Repo  \
 0                 https://github.com/rust-lang/rust.git   
 1                  https://github.com/mozilla/gecko.git   
 2     https://github.com/mozilla/donate.mozilla.org.git   
 3                  https://github.com/mozilla/gecko.git   
 4         https://github.com/servo/core-graphics-rs.git   
 5                 https://github.com/rust-lang/rust.git   
 6                https://github.com/aframevr/aframe.git   
 7       https://github.com/aframevr/aframe-registry.git   
 8                 https://github.com/rust-lang/rust.git   
 9                 https://github.com/rust-lang/rust.git   
 10                  https://github.com/mozilla/sops.git   
 11    https://github.com/mozilla-services/lua_sandbo...   
 12                https://github.com/rust-lang/rust.git   
 13               https://github.com/servo/webrender.git   
 14            https://github.com/mozilla/kumascript.git   
 15                 https://github.com/m

In [44]:
# Keep only the rows that belong to a single project
# TODO read env variable to get project name
project_name = 'Gecko'
project_dfs = [
    project_df[project_df['Project'] == project_name]
    for project_df in all_projects_dfs
]

project_dfs

[                                               Repo  \
 1              https://github.com/mozilla/gecko.git   
 3              https://github.com/mozilla/gecko.git   
 15             https://github.com/mozilla/gecko.git   
 17             https://github.com/mozilla/gecko.git   
 18             https://github.com/mozilla/gecko.git   
 20             https://github.com/mozilla/gecko.git   
 25             https://github.com/mozilla/gecko.git   
 29             https://github.com/mozilla/gecko.git   
 34             https://github.com/mozilla/gecko.git   
 35             https://github.com/mozilla/gecko.git   
 41             https://github.com/mozilla/gecko.git   
 43             https://github.com/mozilla/gecko.git   
 44             https://github.com/mozilla/gecko.git   
 53             https://github.com/mozilla/gecko.git   
 56             https://github.com/mozilla/gecko.git   
 59             https://github.com/mozilla/gecko.git   
 63             https://github.com/mozilla/gecko

In [45]:
# Plot bar charts for each dataframe.
# The i-th dataframe comes from a query for commits before Jan 1st of
# (current year - i), so the last year it covers is (current year - 1 - i).
# Deriving the title from the clock keeps it in sync with those 'now'-relative
# queries instead of assuming the notebook is run in 2017.
latest_full_year = pandas.Timestamp.now(tz='UTC').year - 1
for i, authors_df in enumerate(project_dfs):
    print_horizontal_bar_chart(authors_df, 'active_years', title=str(latest_full_year - i))

---
TODO BELOW
---

In [8]:
###
## GET COMMITS BY YEAR AND AUTHOR
###

# Minimum number of documents an author bucket needs to be returned
min_commits = 1
results = []

for i in range(0,2):

    # Base search with the general filters (date, bots, merges) applied
    s = add_merges_filter(add_bot_filter(add_general_date_filters(create_search(source='git'))))

    # Retrieve commits up to the given year
    upper_bound = 'now-' + str(i) + 'y/y'
    s = s.filter('range', grimoire_creation_date={'lte': upper_bound})

    # Bucketize by time, then uuid, then organization, and count distinct
    # commit hashes in each innermost bucket
    per_year = s.aggs.bucket('time', 'date_histogram', field='grimoire_creation_date', interval='year')
    per_author = per_year.bucket('authors', 'terms', field='author_uuid', size=100000, min_doc_count=min_commits)
    per_org = per_author.bucket('org', 'terms', field='author_org_name', size=1)
    per_org.metric('commits', 'cardinality', field='hash', precision_threshold=1000)

    # To inspect a response, print
    # s.execute().to_dict()['aggregations']['time']['buckets']
    results.append(s.execute())

In [9]:
###
## CREATE A DF CONTAINING, FOR EACH AUTHOR UUID, COUNT OF YEARS OF EXPERIENCE (YEARS
## WITH AT LEAST MIN_COMMITS_PER_YEAR COMMITS MADE) AND LAST YEAR ACTIVE
###

# An author counts as active in a year when she made at least this many commits
MIN_COMMITS_PER_YEAR = 12

exp_df_list = []
# results[k] was queried with 'lte now-<k>y/y', i.e. up to the end of
# (current year - k). Deriving the year from the clock keeps it in sync with
# those 'now'-relative queries instead of assuming the notebook is run in 2017.
year = pandas.Timestamp.now(tz='UTC').year

for result in results:
    exp_df = ut.to_df_by_time(result, 'Author', 'Time', 'Commits', 'Org', 'authors', 'time', 'commits', 'org')
    exp_df['Time'] = exp_df['Time'].apply(lambda x: str(pandas.Period(x,'A')))

    ## ACTIVE CONDITION
    ## Drop the (author, year) rows with fewer than MIN_COMMITS_PER_YEAR commits
    exp_df = exp_df[exp_df['Commits'] >= MIN_COMMITS_PER_YEAR]

    ## Group by author, get MAX YEAR and NUMBER OF ROWS FOR THE GIVEN AUTHOR
    exp_df = exp_df.groupby(['Author', 'Org']).agg({'Time': 'max', 'Commits': 'count'})
    ## Filter those whose last active year is not the one we want
    exp_df = exp_df[exp_df['Time'] == str(year)]

    ## Number of active years becomes 'exp', last active year 'last_active'.
    ## rename() returns a new frame, so nothing is written into a filtered view.
    exp_df = exp_df.rename(columns={'Commits': 'exp', 'Time': 'last_active'})[['exp', 'last_active']]

    exp_df_list.append(exp_df)

    year -= 1

exp_df_list

[                                                        exp last_active
 Author                                   Org                           
 000063c4e47e93ab3b30607680609e4d2500ce5d Mozilla Staff    4        2017
 002893ffe1425c220756f8ba4c78e1e3bb0be50f Mozilla Staff    7        2017
 00834d313bfc6fc60be1631bcc57b2c05ee2e0e3 Mozilla Staff    9        2017
 00846eff46b051d92317fc74e54041c6fdccd7cf Mozilla Staff   10        2017
 00a40f9e9e7f7633ddab8291a99e1e487f88481c Community        3        2017
 00b934012989b386ac9efc706dbc28cd6be173c6 Mozilla Staff    2        2017
 00d00a6e7530f1ccac19b98727f25b6528ed9fe3 Community        1        2017
 00da6ede3bff8db21a33473d7e552b76f50757eb Mozilla Staff    5        2017
 00ed7b25063cf90c8bdeb9d45d37b73ba2317f96 Mozilla Staff    6        2017
 01307140d33369746c5013f45295092f8752d378 Community        2        2017
 013bbdb9412b88db677df21347e032c57b099d97 Community        3        2017
 016a5f2ec7191e74e984c71787b2b292d89543a1 Community

In [None]:
# One row per (last active year, years of experience, organization) with the
# number of authors in that group. Rows are collected in a list and the
# dataframe is built once, instead of growing it one row at a time.
exp_group_rows = []

for exp_df in exp_df_list:

    if exp_df.empty:
        continue

    last_active = exp_df['last_active'].unique()[0]

    # The experience values of each organization do not depend on the
    # experience level being counted, so select them once per dataframe.
    org_of_row = exp_df.index.get_level_values('Org')
    exp_by_org = [(org, exp_df.loc[org_of_row == org, 'exp']) for org in org_of_row.unique()]

    for exp in range(1, int(exp_df['exp'].max()) + 1):
        for org, org_exp in exp_by_org:
            count = int((org_exp == exp).sum())
            exp_group_rows.append((last_active, exp, count, org))

exp_groups_evo_df = pandas.DataFrame(exp_group_rows,
                                     columns=['last_active', 'exp', 'count', 'org'])

print('Max. Exp: ', exp_groups_evo_df['exp'].max(), 'Max. Count: ',  exp_groups_evo_df['count'].max())
exp_groups_evo_df 

In [None]:
# Build a table with one row per experience value ('exp') and one column per
# last-active year; each cell is the number of rows of that year's dataframe
# having that experience value.
exp_groups_evo_df = pandas.DataFrame(columns=['exp'])

for exp_df in exp_df_list:
    
    if exp_df.empty:
        continue
    
    # The first 'last_active' value of the dataframe names the new column.
    # Note this rebinds the notebook-level `year` variable.
    year = exp_df['last_active'].unique()[0]
    exp_groups_df = pandas.DataFrame(columns=['exp', year])
    
    # Every experience value from 1 up to the maximum found, inclusive
    experience = list(range(1, int(exp_df['exp'].max()) + 1))
    
    for exp in experience:
        # Number of rows with exactly `exp` as experience
        count = len(exp_df.loc[exp_df['exp'] == exp])
        exp_groups_df.loc[len(exp_groups_df)] = [exp, count]

    # Outer merge keeps experience values that appear in only some years
    exp_groups_evo_df = exp_groups_evo_df.merge(exp_groups_df, on='exp', how='outer')


# Fill NaN (experience values missing for a year) with 0's
exp_groups_evo_df = exp_groups_evo_df.fillna(0)

# Index by experience and sort the year columns by label
exp_groups_evo_df = exp_groups_evo_df.set_index('exp')
exp_groups_evo_df = exp_groups_evo_df.sort_index(axis=1)


#print('Max. Exp: ', exp_groups_evo_df['exp'].max(), 'Max. Count: ')
exp_groups_evo_df