In [1]:
import pandas

import os

import plotly as plotly
import plotly.figure_factory as ff
import plotly.graph_objs as go

import util as ut

from util import ESConnection
from elasticsearch_dsl import Search

# Connection object handed to every ut.create_search() call in this notebook.
es_conn = ESConnection()

# Run-time configuration, overridable through environment variables:
#   PROJECT  - project name passed to ut.add_project_filter() (default 'All')
#   MAX_TIME - number of yearly snapshots to compute (default '10')
project_name = os.getenv('PROJECT', 'All')
max_time = os.getenv('MAX_TIME', '10')

# Year offsets 0 .. MAX_TIME-1 used by all metrics. Each offset i becomes the
# upper bound 'now-<i>y/y' of a range query; combined with `lt`, offset 0
# excludes the current year (2017 when this notebook was written).
analyzed_range = range(int(max_time))

## First project
Get the project in which each author made their first contribution to Mozilla. Show a ranking of the top 20 projects by number of newcomers to Mozilla per year.

In [2]:
# Rank projects by the number of authors whose first commit ever landed in
# them, for each year.

# Buckets by author uuid, finding the first commit for each of them
s = ut.create_search(es_conn, 'git')

# Filter commits to the Project Repos
s = ut.add_project_filter(s, project_name)

# Retrieve commits created before the start of the current year
s = s.filter('range', grimoire_creation_date={'lt': 'now/y'})

# Bucketize by uuid; for each author keep the earliest commit (top_hits sorted
# by ascending author_date, size=1) and the date of the latest commit.
s.aggs.bucket('authors', 'terms', field='author_uuid', size=100000) \
    .metric('first', 'top_hits', _source=['author_date', 'author_org_name', 'author_uuid', 'project'],
            size=1, sort=[{"author_date": {"order": "asc"}}]) \
    .metric('last_commit', 'max', field='author_date')
s = s.sort("author_date")
result = s.execute()

authors_df = ut.get_authors_df(result, author_bucket_field='authors')

# Replace the first-commit timestamp by its year (vectorised, no per-row lambda)
authors_df['first_commit'] = pandas.to_datetime(authors_df['first_commit']).dt.year

# Group by year of first commit and project, counting distinct authors, and
# order rows by most recent year first, then by most newcomers first.
projects_df = (
    authors_df
    .groupby(['first_commit', 'project'])
    .agg({'author': pandas.Series.nunique})
    .rename(columns={"author": "# authors"})
    .reset_index()
    .sort_values(by=['first_commit', '# authors'], ascending=[False, False])
)

# Keep the top projects per year, starting in 2012 (i.e. years after 2011).
# groupby(...).head(n) keeps the first n rows of every year in the existing
# row order, so the result matches a per-year head() + concat without growing
# a DataFrame inside a loop. An empty selection still has all the columns the
# following cells index by.
first_ranked_year = 2012
top_projects_per_year = 20
rankings_df = (
    projects_df[projects_df['first_commit'] >= first_ranked_year]
    .groupby('first_commit', sort=False)
    .head(top_projects_per_year)
)

ut.print_table(rankings_df, filename='git-top-projects-newcomers-table')


In [4]:
plotly.offline.init_notebook_mode(connected=True)

# One line per project: number of newcomers (Y) per year of first commit (X).
# A project only has a row for the years in which it made the ranking, so X and
# Y must both come from that project's own rows. Pairing Y with the full list
# of ranked years would give arrays of different length and plot points on the
# wrong years.
data = []
for project in rankings_df['project'].unique():
    project_rows = rankings_df.loc[rankings_df['project'] == project]
    data.append(
        go.Scatter(
            x = project_rows['first_commit'],
            y = project_rows['# authors'],
            mode = 'lines+markers',
            name = project
        )
    )

plotly.offline.iplot(data, filename='line-mode')

# Authors DataFrame
Get one dataframe of authors per snapshot, going from a given year back into the past. Each dataframe includes first commit date, last commit date, org, uuid and project.

In [8]:
### DIFFERENCE BETWEEN FIRST AND LAST CONTRIBUTION
### GET SNAPSHOTS FOR DIFFERENT YEARS

# One query per year offset: snapshot i only sees commits created before the
# start of the year "current year - i".
results = []
for i in analyzed_range:

    # Buckets by author uuid, finding first commit for each of them
    s = ut.create_search(es_conn, 'git')

    # Filter commits to the Project Repos
    s = ut.add_project_filter(s, project_name)

    # Retrieve commits before given year
    s = s.filter('range', grimoire_creation_date={'lt': 'now-' + str(i) + 'y/y'})

    # Bucketize by uuid and get first and last commit
    s.aggs.bucket('authors', 'terms', field='author_uuid', size=100000) \
        .metric('first', 'top_hits', _source=['author_date', 'author_org_name', 'author_uuid', 'project'],
                size=1, sort=[{"author_date": {"order": "asc"}}]) \
        .metric('last_commit', 'max', field='author_date')
    s = s.sort("author_date")
    results.append(s.execute())

# Length in seconds of numpy's 'Y' timedelta unit (365.2425 days). Dividing
# explicitly gives whole years as floats, as astype('timedelta64[Y]') did on
# older pandas; recent pandas releases no longer accept that conversion.
seconds_per_year = 365.2425 * 24 * 60 * 60

authors_dfs = []
for result in results:
    authors_df = ut.get_authors_df(result, author_bucket_field='authors')
    # Whole years between first and last commit (last >= first, so flooring
    # and truncating agree); missing dates propagate as NaN.
    active_span = authors_df.last_commit - authors_df.first_commit
    authors_df['active_years'] = active_span.dt.total_seconds() // seconds_per_year
    authors_dfs.append(authors_df)

authors_dfs

[                                         author        first_commit  \
 11232  149ad981c7f8acd65b9cb9a3f306833fc4b6ec8e 2016-12-31 12:01:23   
 10338  778f8adac79c8593c8413792e9d838e0620797c1 2016-12-31 08:34:00   
 13981  f0a27b3276baf429f51f6afed8ece01f1a09e7d0 2016-12-31 05:11:58   
 10180  5c6bef06c5d1fa2e83f13db2046ec3d9d27bd00d 2016-12-30 23:09:23   
 13060  977b2b48ece5125093c2cfdedb56be00bdba4f13 2016-12-30 22:21:03   
 12247  57a1cf0d7a35c3f77475953873212148d3c01e1b 2016-12-30 21:55:02   
 10439  8c536fb5c49a281f793cb47e933fd71403891960 2016-12-30 17:26:13   
 13186  a1e6b9aca42df46460c5fed6a73cde615b2b04a3 2016-12-30 16:45:19   
 11529  26e5b20e7ccff92ec895b5f4af3f5f0b9a5b0f0a 2016-12-30 14:34:47   
 9636   09a899922407e40e04fa6ab865da040f06c0d904 2016-12-30 12:22:11   
 9338   c91f349eb811f5870fe7b4e987bd34decfbf72cb 2016-12-30 11:34:16   
 12809  829a9b7eae96736b2408c62c4b2843721116fe41 2016-12-29 23:33:52   
 13513  c19b5436d45eb06bb0d9a16accd4a16da1ad9765 2016-12-29 14:2


# Git: Newcomers by year

In [9]:
# AUTHORS GROUPED BY FIRST COMMIT DATE

# For every snapshot, count distinct authors per year of first commit.
# The year is stored negated (e.g. -2016), as the original notebook did;
# presumably this controls bar order in the chart that follows.
attraction_dfs = []
for authors_df in authors_dfs:
    # Vectorised year extraction replaces the Period -> str -> int round-trip.
    first_commit_year = pandas.to_datetime(authors_df['first_commit']).dt.year
    attraction_df = (
        authors_df
        .assign(first_commit=-first_commit_year.astype('int64'))
        .groupby(['first_commit'])
        .agg({'author': pandas.Series.nunique})
        .reset_index()
    )
    attraction_dfs.append(attraction_df)

attraction_dfs

[    first_commit  author
 0          -2016    2454
 1          -2015    2564
 2          -2014    2359
 3          -2013    1621
 4          -2012    1139
 5          -2011     974
 6          -2010     709
 7          -2009     536
 8          -2008     498
 9          -2007     390
 10         -2006     105
 11         -2005      73
 12         -2004      53
 13         -2003      58
 14         -2002      53
 15         -2001      93
 16         -2000     115
 17         -1999      91
 18         -1998     235,     first_commit  author
 0          -2015    2564
 1          -2014    2359
 2          -2013    1621
 3          -2012    1139
 4          -2011     974
 5          -2010     709
 6          -2009     536
 7          -2008     498
 8          -2007     390
 9          -2006     105
 10         -2005      73
 11         -2004      53
 12         -2003      58
 13         -2002      53
 14         -2001      93
 15         -2000     115
 16         -1999      91
 17         

In [10]:
# Plot a horizontal bar chart of newcomers per year for the latest snapshot
attraction_df = attraction_dfs[0]

plotly.offline.init_notebook_mode(connected=True)

# Snapshot 0 is built from commits created before the start of the current
# year ('now/y' upper bound), so it describes the last complete year. Derive
# that year instead of hard-coding it, so the title stays right on re-runs.
snapshot_year = pandas.Timestamp.now(tz='UTC').year - 1

# 'first_commit' holds negated years; plot their absolute value so tick labels
# read as real years, and reverse the axis to keep the same bar order
# (oldest year at the top, newest at the bottom).
data = [go.Bar(
        x=attraction_df['author'],
        y=attraction_df['first_commit'].abs(),
        orientation = 'h'
)]

layout = go.Layout(
    barmode='group',
    title=project_name + ' ' + str(snapshot_year),
    xaxis=dict(title='New authors'),
    yaxis=dict(title='Year of first commit', autorange='reversed')
)

fig = go.Figure(data=data, layout=layout)
plotly.offline.iplot(fig, filename='horizontal-bar')

## Years of Experience
We consider **N commits** per year to be the minimum needed to add one year of experience to a given author. Based on this assumption, we build groups of authors by years of experience. As a result, we present a plot with the number of people in each group.

To give a more complete idea of how community evolves, we plot snapshots corresponding to different years. Each of them will take all commits sent until the given year, and calculate years of experience for all authors in that slice.

We are also counting authors whose last year of experience is the one we are analyzing data from. That is, if we are looking to year 2016, we only count those authors who made at least N commits in 2016. From there we add 1 year of experience for each year they fulfill this condition.

* Y axis corresponds to years of experience as defined above.
* X axis corresponds to the number of contributors in the given group.
* Each plot shows a snapshot of this information from the specified year to the past (1998 was chosen as the oldest date to get results from). 

In [16]:
###
## GET COMMITS BY YEAR AND AUTHOR
###

results = []
min_commits = 1

for years_back in analyzed_range:

    # Start from a fresh search on the git index, limited to the project repos
    s = ut.add_project_filter(ut.create_search(es_conn, 'git'), project_name)

    # Only commits created before the start of the year "now - years_back"
    upper_bound = 'now-{}y/y'.format(years_back)
    s = s.filter('range', grimoire_creation_date={'lt': upper_bound})

    # Nested buckets: year -> author uuid -> organization (top one only),
    # with the number of distinct commit hashes as the innermost metric.
    by_year = s.aggs.bucket('time', 'date_histogram',
                            field='grimoire_creation_date', interval='year')
    by_author = by_year.bucket('authors', 'terms', field='author_uuid',
                               size=100000, min_doc_count=min_commits)
    by_org = by_author.bucket('org', 'terms', field='author_org_name', size=1)
    by_org.metric('commits', 'cardinality', field='hash', precision_threshold=1000)

    # To inspect the raw response:
    #   print(r.to_dict()['aggregations']['time']['buckets'])
    r = s.execute()
    results.append(r)


In [17]:
###
## CREATE A DF CONTAINING, FOR EACH AUTHOR UUID, COUNT OF YEARS OF EXPERIENCE (YEARS
## WITH AT LEAST N COMMITS MADE) AND LAST YEAR ACTIVE
###
exp_df_list = []

# Snapshot 0 was queried with an upper bound of 'now/y', i.e. it ends with the
# last complete year; snapshot k ends k years earlier. Derive the year from
# the clock so it stays consistent with the relative queries on any re-run.
last_full_year = pandas.Timestamp.now(tz='UTC').year - 1

for offset, result in enumerate(results):
    year = last_full_year - offset

    # Flatten the nested aggregation into one row per bucket, with the column
    # names given here (presumably one row per author/year/org; verify in util).
    exp_df = ut.to_df_by_time(result, 'Author', 'Time', 'Commits', 'Org', 'authors', 'time', 'commits', 'org')
    exp_df['Time'] = exp_df['Time'].apply(lambda x: str(pandas.Period(x,'A')))

    ## ACTIVE CONDITION #################################
    ## Drop rows with fewer than N commits in the year. N is the same
    ## `min_commits` threshold used when querying.
    exp_df = exp_df[exp_df['Commits'] >= min_commits]
    #####################################################

    ## Group by author (and org): last active year and number of active years
    exp_df = exp_df.groupby(['Author', 'Org']).agg({'Time': 'max', 'Commits': 'count'})

    ## Keep only authors whose last active year is the snapshot year, and
    ## expose the aggregates as 'exp' / 'last_active' plus the project name.
    exp_df = (
        exp_df[exp_df['Time'] == str(year)]
        .rename(columns={'Commits': 'exp', 'Time': 'last_active'})
        .loc[:, ['exp', 'last_active']]
        .assign(project=project_name)
    )

    exp_df_list.append(exp_df)

exp_df_list

[                                                        exp last_active  \
 Author                                   Org                              
 000063c4e47e93ab3b30607680609e4d2500ce5d Mozilla Staff    4        2016   
 00014f52adaaeca931c4c077a1cb9791eb769ea1 Community        1        2016   
 0002c6a09a45af6481c35e29c0ed7f3bdbecb3b8 Community        1        2016   
 000336e205e3f5e2daa6e7a0b03e612b3b2b02d3 Mozilla Staff    3        2016   
 000679e4b588906b8025d2c28d45e9ece4c88715 Community        1        2016   
 000a91646239261328167971b7c6d851087c3089 Community        1        2016   
 0012fe0f0c0ebabfb0fbed08e160f6b23cd248f8 Community        2        2016   
 0014e7cebbe8ef11dab9eeb5f2111e01d9eea378 Mozilla Staff    6        2016   
 00201bda17fd9d14bc1e0f8a72afbc55af090657 Community        1        2016   
 002893ffe1425c220756f8ba4c78e1e3bb0be50f Mozilla Staff    6        2016   
 00302655e2f88afeedfb958a0364ab13bcf7ea52 Community        1        2016   
 003716394a2

In [23]:
# Plot experience bar charts for the most recent snapshot only, as it is the
# only one used in the report right now. Slicing with [:1] replaces the former
# loop + unconditional break and also copes with an empty exp_df_list.
staff_orgs = ['Mozilla Staff']

for exp_df in exp_df_list[:1]:
    if exp_df.empty:
        continue

    # Every row of a snapshot shares the same 'last_active' year; use it for
    # the titles instead of a hard-coded year.
    snapshot_year = str(exp_df['last_active'].unique()[0])

    # Second index level is the organization (index is (Author, Org)).
    is_staff = exp_df.index.get_level_values(1).isin(staff_orgs)

    ut.print_horizontal_bar_chart(exp_df, 'exp', title=project_name + ' ' + snapshot_year, min_range=1)

    employees_df = exp_df[is_staff]
    if len(employees_df) > 0:
        ut.print_horizontal_bar_chart(employees_df, 'exp',
                                      title=project_name + ' employees ' + snapshot_year, min_range=1)

    non_employees_df = exp_df[~is_staff]
    if len(non_employees_df) > 0:
        ut.print_horizontal_bar_chart(non_employees_df, 'exp',
                                      title=project_name + ' non-employees ' + snapshot_year, min_range=1)

### Evolution of Experience

The next table and plot show how each group changes over time. This way we can visualize how new people come and remain in the community. It is worth noting that we are not following a given group of people through time (that could be done by following diagonals in the table; we look at this in the next section), but looking at how a given group changes from one year to another.

For instance, if we look at the group with 2 years of experience in 2008, we see it had 230 people. If we look at the same group in 2009, we see that our **new group** of people accumulating 2 years of experience has 207 people. So, it seems we have fewer people with 2 years of experience in 2009. If we look at 2016, we find 763 people with two years of experience, so we have more people with 2 years of experience nowadays than we had 8 years ago.

Table can be read as follows:

* Cell values correspond to the number of contributors in the given group.
* Rows correspond to groups based on years of experience.
* Columns correspond to the years we are analyzing. 

In [19]:
# Table with one row per experience group (1..max years) and one column per
# snapshot year; each cell is the number of authors in that group.
exp_groups_evo_df = pandas.DataFrame(columns=['exp'])

for snapshot_df in exp_df_list:

    if not snapshot_df.empty:
        # All rows of a snapshot share the same last active year
        year = snapshot_df['last_active'].unique()[0]
        exp_groups_df = pandas.DataFrame(columns=['exp', year])

        # One row per experience value, including values nobody has (count 0)
        max_exp = int(snapshot_df['exp'].max())
        for exp in range(1, max_exp + 1):
            count = int((snapshot_df['exp'] == exp).sum())
            exp_groups_df.loc[len(exp_groups_df)] = [exp, count]

        # Add this year's column, aligning rows on the experience value
        exp_groups_evo_df = exp_groups_evo_df.merge(exp_groups_df, on='exp', how='outer')

# Groups missing in a given year count as 0; index by experience and order
# the year columns.
exp_groups_evo_df = (
    exp_groups_evo_df
    .fillna(0)
    .set_index('exp')
    .sort_index(axis=1)
)

exp_groups_evo_df

Unnamed: 0_level_0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,390.0,498.0,536.0,709.0,973.0,1139.0,1623.0,2358.0,2564.0,2454.0
2.0,63.0,230.0,207.0,206.0,237.0,390.0,441.0,595.0,715.0,763.0
3.0,40.0,14.0,105.0,138.0,141.0,149.0,241.0,312.0,346.0,342.0
4.0,27.0,13.0,10.0,83.0,116.0,102.0,109.0,187.0,212.0,225.0
5.0,14.0,2.0,10.0,7.0,78.0,93.0,79.0,82.0,156.0,160.0
6.0,15.0,0.0,2.0,10.0,5.0,68.0,79.0,69.0,68.0,129.0
7.0,6.0,5.0,0.0,2.0,10.0,4.0,55.0,69.0,66.0,56.0
8.0,9.0,0.0,3.0,1.0,2.0,7.0,3.0,51.0,64.0,59.0
9.0,6.0,4.0,0.0,3.0,0.0,1.0,7.0,2.0,47.0,54.0
10.0,2.0,2.0,4.0,0.0,2.0,0.0,1.0,6.0,2.0,40.0


### Employees

In [20]:
# Same experience-by-year table as the overall one, restricted to authors
# whose organization is 'Mozilla Staff'.
exp_groups_evo_moz_df = pandas.DataFrame(columns=['exp'])

for exp_df in exp_df_list:

    if exp_df.empty:
        continue

    # Keep only staff rows (second index level is the organization)
    exp_df = exp_df[[group2 in ['Mozilla Staff'] for group1, group2 in exp_df.index]]

    # A snapshot may contain no staff at all; skip it instead of failing on
    # unique()[0] below (the non-employees table has the same guard).
    if len(exp_df) == 0:
        continue

    year = exp_df['last_active'].unique()[0]
    exp_groups_df = pandas.DataFrame(columns=['exp', year])

    # One row per experience value from 1 to the maximum found this year
    experience = list(range(1, int(exp_df['exp'].max()) + 1))

    for exp in experience:
        count = len(exp_df.loc[exp_df['exp'] == exp])
        exp_groups_df.loc[len(exp_groups_df)] = [exp, count]

    # Add this year's column, aligning rows on the experience value
    exp_groups_evo_moz_df = exp_groups_evo_moz_df.merge(exp_groups_df, on='exp', how='outer')


# Fill NaN with 0's
exp_groups_evo_moz_df = exp_groups_evo_moz_df.fillna(0)

# Reorder columns
exp_groups_evo_moz_df = exp_groups_evo_moz_df.set_index('exp')
exp_groups_evo_moz_df = exp_groups_evo_moz_df.sort_index(axis=1)

exp_groups_evo_moz_df

Unnamed: 0_level_0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,125.0,138.0,141.0,156.0,250.0,285.0,303.0,250.0,179.0,125.0
2.0,8.0,96.0,108.0,102.0,103.0,192.0,213.0,234.0,168.0,146.0
3.0,10.0,8.0,81.0,89.0,88.0,85.0,169.0,199.0,184.0,124.0
4.0,2.0,10.0,6.0,72.0,80.0,72.0,76.0,147.0,161.0,140.0
5.0,0.0,2.0,10.0,4.0,68.0,70.0,63.0,62.0,127.0,127.0
6.0,3.0,0.0,2.0,10.0,4.0,60.0,65.0,55.0,56.0,110.0
7.0,0.0,3.0,0.0,2.0,10.0,4.0,50.0,57.0,57.0,48.0
8.0,4.0,0.0,2.0,1.0,1.0,7.0,3.0,46.0,54.0,50.0
9.0,2.0,4.0,0.0,2.0,0.0,1.0,7.0,2.0,42.0,44.0
10.0,0.0,2.0,4.0,0.0,2.0,0.0,1.0,6.0,2.0,36.0


### Non-employees

In [21]:
# Same experience-by-year table as the overall one, restricted to authors
# whose organization is anything other than 'Mozilla Staff'.
exp_groups_evo_others_df = pandas.DataFrame(columns=['exp'])

for snapshot_df in exp_df_list:

    if snapshot_df.empty:
        continue

    # Second index level is the organization; drop the staff rows
    others_mask = [org != 'Mozilla Staff' for _, org in snapshot_df.index]
    others_df = snapshot_df[others_mask]

    if others_df.empty:
        continue

    # All rows of a snapshot share the same last active year
    year = others_df['last_active'].unique()[0]
    exp_groups_df = pandas.DataFrame(columns=['exp', year])

    # One row per experience value, including values nobody has (count 0)
    max_exp = int(others_df['exp'].max())
    for exp in range(1, max_exp + 1):
        count = int((others_df['exp'] == exp).sum())
        exp_groups_df.loc[len(exp_groups_df)] = [exp, count]

    # Add this year's column, aligning rows on the experience value
    exp_groups_evo_others_df = exp_groups_evo_others_df.merge(exp_groups_df, on='exp', how='outer')

# Groups missing in a given year count as 0; index by experience and order
# the year columns.
exp_groups_evo_others_df = (
    exp_groups_evo_others_df
    .fillna(0)
    .set_index('exp')
    .sort_index(axis=1)
)

exp_groups_evo_others_df

Unnamed: 0_level_0,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016
exp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1.0,265.0,360.0,395.0,553.0,723.0,854.0,1320.0,2108.0,2385.0,2329.0
2.0,55.0,134.0,99.0,104.0,134.0,198.0,228.0,361.0,547.0,617.0
3.0,30.0,6.0,24.0,49.0,53.0,64.0,72.0,113.0,162.0,218.0
4.0,25.0,3.0,4.0,11.0,36.0,30.0,33.0,40.0,51.0,85.0
5.0,14.0,0.0,0.0,3.0,10.0,23.0,16.0,20.0,29.0,33.0
6.0,12.0,0.0,0.0,0.0,1.0,8.0,14.0,14.0,12.0,19.0
7.0,6.0,2.0,0.0,0.0,0.0,0.0,5.0,12.0,9.0,8.0
8.0,5.0,0.0,1.0,0.0,1.0,0.0,0.0,5.0,10.0,9.0
9.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,5.0,10.0
10.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0


Next plot can be read as follows:
* Y axis corresponds to the number of contributors in the given group.
* X axis corresponds to years we are looking through.
* Each line corresponds to a given group based on their years of experience. 

In [22]:
plotly.offline.init_notebook_mode(connected=True)

# One trace per experience group (one row of exp_groups_evo_df):
# snapshot years on the X axis, number of contributors on the Y axis.
data = []
for exp in exp_groups_evo_df.index.values:
    yearly_counts = exp_groups_evo_df.loc[exp]
    trace = go.Scatter(
        x=yearly_counts.index.values,
        y=yearly_counts.tolist(),
        mode='lines+markers',
        name='{} years'.format(int(exp))
    )
    data.append(trace)

plotly.offline.iplot(data, filename='line-mode')