### Metric Calculations
First we need to open a connection to the proper Elasticsearch (ES) instance. We use an external module to load credentials from a file that will not be shared. If you want to run this notebook, please use your own credentials: put them in a file named '.settings' (in the same directory as this notebook), following the example file 'settings.sample'.

To work with **survey results**, this notebook expects to find the following files:
* **UUID's**: ../data/uuids.csv
* **SURVEY**: ../data/survey-fake.csv

**If you need to use a different survey file, just modify the file path in the second code cell below this one.**

In [1]:
import pandas

import util as ut

from util import ESConnection

# Connection object handed to every `ut.create_search` call in the cells below.
# Per the introduction above, credentials are loaded by the external `util`
# module from a local '.settings' file.
es_conn = ESConnection()

In [2]:
# Load the survey dataframe from the survey CSV and the
# (response id, uuid) correspondence CSV.

############# MODIFY YOUR SURVEY CSV FILE HERE ######################################
survey_df = ut.load_survey_df(
    uuids_filepath='../data/uuids.csv',
    survey_filepath='../data/survey-fake.csv',
)
#####################################################################################

# Row count vs. distinct uuid count: equal numbers mean no uuid is repeated.
print('Identities with UUID found in survey file: ', len(survey_df))
print('Unique identities found: ', len(survey_df['uuid'].unique()))

def add_common_filters(source, s):
    """Restrict a search to documents authored by survey respondents.

    Applies a `terms` filter on `author_uuid` built from every value of the
    `uuid` column of the notebook-level `survey_df`.

    A per-source creation-date filter (`creation_ts` for 'bugzilla',
    `grimoire_creation_date` for any other source, both limited to
    `lt: 'now/y'`) used to be applied here and is currently disabled, which
    is why `source` is accepted but not used.

    :param source: name of the data source being queried (e.g. 'git').
    :param s: search object exposing a `filter` method.
    :return: whatever `s.filter(...)` returns, i.e. the filtered search.
    """
    survey_uuids = survey_df['uuid'].tolist()
    return s.filter('terms', author_uuid=survey_uuids)

Identities with UUID found in survey file:  327
Unique identities found:  327


In [3]:
# Display the survey dataframe loaded in the previous cell.
survey_df

Unnamed: 0,uuid,active,age,country,gender,disability,education level,language,english proficiency,coding
0,8e750cd848ad5a102edcadd5b3e93c928db87b06,"Yes, I have contributed within the past year.",18-29,United States,Another Gender (write in),I wish to voluntarily share that my disability...,Prefer not to say,Other (please specify),Strong,Coding
1,e911a3c6fb7e95aa13899a09dcbf32ae94095c3d,"Yes, I have contributed within the past year.",18-29,Zimbabwe,Male,I am not a person with a disability,Bachelor’s Degree,English,Extremely strong,
2,7c63ef173f359e3b75e42e10dcec4716eebab452,"Yes, I have contributed within the past year.",18-29,Venezuela,Prefer not to say,I am not a person with a disability,Bachelor’s Degree,Other (please specify),Strong,Coding
3,046a18bb30a059adf370af956ad52e41f96851b0,"Yes, I have contributed within the past year.",30-39,Venezuela,Male,I am not a person with a disability,Bachelor’s Degree,English,Neither strong nor weak,
4,10c7ab6a3b37f0a67bc19c572678a2f8731c621b,"No, I have not contributed within the past yea...",18-29,Uruguay,Male,,Bachelor’s Degree,Hindi,Strong,
5,18f5f7c104725182c244c42bd3621756703e0c9c,"Yes, I have contributed within the past year.",18-29,Uruguay,Male,I am not a person with a disability,Bachelor’s Degree,English,Strong,
6,c88512a42bbe7d9b1d283adbfd59a45499343583,"Yes, I have contributed within the past year.",30-39,United States,Female,I am not a person with a disability,Postgraduate Degree,English,Strong,
7,02dc242268e8632b3069471be829025be76fed8a,"Yes, I have contributed within the past year.",18-29,United States,Male,I am not a person with a disability,Bachelor’s Degree,Other (please specify),Neither strong nor weak,Coding
8,23f18b9d13fe1cc102c5dff690a6ca2eb2890ba3,"Yes, I have contributed within the past year.",18-29,United States,Female,,Bachelor’s Degree,Hindi,Strong,Coding
9,0cd3d032f400e26364e884e768187a99f8d84556,"No, I have not contributed within the past yea...",65+,United States,Male,I am not a person with a disability,Postgraduate Degree,English,Extremely strong,


In [4]:
### GET AUTHORS IN GIT, THEIR PROJECTS AND COMMITS
source = 'git'
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: one bucket per author uuid, split per project, with the
# number of distinct commit hashes as the metric of each project bucket.
# NOTE(review): Elasticsearch caps `precision_threshold` at 40000, so the value
# below behaves like 40000 and larger counts are approximate.
(s.aggs
    .bucket('uuid', 'terms', field='author_uuid', size=100000)
    .bucket('project', 'terms', field='project', size=100000)
    .metric('commits', 'cardinality', field='hash', precision_threshold=1000000))

result = s.execute()

# Flatten the nested buckets into one row per (uuid, git project).
author_commits_df = ut.stack_by(
    result=result,
    group_field='uuid', group_column='uuid',
    subgroup_field='project', subgroup_column='git project',
    value_field='commits', value_column='commits',
)

print('AUTHORS FOUND IN GIT: ', len(author_commits_df['uuid'].unique()))

AUTHORS FOUND IN GIT:  199


In [5]:
### GET AUTHORS IN BUGZILLA, THEIR PROJECTS AND NUMBER OF BUGS
source = 'bugzilla'
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: one bucket per author uuid, split per project.
(s.aggs
    .bucket('uuid', 'terms', field='author_uuid', size=100000)
    .bucket('project', 'terms', field='project', size=100000))

result = s.execute()

# Flatten the nested buckets into one row per (uuid, bugzilla project).
# No value_field is given; presumably stack_by falls back to the bucket
# document count for the 'bugs' column -- confirm in util.stack_by.
author_bugs_df = ut.stack_by(
    result=result,
    group_field='uuid', group_column='uuid',
    subgroup_field='project', subgroup_column='bugzilla project',
    value_column='bugs',
)

print('AUTHORS FOUND IN BUGZILLA: ', len(author_bugs_df['uuid'].unique()))

AUTHORS FOUND IN BUGZILLA:  226


In [6]:
### GET AUTHORS IN MAILING LISTS, THEIR PROJECTS AND NUMBER OF EMAILS
source = 'mbox'
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: one bucket per author uuid, split per project.
(s.aggs
    .bucket('uuid', 'terms', field='author_uuid', size=100000)
    .bucket('project', 'terms', field='project', size=100000))

result = s.execute()

# Flatten the nested buckets into one row per (uuid, mbox project).
# No value_field is given; presumably stack_by falls back to the bucket
# document count for the 'emails' column -- confirm in util.stack_by.
author_emails_df = ut.stack_by(
    result=result,
    group_field='uuid', group_column='uuid',
    subgroup_field='project', subgroup_column='mbox project',
    value_column='emails',
)

print('AUTHORS FOUND IN MBOX: ', len(author_emails_df['uuid'].unique()))

AUTHORS FOUND IN MBOX:  167


In [7]:
### GET AUTHORS IN DISCOURSE, THEIR PROJECTS AND NUMBER OF MESSAGES
source = 'discourse'
s = add_common_filters(source, ut.create_search(es_conn, source))

# Nested aggregation: one bucket per author uuid, split per project.
(s.aggs
    .bucket('uuid', 'terms', field='author_uuid', size=100000)
    .bucket('project', 'terms', field='project', size=100000))

result = s.execute()

# Flatten the nested buckets into one row per (uuid, discourse project).
# No value_field is given; presumably stack_by falls back to the bucket
# document count for the 'messages' column -- confirm in util.stack_by.
author_discourse_df = ut.stack_by(
    result=result,
    group_field='uuid', group_column='uuid',
    subgroup_field='project', subgroup_column='discourse project',
    value_column='messages',
)

print('AUTHORS FOUND IN DISCOURSE: ', len(author_discourse_df['uuid'].unique()))

AUTHORS FOUND IN DISCOURSE:  98


In [8]:
### GET AUTHORS IN GITHUB, THEIR PROJECTS AND NUMBER OF ISSUES AND PR's
# Pull requests and issues live in the same index and are told apart by the
# `pull_request` field, so the index is queried twice and the two results are
# merged per (uuid, github project).
source = 'github_issues'

# PULL REQUESTS
s_prs = ut.create_search(es_conn, source)

s_prs = add_common_filters(source, s_prs)

# Get only PRs
s_prs = s_prs.filter('terms', pull_request=['true'])

# Bucketize by uuid and project. The project bucket size matches the one used
# for every other data source (100000) so that an author's project list is
# never truncated to the first 100 projects.
s_prs.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000) \
    .bucket('project', 'terms', field='project', size=100000)
result_prs = s_prs.execute()


# ISSUES
s_iss = ut.create_search(es_conn, source)

s_iss = add_common_filters(source, s_iss)

# Get only Issues
s_iss = s_iss.filter('terms', pull_request=['false'])

# Same bucketing (and same sizes) as for pull requests.
s_iss.aggs.bucket('uuid', 'terms', field='author_uuid', size=100000) \
    .bucket('project', 'terms', field='project', size=100000)
result_iss = s_iss.execute()


# MERGE PR's and ISSUES

prs_df = ut.stack_by(result=result_prs, group_column='uuid', subgroup_column='github project',
                     value_column='pull requests', group_field='uuid', subgroup_field='project')
iss_df = ut.stack_by(result=result_iss, group_column='uuid', subgroup_column='github project',
                     value_column='issues', group_field='uuid', subgroup_field='project')

# Outer merge keeps authors/projects that only have PRs or only have issues;
# the side with no activity is then filled with 0.
github_df = pandas.merge(prs_df, iss_df, how='outer', on=['uuid', 'github project'])
github_df = github_df.fillna(0)

print('AUTHORS FOUND IN GITHUB: ', len(github_df['uuid'].unique()))

AUTHORS FOUND IN GITHUB:  200


In [9]:
# Stack the per-source activity frames on top of each other; the outer index
# level records which data source each row came from.
authors_global_df = pandas.concat(
    [author_commits_df, author_emails_df, github_df, author_bugs_df, author_discourse_df],
    keys=['git', 'mbox', 'github', 'bugzilla', 'discourse'],
)

# Per author: activity counters are summed, project columns are reduced to the
# number of distinct projects.
count_columns = ['emails', 'commits', 'pull requests', 'issues', 'bugs', 'messages']
project_columns = ['git project', 'mbox project', 'github project',
                   'bugzilla project', 'discourse project']

aggregations = {column: 'sum' for column in count_columns}
aggregations.update({column: pandas.Series.nunique for column in project_columns})

# Column order used for display.
display_columns = ['uuid', 'commits', 'pull requests', 'issues', 'bugs', 'emails', 'messages',
                   'git project', 'github project', 'bugzilla project', 'mbox project',
                   'discourse project']

# Aggregate, rank authors by number of commits (highest first) and reorder columns.
grouped_df = (
    authors_global_df
    .groupby(['uuid'])
    .agg(aggregations)
    .reset_index()
    .sort_values(by=['commits'], ascending=[False])
    [display_columns]
)

print('AUTHORS FOUND IN GIT, GITHUB, BUGZILLA, MBOX, DISCOURSE: ', len(grouped_df['uuid'].unique()))

grouped_df

AUTHORS FOUND IN GIT, GITHUB, BUGZILLA, MBOX, DISCOURSE:  319


Unnamed: 0,uuid,commits,pull requests,issues,bugs,emails,messages,git project,github project,bugzilla project,mbox project,discourse project
108,27b393eff6d90b0167bb8c87a77cc80c1c8c7c26,3927.0,1.0,3.0,920.0,395.0,,4,1,6,1,0
7,0183bd06d487bf71fd39a568ae892e3ec3079d82,3660.0,1163.0,487.0,1.0,5.0,,3,2,1,1,0
85,1ed27760bba7202e2ae04760fe950a05e011c4de,3537.0,32.0,83.0,1689.0,8122.0,,5,5,19,15,0
157,43ff4940e0239a03a3cf8459061c4c7fb8ef91e7,3241.0,186.0,163.0,2298.0,399.0,,4,4,12,3,0
299,c71f72642e9864e525582f46e11baa446dbaf69c,2632.0,1.0,0.0,328.0,93.0,,4,1,6,1,0
111,289bdd4b2e22cd4a0e32b4b0ba990d6af13d2db9,2007.0,498.0,154.0,163.0,26.0,1.0,4,3,3,2,1
122,2daaf63704e757c1154c7a6762ffa2571fb644af,1984.0,1139.0,244.0,272.0,25.0,,7,7,8,4,0
33,0d121ebd63579eedb0336574053f601f8a6e7e73,1822.0,47.0,1.0,2131.0,5757.0,63.0,4,1,16,12,5
302,cbf91b5771b5d8dec38f1c67d7fe3a0fe5ec6e36,1696.0,1.0,0.0,165.0,20.0,,2,1,8,2,0
265,9b981479e05cbba272b3d2376a90143e25c0de81,1498.0,29.0,207.0,21.0,223.0,77.0,4,2,8,7,4


In [10]:
### USERS WITH NO ACTIVITY OR COLLABORATING IN UNTRACKED PROJECTS
# Compute both uuid collections once. The original loop rebuilt
# `grouped_df.uuid.unique()` and scanned it linearly for every survey uuid.
survey_uuids = survey_df.uuid.unique()
tracked_uuids = grouped_df.uuid.unique()
tracked_uuid_set = set(tracked_uuids)  # constant-time membership checks

print('Authors matched in Survey: ', len(survey_uuids))
print('Authors found in indexes & projects: ', len(tracked_uuids))
print('\nAuthors matched in survey but not found in any tracked project:\n')
# Iterate in survey order so the listing order is unchanged.
for survey_uuid in survey_uuids:
    if survey_uuid not in tracked_uuid_set:
        print(survey_uuid)

Authors matched in Survey:  327
Authors found in indexes & projects:  319

Authors matched in survey but not found in any tracked project:

829fa5bd85db080fed69156279859eaa2b43f28c
3920385860048b31cbf8fa86fb94253d81ce7b11
6ba47dca3af4b299f9c37ca3ca47a6cc50da1eba
1bc7f5356fc10ef0903206b127b761b649166566
7fcbe678f1c114ecb2fe9ae808c924307aae0684
b0a48947d55dcbf12d0dcc16a2969d9fc4522f50
92ee720caaeaac7979cc18f15e4b25f121593a29
f6bcf667f3ae3f4ad00cebad06dd79a110c06e35
