# Contributors to California Civic Data Coalition repositories

By Ben Welsh

This analysis is drawn from the open-source list of contributors compiled by GitHub. It was last harvested on Dec. 18, 2016, [using a Python script that interacts with GitHub's API](https://github.com/california-civic-data-coalition/django-calaccess-raw-data/blob/master/example/network-analysis/contributors.csv).  

In [59]:
import pandas as pd
import numpy as np

## Load in the data

In [60]:
# Load the contributor list (read from the local input folder).
table = pd.read_csv(filepath_or_buffer="./input/contributors.csv")

In [61]:
# Print column names, non-null counts and dtypes for the freshly loaded frame.
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 9 columns):
repo             183 non-null object
login            183 non-null object
name             141 non-null object
email            93 non-null object
company          93 non-null object
location         118 non-null object
bio              27 non-null object
avatar_url       183 non-null object
contributions    183 non-null int64
dtypes: int64(1), object(8)
memory usage: 12.9+ KB


### Clean up strings

In [62]:
# Swap every NaN for an empty string so later comparisons against '' find
# the missing values.
table = table.replace(to_replace=np.nan, value="")

In [63]:
# Trim surrounding whitespace from the text columns that are grouped on or
# joined on later, and lower-case logins so the same account always compares
# equal.
# The vectorised `.str` accessor replaces `.map(str.strip)`, which raises
# TypeError as soon as a column holds a single non-string value (for example
# a NaN left behind when the blank-filling cell has not been run).
table["login"] = table["login"].str.strip().str.lower()
for text_column in ("company", "location", "avatar_url"):
    table[text_column] = table[text_column].str.strip()

### Merge in corrections

In [64]:
# Load the corrections file; it is joined onto `table` by login.
corrections = pd.read_csv(
    filepath_or_buffer="./input/contributors-corrections.csv"
)

In [65]:
# `table.login` was stripped and lower-cased during clean-up, so the join key
# in the corrections gets the same treatment. Without this, a correction whose
# login differs only by case or stray whitespace silently fails to match.
corrections_by_login = corrections.copy()
corrections_by_login["login"] = (
    corrections_by_login["login"].str.strip().str.lower()
)

row_count_before_merge = len(table)
table = table.merge(corrections_by_login, on="login", how="left")

# A left join must never add rows. If it does, a login appears more than once
# in the corrections and every contribution total computed afterwards would
# be inflated.
assert len(table) == row_count_before_merge, (
    "corrections contain a repeated login; merge duplicated contributor rows"
)

In [66]:
# Use the corrected value where one was merged in; otherwise keep the
# existing value.
for field in ("company", "location"):
    table[field] = table["corrected_" + field].fillna(table[field])

In [67]:
# The helper columns have been folded into company/location; remove both.
table = table.drop(["corrected_company", "corrected_location"], axis=1)

#### People still missing a company

In [68]:
# Number of rows whose company is still blank.
len(table.loc[table["company"] == ""])

21

In [69]:
# Distinct logins behind the blank-company rows.
table.loc[table["company"] == "", "login"].unique()

array(['tocateunvals', 'yujiap', 'karkinosw', 'malon', 'katbuchholz',
       'lauragomezrod', 'mjlorda', 'annkiha', 'cecht', 'nicolewest',
       'pumadegit', 'danmit', 'jayelle-o', 'lengsj', 'mb10', 'mikereicher',
       'mmhirsch', 'regirob831', 'samlo78', 'soorinkimmm', 'rkiddy'], dtype=object)

#### People missing a location 

In [70]:
# Number of rows whose location is blank.
len(table.loc[table["location"] == ""])

45

In [71]:
# Distinct logins behind the blank-location rows.
table.loc[table["location"] == "", "login"].unique()

array(['sourcedouglas', 'benlk', 'yujiap', 'karkinosw', 'malon',
       'mattdatajourno', 'duner', 'joshuarrrr', 'wcraft', 'bdin',
       'desireedesario', 'jlagetz', 'katbuchholz', 'lauragomezrod',
       'mjlorda', 'absolutevan', 'anabecker', 'annkiha', 'cecht',
       'fagerlise', 'jennbrandel', 'nicolewest', 'pumadegit',
       'saraschnadt', 'taraadiseshan', 'aquintero4354', 'danmit',
       'ebonymarieb', 'jayelle-o', 'latams', 'lehrennyt', 'lengsj', 'mb10',
       'merbroussard', 'mijebner', 'mikereicher', 'mmhirsch', 'nikkiusher',
       'regirob831', 'samlo78', 'soorinkimmm', 'vromney', 'yanofsky',
       'kissane'], dtype=object)

In [87]:
# Preview the first five rows.
# NOTE(review): the saved output for this cell includes `in_coalition`, a
# column that is only created further down in the notebook, and the cell's
# execution count is higher than the cells that follow it. It was run out of
# order; on a fresh top-to-bottom run that column will not appear here.
table.head(n=5)

Unnamed: 0,repo,login,name,email,company,location,bio,avatar_url,contributions,in_coalition
0,django-calaccess-raw-data,palewire,Ben Welsh,ben.welsh@gmail.com,Los Angeles Times,Los Angeles,"Editor, @datadesk. Organizer, @california-civi...",https://avatars.githubusercontent.com/u/9993?v=3,1054,True
1,django-calaccess-raw-data,gordonje,James Gordon,gordon.je@gmail.com,California Civic Data Coalition,"Columbia, MO",,https://avatars.githubusercontent.com/u/430454...,728,True
2,django-calaccess-raw-data,aboutaaron,Aaron Williams,aaron.colby.williams@gmail.com,The Washington Post,"Washington, D.C.",,https://avatars.githubusercontent.com/u/856628...,144,True
3,django-calaccess-raw-data,armendariz,Agustin Armendariz,,New York Times,"New York, NY",,https://avatars.githubusercontent.com/u/952608...,50,True
4,django-calaccess-raw-data,anabranch,Bill Chambers,,Databricks,California,,https://avatars.githubusercontent.com/u/164250...,45,False


### Contribution rankings

In [73]:
# Total contributions per repository, largest first.
(
    table.groupby("repo", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
)

Unnamed: 0,repo,contributions
8,django-calaccess-raw-data,2391
0,california-civic-data-coalition.github.io,806
1,django-calaccess-campaign-browser,731
4,django-calaccess-downloads-website,718
6,django-calaccess-processed-data,288
9,django-calaccess-technical-documentation,229
10,django-postgres-copy,101
11,python-calaccess-notebooks,62
2,django-calaccess-cookbook,30
3,django-calaccess-docker,29


In [74]:
# Ten logins with the most contributions across all repositories.
(
    table.groupby("login", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(n=10)
)

Unnamed: 0,login,contributions
114,palewire,2932
50,gordonje,1329
0,aboutaaron,379
124,sahilchinoy,126
13,armendariz,69
7,anabranch,47
11,anthonyjpesce,40
21,caseymm,33
60,jjelosua,29
136,tocateunvals,25


In [75]:
# Ten companies with the most contributions. Rows with a blank company are
# grouped together under the empty string.
(
    table.groupby("company", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(n=10)
)

Unnamed: 0,company,contributions
45,Los Angeles Times,3017
14,California Civic Data Coalition,1329
81,The Washington Post,382
83,UC Berkeley,126
54,New York Times,72
0,,64
23,Databricks,47
48,Mozilla OpenNews,37
88,Vox Media,33
50,NPR,29


### In coalition vs. out of coalition

In [76]:
# Logins flagged as "in coalition" for the comparison below.
login_list = [
    'palewire', 'gordonje', 'sahilchinoy',
    'aboutaaron', 'armendariz', 'cphillips',
]
# Boolean flag: True when the row's login is one of the names above.
table['in_coalition'] = table['login'].isin(login_list)

In [77]:
# Row counts split by the in_coalition flag.
table.groupby('in_coalition').size()

in_coalition
False    150
True      33
dtype: int64

In [78]:
# Share of rows on each side of the in_coalition flag.
rows_by_membership = table.groupby('in_coalition').size()
rows_by_membership / rows_by_membership.sum()

in_coalition
False    0.819672
True     0.180328
dtype: float64

In [79]:
# Total contributions on each side of the in_coalition flag.
table.groupby('in_coalition')['contributions'].sum()

in_coalition
False     581
True     4835
Name: contributions, dtype: int64

In [80]:
# Share of all contributions on each side of the in_coalition flag.
contributions_by_membership = table.groupby('in_coalition')['contributions'].sum()
contributions_by_membership / contributions_by_membership.sum()

in_coalition
False    0.107275
True     0.892725
Name: contributions, dtype: float64

## Location

In [81]:
# Ten locations with the most contributions. Locations are grouped by their
# exact text, and blank locations are grouped under the empty string.
(
    table.groupby("location", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(n=10)
)

Unnamed: 0,location,contributions
29,Los Angeles,2976
16,"Columbia, MO",1330
55,"Washington, D.C.",383
5,"Berkeley, CA",127
35,"New York, NY",126
0,,91
12,California,47
30,"Los Angeles, CA",40
56,"Washington, DC",36
14,Chicago,26


## Unique contributors

In [82]:
# Number of rows per repository, largest first.
# `as_index=False` is dropped on purpose. With it, `.size()` returns a Series
# on older pandas but a DataFrame with a "size" column on pandas >= 1.1, where
# the following `sort_values(0)` fails with a KeyError. Grouping with the
# default index always yields a Series, so `reset_index()` always produces the
# `repo` column plus a counts column labelled 0.
table.groupby("repo").size().reset_index().sort_values(0, ascending=False)

Unnamed: 0,repo,0
8,django-calaccess-raw-data,132
1,django-calaccess-campaign-browser,18
0,california-civic-data-coalition.github.io,7
4,django-calaccess-downloads-website,7
9,django-calaccess-technical-documentation,5
5,django-calaccess-lobbying-browser,3
6,django-calaccess-processed-data,3
10,django-postgres-copy,3
11,python-calaccess-notebooks,2
2,django-calaccess-cookbook,1


In [83]:
# Collapse the per-repository rows into one row per distinct
# (login, company, location, avatar_url) combination, summing contributions.
identity_columns = ["login", "company", "location", "avatar_url"]
unique_contributors = (
    table.groupby(identity_columns)["contributions"].sum().reset_index()
)

In [84]:
# Print column names, non-null counts and dtypes for the collapsed frame.
unique_contributors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 5 columns):
login            144 non-null object
company          144 non-null object
location         144 non-null object
avatar_url       144 non-null object
contributions    144 non-null int64
dtypes: int64(1), object(4)
memory usage: 5.7+ KB


In [85]:
# First 25 contributors in alphabetical order of login.
unique_contributors.sort_values(by="login").head(n=25)

Unnamed: 0,login,company,location,avatar_url,contributions
0,aboutaaron,The Washington Post,"Washington, D.C.",https://avatars.githubusercontent.com/u/856628...,379
1,absolutevan,Arizona Center for Investigative Reporting,,https://avatars.githubusercontent.com/u/165300...,1
2,achavez,@DallasMorningNews,"Dallas, TX",https://avatars.githubusercontent.com/u/682828...,1
3,acoreynews,Arizona Republic,"Las Vegas, NV",https://avatars.githubusercontent.com/u/562616...,1
4,amandabee,@buzzfeed-openlab,"San Francisco, CA",https://avatars.githubusercontent.com/u/150791...,7
5,amzam,Texas Tribune,"Austin, TX",https://avatars.githubusercontent.com/u/579612...,2
6,anabecker,Wall Street Journal,,https://avatars.githubusercontent.com/u/265281...,1
7,anabranch,Databricks,California,https://avatars.githubusercontent.com/u/164250...,47
8,anniedaniel,Texas Tribune,"Austin, TX",https://avatars.githubusercontent.com/u/512549...,1
9,annkiha,,,https://avatars.githubusercontent.com/u/113393...,1


In [86]:
# Write the collapsed contributor list to disk. `index=True` is the pandas
# default, spelled out here: the row index is written as the first, unnamed
# column of the CSV.
unique_contributors.to_csv(
    "./output/unique-contributors.csv",
    index=True,
)