# Contributors to California Civic Data Coalition repositories

By Ben Welsh

This analysis is drawn from the open-source list of contributors compiled by GitHub. It was last harvested on Dec. 18, 2016, [using a Python script that interacts with GitHub's API](https://github.com/california-civic-data-coalition/django-calaccess-raw-data/blob/master/example/network-analysis/contributors.csv).  

In [261]:
import pandas as pd
import numpy as np

## Load in the data

In [262]:
table = pd.read_csv("./input/contributors.csv")

In [263]:
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 9 columns):
repo             183 non-null object
login            183 non-null object
name             141 non-null object
email            93 non-null object
company          93 non-null object
location         118 non-null object
bio              27 non-null object
avatar_url       183 non-null object
contributions    183 non-null int64
dtypes: int64(1), object(8)
memory usage: 12.9+ KB


### Clean up strings

In [264]:
table.replace(np.nan, "", inplace=True)

In [265]:
table.login = table.login.map(str.strip).str.lower()
table.company = table.company.map(str.strip)
table.location = table.location.map(str.strip)
table.avatar_url = table.avatar_url.map(str.strip)

### Merge in corrections

In [266]:
corrections = pd.read_csv("./input/contributors-corrections.csv")

In [267]:
table = table.merge(corrections, on="login", how="left")

In [268]:
table.company = table.corrected_company.fillna(table.company)
table.location = table.corrected_location.fillna(table.location)

In [269]:
table.drop('corrected_company', axis=1, inplace=True)
table.drop('corrected_location', axis=1, inplace=True)

### Merge some common variations

In [270]:
table.loc[table.location == 'Los Angeles', 'location'] = 'Los Angeles, CA'
table.loc[table.location == 'Washington D.C.', 'location'] = 'Washington, DC'
table.loc[table.location == 'Washington, D.C.', 'location'] = 'Washington, DC'
table.loc[table.location == 'Chicago', 'location'] = 'Chicago, IL'

### Identify gaps

#### People missing a name 

In [271]:
len(table[table.name == ''])

42

In [272]:
table[table.name == ''].login.unique()

array(['yujiap', 'karkinosw', 'malon', 'mattdatajourno', 'joshuarrrr',
       'tbone', 'momiperalta', 'amzam', 'desireedesario', 'katbuchholz',
       'lauragomezrod', 'mjlorda', 'absolutevan', 'annkiha', 'cecht',
       'fagerlise', 'jennbrandel', 'nicolewest', 'pumadegit',
       'saraschnadt', 'aquintero4354', 'danmit', 'ebonymarieb', 'fcoel',
       'jayelle-o', 'latams', 'lehrennyt', 'lengsj', 'maloym', 'mb10',
       'melissaleu', 'merbroussard', 'mijebner', 'mikereicher', 'mjenner',
       'mmhirsch', 'nikkiusher', 'regirob831', 'samlo78', 'soorinkimmm',
       'tchristianmiller', 'yanofsky'], dtype=object)

#### People missing a company

In [273]:
len(table[table.company == ''])

21

In [274]:
table[table.company == ''].login.unique()

array(['tocateunvals', 'yujiap', 'karkinosw', 'malon', 'katbuchholz',
       'lauragomezrod', 'mjlorda', 'annkiha', 'cecht', 'nicolewest',
       'pumadegit', 'jayelle-o', 'lengsj', 'mb10', 'mikereicher',
       'mmhirsch', 'regirob831', 'samlo78', 'soorinkimmm', 'vromney',
       'rkiddy'], dtype=object)

#### People missing a location 

In [275]:
len(table[table.location == ''])

33

In [276]:
table[table.location == ''].login.unique()

array(['benlk', 'yujiap', 'karkinosw', 'malon', 'mattdatajourno', 'duner',
       'joshuarrrr', 'wcraft', 'desireedesario', 'jlagetz', 'katbuchholz',
       'lauragomezrod', 'mjlorda', 'absolutevan', 'annkiha', 'cecht',
       'fagerlise', 'nicolewest', 'pumadegit', 'taraadiseshan',
       'aquintero4354', 'jayelle-o', 'lehrennyt', 'lengsj', 'mb10',
       'mijebner', 'mikereicher', 'mmhirsch', 'regirob831', 'samlo78',
       'soorinkimmm', 'vromney'], dtype=object)

#### People missing an email 

In [277]:
len(table[table.email == ''])

90

In [278]:
table[table.email == ''].login.unique()

array(['armendariz', 'anabranch', 'tocateunvals', 'burtherman',
       'sourcedouglas', 'danhillreports', 'yujiap', 'karkinosw', 'malon',
       'enactdev', 'mattdatajourno', 'duner', 'caseypt', 'chrisalcantara',
       'joannalin', 'joshuarrrr', 'tbone', 'carloslemos', 'chagan',
       'underthecurve', 'elainewong', 'livlab', 'momiperalta', 'amzam',
       'bdin', 'desireedesario', 'jlagetz', 'katbuchholz', 'lauragomezrod',
       'mjlorda', 'absolutevan', 'annkiha', 'brianmcgill', 'caseymm',
       'cecht', 'fagerlise', 'jernsthausen', 'jennbrandel',
       'juliewestfall', 'karissa', 'kevinbogardus', 'macdiva', 'miguelpaz',
       'nicolewest', 'pumadegit', 'sahilchinoy', 'saraschnadt',
       'taraadiseshan', 'vigneshramachandran', 'aquintero4354', 'danmit',
       'ebonymarieb', 'fcoel', 'jayelle-o', 'latams', 'lehrennyt',
       'lengsj', 'maloym', 'mb10', 'melissaleu', 'merbroussard',
       'mijebner', 'mikereicher', 'mjenner', 'mmhirsch', 'nikkiusher',
       'qstin', 'regirob

### Contribution rankings

In [279]:
def rank_by_contributions(table, field):
    grouped = table.groupby(field, as_index=False)
    summed = grouped.contributions.sum()
    return summed.sort_values("contributions", ascending=False)

In [280]:
rank_by_contributions(table, 'repo').head(10)

Unnamed: 0,repo,contributions
8,django-calaccess-raw-data,2391
0,california-civic-data-coalition.github.io,806
1,django-calaccess-campaign-browser,731
4,django-calaccess-downloads-website,718
6,django-calaccess-processed-data,288
9,django-calaccess-technical-documentation,229
10,django-postgres-copy,101
11,python-calaccess-notebooks,62
2,django-calaccess-cookbook,30
3,django-calaccess-docker,29


In [281]:
rank_by_contributions(table, 'login').head(10)

Unnamed: 0,login,contributions
114,palewire,2932
50,gordonje,1329
0,aboutaaron,379
124,sahilchinoy,126
13,armendariz,69
7,anabranch,47
11,anthonyjpesce,40
21,caseymm,33
60,jjelosua,29
136,tocateunvals,25


In [282]:
rank_by_contributions(table, 'company').head(10)

Unnamed: 0,company,contributions
42,Los Angeles Times,3017
13,California Civic Data Coalition,1329
78,The Washington Post,382
80,UC Berkeley,126
50,New York Times,72
0,,64
22,Databricks,47
45,Mozilla OpenNews,40
85,Vox Media,33
47,NPR,29


In [283]:
rank_by_contributions(table, 'location').head(10)

Unnamed: 0,location,contributions
29,"Los Angeles, CA",3018
16,"Columbia, MO",1330
55,"Washington, DC",422
34,"New York, NY",152
5,"Berkeley, CA",127
0,,68
12,California,47
14,Chicago,26
2,Argentina,25
53,United States,22


### In coalition vs. out of coalition

In [284]:
login_list = [
    'palewire',
    'gordonje',
    'sahilchinoy',
    'aboutaaron',
    'armendariz',
    'cphillips'
]
table['in_coalition'] = table.login.isin(login_list)

In [285]:
table.groupby('in_coalition').size()

in_coalition
False    150
True      33
dtype: int64

In [286]:
table.groupby('in_coalition').size() / table.groupby('in_coalition').size().sum()

in_coalition
False    0.819672
True     0.180328
dtype: float64

In [287]:
table.groupby('in_coalition').contributions.sum()

in_coalition
False     581
True     4835
Name: contributions, dtype: int64

In [288]:
table.groupby('in_coalition').contributions.sum() / table.groupby('in_coalition').contributions.sum().sum()

in_coalition
False    0.107275
True     0.892725
Name: contributions, dtype: float64

## Unique contributors

In [289]:
table.groupby("repo", as_index=False).size().reset_index().sort_values(0, ascending=False)

Unnamed: 0,repo,0
8,django-calaccess-raw-data,132
1,django-calaccess-campaign-browser,18
0,california-civic-data-coalition.github.io,7
4,django-calaccess-downloads-website,7
9,django-calaccess-technical-documentation,5
5,django-calaccess-lobbying-browser,3
6,django-calaccess-processed-data,3
10,django-postgres-copy,3
11,python-calaccess-notebooks,2
2,django-calaccess-cookbook,1


In [290]:
unique_contributors = table.groupby(["login", "company", "location", "avatar_url"]).contributions.sum().reset_index()

In [291]:
unique_contributors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 5 columns):
login            144 non-null object
company          144 non-null object
location         144 non-null object
avatar_url       144 non-null object
contributions    144 non-null int64
dtypes: int64(1), object(4)
memory usage: 5.7+ KB


In [292]:
unique_contributors.sort_values("login").head(25)

Unnamed: 0,login,company,location,avatar_url,contributions
0,aboutaaron,The Washington Post,"Washington, DC",https://avatars.githubusercontent.com/u/856628...,379
1,absolutevan,Arizona Center for Investigative Reporting,,https://avatars.githubusercontent.com/u/165300...,1
2,achavez,Dallas Morning News,"Dallas, TX",https://avatars.githubusercontent.com/u/682828...,1
3,acoreynews,Arizona Republic,"Las Vegas, NV",https://avatars.githubusercontent.com/u/562616...,1
4,amandabee,BuzzFeed,"San Francisco, CA",https://avatars.githubusercontent.com/u/150791...,7
5,amzam,Texas Tribune,"Austin, TX",https://avatars.githubusercontent.com/u/579612...,2
6,anabecker,Wall Street Journal,"New York, NY",https://avatars.githubusercontent.com/u/265281...,1
7,anabranch,Databricks,California,https://avatars.githubusercontent.com/u/164250...,47
8,anniedaniel,Texas Tribune,"Austin, TX",https://avatars.githubusercontent.com/u/512549...,1
9,annkiha,,,https://avatars.githubusercontent.com/u/113393...,1


In [293]:
unique_contributors.to_csv("./output/unique-contributors.csv")