# Contributors to California Civic Data Coalition repositories

By Ben Welsh

This analysis is drawn from the open-source list of contributors compiled by GitHub. It was last harvested on Dec. 18, 2016, [using a Python script that interacts with GitHub's API](https://github.com/california-civic-data-coalition/django-calaccess-raw-data/blob/master/example/network-analysis/contributors.csv).  

In [1]:
import pandas as pd
import numpy as np

## Load in the data

In [2]:
# Read the harvested contributor list; the relative path is resolved against
# the current working directory.
table = pd.read_csv("./contributors.csv")

In [3]:
# Print each column's dtype and non-null count to show how sparsely the
# optional profile fields (name, email, company, location, bio) are filled in.
table.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 183 entries, 0 to 182
Data columns (total 9 columns):
repo             183 non-null object
login            183 non-null object
name             141 non-null object
email            93 non-null object
company          93 non-null object
location         118 non-null object
bio              27 non-null object
avatar_url       183 non-null object
contributions    183 non-null int64
dtypes: int64(1), object(8)
memory usage: 12.9+ KB


In [4]:
# Swap every missing value for an empty string. The next cell maps str.strip
# over company and location, both of which contain missing values, and
# str.strip raises a TypeError when handed a float NaN.
table = table.replace(np.nan, "")

In [5]:
# Trim surrounding whitespace from the text fields that are grouped on later.
# Logins are additionally lower-cased.
table = table.assign(
    login=table["login"].str.strip().str.lower(),
    company=table["company"].str.strip(),
    location=table["location"].str.strip(),
    avatar_url=table["avatar_url"].str.strip(),
)

In [6]:
# Load the corrections table. It is joined to `table` on `login`, and its
# `corrected_company` column overrides `company` in the cells that follow.
corrections = pd.read_csv("contributors-corrections.csv")

In [7]:
# A login listed more than once in the corrections file would make the left
# join emit one output row per match, duplicating contributor rows and
# inflating every contribution total computed below. Fail loudly instead.
assert corrections["login"].is_unique, "contributors-corrections.csv lists a login more than once"

# Left join: keep every contributor row whether or not a correction exists.
table = table.merge(corrections, on="login", how="left")

In [8]:
# Use the corrected company where the join supplied one; rows without a
# correction keep the value already in `company`.
table["company"] = table["corrected_company"].fillna(table["company"])

In [9]:
# The helper column has been folded into `company` above, so remove it.
table = table.drop("corrected_company", axis=1)

In [10]:
# Show the rows that still have a blank company after corrections were
# applied, ordered by login.
table.loc[table["company"] == ""].sort_values(by="login")

Unnamed: 0,repo,login,name,email,company,location,bio,avatar_url,contributions
67,django-calaccess-raw-data,annkiha,,,,,,https://avatars.githubusercontent.com/u/113393...,1
72,django-calaccess-raw-data,cecht,,,,,,https://avatars.githubusercontent.com/u/112753...,1
106,django-calaccess-raw-data,danmit,,,,,,https://avatars.githubusercontent.com/u/800801...,1
42,django-calaccess-raw-data,elainewong,Elaine Wong,,,Toronto,,https://avatars.githubusercontent.com/u/838923...,2
109,django-calaccess-raw-data,jayelle-o,,,,,,https://avatars.githubusercontent.com/u/168088...,1
21,django-calaccess-raw-data,karkinosw,,,,,,https://avatars.githubusercontent.com/u/140054...,5
58,django-calaccess-raw-data,katbuchholz,,,,,,https://avatars.githubusercontent.com/u/113408...,2
59,django-calaccess-raw-data,lauragomezrod,,,,,,https://avatars.githubusercontent.com/u/178048...,2
113,django-calaccess-raw-data,lengsj,,,,,,https://avatars.githubusercontent.com/u/723122...,1
22,django-calaccess-raw-data,malon,,,,,,https://avatars.githubusercontent.com/u/702708...,5


In [11]:
# Preview the first 25 rows of the cleaned table.
table.head(25)

Unnamed: 0,repo,login,name,email,company,location,bio,avatar_url,contributions
0,django-calaccess-raw-data,palewire,Ben Welsh,ben.welsh@gmail.com,Los Angeles Times,Los Angeles,"Editor, @datadesk. Organizer, @california-civi...",https://avatars.githubusercontent.com/u/9993?v=3,1054
1,django-calaccess-raw-data,gordonje,James Gordon,gordon.je@gmail.com,California Civic Data Coalition,"Columbia, MO",,https://avatars.githubusercontent.com/u/430454...,728
2,django-calaccess-raw-data,aboutaaron,Aaron Williams,aaron.colby.williams@gmail.com,The Washington Post,"Washington, D.C.",,https://avatars.githubusercontent.com/u/856628...,144
3,django-calaccess-raw-data,armendariz,Agustin Armendariz,,New York Times,,,https://avatars.githubusercontent.com/u/952608...,50
4,django-calaccess-raw-data,anabranch,Bill Chambers,,Databricks,California,,https://avatars.githubusercontent.com/u/164250...,45
5,django-calaccess-raw-data,tocateunvals,Luciana Godoy,,,Argentina,,https://avatars.githubusercontent.com/u/785444...,25
6,django-calaccess-raw-data,charlex,Charley Bodkin,charley@bodkin.me,Los Angeles Times,"Los Angeles, CA",,https://avatars.githubusercontent.com/u/154274...,23
7,django-calaccess-raw-data,bcipolli,Ben Cipollini,bcipolli@ucsd.edu,UC San Diego,"San Diego, CA",,https://avatars.githubusercontent.com/u/407245...,20
8,django-calaccess-raw-data,mhkeller,Michael Keller,code@mhkeller.com,Bloomberg,"New York, NY",,https://avatars.githubusercontent.com/u/498744...,19
9,django-calaccess-raw-data,myersjustinc,Justin Myers,justin@justinmyers.net,The Associated Press,Chicago,,https://avatars.githubusercontent.com/u/764865...,18


## Contribution rankings

In [12]:
# Total contributions per repository, largest first.
(
    table
    .groupby("repo", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
)

Unnamed: 0,repo,contributions
8,django-calaccess-raw-data,2391
0,california-civic-data-coalition.github.io,806
1,django-calaccess-campaign-browser,731
4,django-calaccess-downloads-website,718
6,django-calaccess-processed-data,288
9,django-calaccess-technical-documentation,229
10,django-postgres-copy,101
11,python-calaccess-notebooks,62
2,django-calaccess-cookbook,30
3,django-calaccess-docker,29


In [13]:
# Ten logins with the most contributions summed across all repositories.
(
    table
    .groupby("login", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(10)
)

Unnamed: 0,login,contributions
114,palewire,2932
50,gordonje,1329
0,aboutaaron,379
124,sahilchinoy,126
13,armendariz,69
7,anabranch,47
11,anthonyjpesce,40
21,caseymm,33
60,jjelosua,29
136,tocateunvals,25


In [14]:
# Ten companies with the most contributions. Rows with no company are grouped
# under the empty string, so a blank entry can appear in the ranking.
(
    table
    .groupby("company", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(10)
)

Unnamed: 0,company,contributions
45,Los Angeles Times,3017
14,California Civic Data Coalition,1329
81,The Washington Post,382
83,UC Berkeley,126
54,New York Times,72
0,,66
23,Databricks,47
88,Vox Media,33
50,NPR,29
1,( ˘▽˘)っ♨,21


In [15]:
# Ten locations with the most contributions. Locations are grouped on their
# exact text, so differently spelled variants are counted separately.
(
    table
    .groupby("location", as_index=False)["contributions"]
    .sum()
    .sort_values(by="contributions", ascending=False)
    .head(10)
)

Unnamed: 0,location,contributions
26,Los Angeles,2976
14,"Columbia, MO",1329
51,"Washington, D.C.",383
0,,200
5,"Berkeley, CA",126
32,"New York, NY",54
10,California,47
52,"Washington, DC",36
27,"Los Angeles, CA",34
12,Chicago,26


## Unique contributors

In [16]:
# Number of rows per repository, largest first. This equals the number of
# contributors as long as each (repo, login) pair appears once in `table`
# (assumption - confirm against the harvest script).
#
# The groupby deliberately omits as_index=False: on current pandas that flag
# makes size() return a DataFrame with a "size" column, and sorting on column
# 0 then raises KeyError. Without the flag size() is a Series indexed by repo,
# and reset_index() names its values column 0.
(
    table
    .groupby("repo")
    .size()
    .reset_index()
    .sort_values(0, ascending=False)
)

Unnamed: 0,repo,0
8,django-calaccess-raw-data,132
1,django-calaccess-campaign-browser,18
0,california-civic-data-coalition.github.io,7
4,django-calaccess-downloads-website,7
9,django-calaccess-technical-documentation,5
5,django-calaccess-lobbying-browser,3
6,django-calaccess-processed-data,3
10,django-postgres-copy,3
11,python-calaccess-notebooks,2
2,django-calaccess-cookbook,1


In [17]:
# Collapse the per-repo rows into one row per contributor, summing their
# contributions. company, location and avatar_url are part of the key so they
# are carried through to the result.
unique_contributors = (
    table
    .groupby(["login", "company", "location", "avatar_url"])["contributions"]
    .sum()
    .reset_index()
)

In [18]:
# Print the row count and per-column non-null counts of the collapsed frame.
unique_contributors.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 5 columns):
login            144 non-null object
company          144 non-null object
location         144 non-null object
avatar_url       144 non-null object
contributions    144 non-null int64
dtypes: int64(1), object(4)
memory usage: 5.7+ KB


In [19]:
# Preview the first 25 contributors when sorted by login.
unique_contributors.sort_values("login").head(25)

Unnamed: 0,login,company,location,avatar_url,contributions
0,aboutaaron,The Washington Post,"Washington, D.C.",https://avatars.githubusercontent.com/u/856628...,379
1,absolutevan,Arizona Center for Investigative Reporting,,https://avatars.githubusercontent.com/u/165300...,1
2,achavez,@DallasMorningNews,"Dallas, TX",https://avatars.githubusercontent.com/u/682828...,1
3,acoreynews,Arizona Republic,"Las Vegas, NV",https://avatars.githubusercontent.com/u/562616...,1
4,amandabee,@buzzfeed-openlab,"San Francisco, CA",https://avatars.githubusercontent.com/u/150791...,7
5,amzam,Texas Tribune,,https://avatars.githubusercontent.com/u/579612...,2
6,anabecker,Wall Street Journal,,https://avatars.githubusercontent.com/u/265281...,1
7,anabranch,Databricks,California,https://avatars.githubusercontent.com/u/164250...,47
8,anniedaniel,Texas Tribune,"Austin, TX",https://avatars.githubusercontent.com/u/512549...,1
9,annkiha,,,https://avatars.githubusercontent.com/u/113393...,1


In [20]:
# Write the per-contributor totals to the working directory. to_csv writes the
# DataFrame index by default, so the file starts with an unnamed index column.
unique_contributors.to_csv("./unique-contributors.csv")