## Import

In [2]:
import os
import json
from pathlib import Path

from github import Github
from github import UnknownObjectException

import altair as alt
import pandas as pd

import requests
from markdown import markdown
from bs4 import BeautifulSoup

from rich import print

In [3]:
# Turn off Altair's row limit so charts can be built from the full repo table
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Download

Get a list of news organization GitHub accounts from [@NewsNerdRepos](https://github.com/silva-shih/open-journalism)

In [4]:
# Fetch the raw README. The timeout stops a stalled connection from hanging the
# notebook, and raise_for_status() stops an HTTP error page from being parsed
# as if it were the list of organizations.
r = requests.get(
    "https://raw.githubusercontent.com/silva-shih/open-journalism/master/README.md",
    timeout=30,
)
r.raise_for_status()

In [5]:
# Render the README's Markdown source to HTML so its links can be parsed
m = markdown(r.text)

In [6]:
# Parse the rendered HTML with Python's built-in parser
soup = BeautifulSoup(markup=m, features="html.parser")

In [7]:
# Keep only anchors that point at GitHub. href=True skips anchors without an
# href attribute (e.g. named anchors), which would otherwise raise KeyError.
link_list = [a for a in soup.find_all("a", href=True) if 'github' in a['href']]

In [8]:
# Reduce each link to its account name: the first path segment after
# "github.com", lowercased. Removing every "/" instead would glue owner and
# repository together for repo-level links ("/org/repo" -> "orgrepo").
org_list = [
    a['href'].split("github.com")[-1].strip().strip("/").split("/")[0].lower()
    for a in link_list
]

Add newer organizations missing from the list

In [9]:
# Accounts to add by hand, one per line; split() turns the block into a list
extra_orgs = """
biglocalnews
muckrock
reuters-graphics
the-markup
usafacts
teamtrace
howard-center-investigations
data-liberation-project
the-baltimore-banner
thecityny
gatehousemedia
worldcompany
theeconomist
lenfestlab
srfdata
""".split()

In [10]:
# Append the hand-picked accounts to the scraped list (in place)
org_list.extend(extra_orgs)

Write them out

In [11]:
# One row per unique account name, sorted alphabetically
org_df = (
    pd.DataFrame(org_list)
    .rename(columns={0: "org"})
    .drop_duplicates()
    .sort_values("org")
)

In [12]:
# Create the output folder if needed so the notebook runs on a fresh checkout,
# then save the account list without the row index.
Path("./output").mkdir(parents=True, exist_ok=True)
org_df.to_csv("./output/orgs.csv", index=False)

Connect to the GitHub API

In [63]:
# Read the personal access token from the GITHUB_API_TOKEN environment variable
# instead of embedding it in the notebook.
# NOTE(security): a token was previously committed on this line in plain text;
# it should be treated as compromised and revoked.
g = Github(os.getenv("GITHUB_API_TOKEN"))

Download all the public repositories in each account

In [66]:
def get_repos(org, force: bool = False):
    """Return repository metadata for one GitHub account, using a JSON cache.

    The result is cached at ``./input/{org}.json``. When that file exists and
    ``force`` is false, it is loaded and returned without calling the API.
    Otherwise the account is looked up first as an organization and then as a
    user via the module-level client ``g``.

    Args:
        org: GitHub account name (organization or user login).
        force: When True, ignore any cached file and download again.

    Returns:
        A list of dicts, one per repository, with the keys ``org``, ``name``,
        ``full_name``, ``created_at`` (string), ``fork``, ``stargazers_count``,
        ``forks_count`` and ``language``. An empty list is returned, and
        nothing is cached, when the name matches neither an organization nor
        a user.
    """
    # Skip the download if we already have the file
    data_path = Path("./input") / f"{org}.json"
    if data_path.exists() and not force:
        # Context manager so the cache file handle is closed right away
        with open(data_path, 'r') as fp:
            return json.load(fp)

    # Try to download an org
    print(f"Downloading {org}")
    try:
        repo_list = g.get_organization(org).get_repos()
    except UnknownObjectException:
        try:
            # Try to download a user
            repo_list = g.get_user(org).get_repos()
        except UnknownObjectException:
            # Give up
            return []

    # Parse out each repo and when it was created.
    # created_at is stored as a string so the record is JSON-serializable.
    d_list = [
        dict(
            org=org,
            name=repo.name,
            full_name=repo.full_name,
            created_at=str(repo.created_at),
            fork=repo.fork,
            stargazers_count=repo.stargazers_count,
            forks_count=repo.forks_count,
            language=repo.language,
        )
        for repo in repo_list
    ]

    # Write it out, creating the cache folder on first use
    data_path.parent.mkdir(parents=True, exist_ok=True)
    with open(data_path, "w") as fp:
        json.dump(d_list, fp, indent=2)

    # Return the data
    return d_list

In [69]:
# Gather the repo records for every account into one flat list
repo_list = []
for org in org_df["org"].tolist():
    repo_list.extend(get_repos(org, force=False))

## Transform

Convert to a dataframe

In [70]:
# One row per repository dict returned by get_repos
repo_df = pd.DataFrame(repo_list)

Parse the dates

In [71]:
# created_at was stored as text; convert it to real timestamps
repo_df["created_at"] = pd.to_datetime(repo_df["created_at"])

Remove forks

In [72]:
# How many repos are forks versus originals?
repo_df["fork"].value_counts()

False    12399
True      3139
Name: fork, dtype: int64

In [73]:
# Keep only the rows where fork is False; copy so later column assignments are safe
repo_df = repo_df.loc[~repo_df["fork"]].copy()

Take a look

In [74]:
# Preview the first five rows
repo_df.head(n=5)

Unnamed: 0,org,name,full_name,created_at,fork,stargazers_count,forks_count,language
0,abcnews,scriptroot,abcnews/scriptroot,2014-11-21 01:06:49,False,1,1,JavaScript
1,abcnews,jquery-avoidOrphans,abcnews/jquery-avoidOrphans,2015-02-05 01:31:15,False,3,2,JavaScript
2,abcnews,abcnews.github.io,abcnews/abcnews.github.io,2015-02-05 01:45:52,False,1,2,HTML
3,abcnews,jquery-accessibleSpinner,abcnews/jquery-accessibleSpinner,2015-02-05 03:26:06,False,1,2,JavaScript
4,abcnews,fetchamd,abcnews/fetchamd,2015-04-27 06:14:38,False,3,1,JavaScript


In [75]:
# Check column dtypes and non-null counts after filtering
repo_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 12399 entries, 0 to 15536
Data columns (total 8 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   org               12399 non-null  object        
 1   name              12399 non-null  object        
 2   full_name         12399 non-null  object        
 3   created_at        12399 non-null  datetime64[ns]
 4   fork              12399 non-null  bool          
 5   stargazers_count  12399 non-null  int64         
 6   forks_count       12399 non-null  int64         
 7   language          10915 non-null  object        
dtypes: bool(1), datetime64[ns](1), int64(2), object(4)
memory usage: 787.0+ KB


Write it out

In [76]:
# Save the fork-free repo table; index=False leaves the row index out of the CSV
repo_df.to_csv("output/repos.csv", index=False)

## Explore

Chart totals by month

In [77]:
# Bar chart of how many repos were created in each calendar month
month_axis = alt.X(
    "created_at:T",
    timeUnit="yearmonth",
    title="Month",
    axis=alt.Axis(format='%Y', grid=False),
)
repo_count = alt.Y("count():Q", title="New open-source repos")

alt.Chart(repo_df, title="News organizations on GitHub").mark_bar().encode(
    x=month_axis,
    y=repo_count,
)

  for col_name, dtype in df.dtypes.iteritems():


In [78]:
# Count new repos per month-end bucket
created_by_month = (
    repo_df
    .set_index("created_at")
    .groupby(pd.Grouper(freq="M"))
    .size()
    .rename("new_repos")
    .reset_index()
)

In [79]:
# Save the monthly counts of newly created repos
created_by_month.to_csv("output/months.csv", index=False)

In [80]:
# Pull the calendar year out of each creation timestamp
repo_df["created_year"] = repo_df["created_at"].dt.year

Rank organizations by total

In [81]:
# Long table: one row per (org, year) with the number of repos created
orgs_by_year = (
    repo_df
    .groupby(["org", "created_year"])
    .size()
    .rename("new_repos")
    .reset_index()
)

In [82]:
# Wide table: orgs as rows, years as columns; missing combinations become 0
orgs_by_year_pivot = (
    orgs_by_year
    .pivot(index="org", columns="created_year", values="new_repos")
    .fillna(0)
)

In [83]:
# Sum only the year columns, so re-running this cell does not add an existing
# "total" column back into the new total.
year_columns = [c for c in orgs_by_year_pivot.columns if c != "total"]
orgs_by_year_pivot['total'] = orgs_by_year_pivot[year_columns].sum(axis=1)

In [84]:
# Ten orgs with the most repos overall
(
    orgs_by_year_pivot
    .sort_values(by="total", ascending=False)
    .head(n=10)
)

created_year,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,total
org,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
guardian,0.0,0.0,8.0,15.0,29.0,65.0,130.0,191.0,219.0,238.0,210.0,202.0,190.0,136.0,129.0,2.0,1764.0
financial-times,0.0,0.0,0.0,0.0,0.0,10.0,45.0,128.0,284.0,188.0,105.0,58.0,39.0,19.0,14.0,1.0,891.0
seattletimes,0.0,0.0,0.0,0.0,1.0,0.0,23.0,88.0,90.0,61.0,66.0,50.0,50.0,19.0,10.0,0.0,458.0
bbc-data-unit,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,60.0,55.0,71.0,49.0,8.0,7.0,1.0,286.0
ft-interactive,0.0,0.0,0.0,0.0,5.0,11.0,12.0,17.0,82.0,88.0,9.0,6.0,1.0,0.0,0.0,0.0,231.0
sunlightlabs,2.0,11.0,27.0,31.0,35.0,24.0,29.0,26.0,37.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,223.0
striblab,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,53.0,60.0,52.0,36.0,4.0,4.0,0.0,209.0
nprapps,0.0,0.0,0.0,0.0,12.0,41.0,24.0,27.0,25.0,41.0,11.0,9.0,6.0,8.0,1.0,0.0,205.0
abcnews,0.0,0.0,0.0,0.0,0.0,0.0,1.0,7.0,8.0,53.0,25.0,18.0,28.0,14.0,27.0,1.0,182.0
sfchronicle,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.0,49.0,79.0,20.0,3.0,1.0,2.0,9.0,0.0,175.0


In [85]:
# Keep the index in the CSV: it holds the org names
orgs_by_year_pivot.to_csv("./output/orgs-by-year.csv", index=True)

Exclude the top two and see if the chart comes out the same

In [86]:
# Same monthly chart, leaving out the two orgs with the most repos
top_two = ['guardian', 'financial-times']
without_top_two = repo_df[~repo_df.org.isin(top_two)]

alt.Chart(without_top_two, title="News organizations on GitHub").mark_bar().encode(
    x=alt.X(
        "created_at:T",
        timeUnit="yearmonth",
        title="Month",
        axis=alt.Axis(format='%Y', grid=False),
    ),
    y=alt.Y("count():Q", title="New open-source repos"),
)

  for col_name, dtype in df.dtypes.iteritems():


Rank the top repos

In [89]:
# Ten most-starred repos, keyed by full name
(
    repo_df
    .sort_values("stargazers_count", ascending=False)
    .head(10)
    [['full_name', 'created_year', 'stargazers_count']]
    .set_index("full_name")
)

Unnamed: 0_level_0,created_year,stargazers_count
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1
ftlabs/fastclick,2012,18777
fivethirtyeight/data,2014,15992
bloomberg/memray,2022,9919
NUKnightLab/TimelineJS,2012,8825
nytimes/covid-19-data,2020,6919
newsapps/beeswithmachineguns,2010,6279
Financial-Times/polyfill-service,2014,5945
nytimes/objective-c-style-guide,2013,5785
guardian/frontend,2012,5660
wireservice/csvkit,2011,5292


In [90]:
# Ten most-forked repos, keyed by full name
(
    repo_df
    .sort_values("forks_count", ascending=False)
    .head(10)
    [['full_name', 'created_year', 'forks_count']]
    .set_index("full_name")
)

Unnamed: 0_level_0,created_year,forks_count
full_name,Unnamed: 1_level_1,Unnamed: 2_level_1
fivethirtyeight/data,2014,10864
nytimes/covid-19-data,2020,3438
ftlabs/fastclick,2012,3313
NUKnightLab/TimelineJS,2012,1601
nytimes/objective-c-style-guide,2013,1314
freedomofpress/securedrop,2012,681
newsapps/beeswithmachineguns,2010,663
wireservice/csvkit,2011,581
NUKnightLab/TimelineJS3,2014,577
guardian/frontend,2012,563


Rank the top languages

In [91]:
# Long table: one row per (language, year) with the number of repos created
languages_by_year = (
    repo_df
    .groupby(["language", "created_year"])
    .size()
    .rename("new_repos")
    .reset_index()
)

In [93]:
# Wide table: languages as rows, years as columns; missing combinations become 0
languages_by_year_pivot = (
    languages_by_year
    .pivot(index="language", columns="created_year", values="new_repos")
    .fillna(0)
)

In [94]:
# Sum only the year columns, so re-running this cell does not add an existing
# "total" column back into the new total.
year_columns = [c for c in languages_by_year_pivot.columns if c != "total"]
languages_by_year_pivot['total'] = languages_by_year_pivot[year_columns].sum(axis=1)

In [96]:
# Ten languages with the most repos overall
(
    languages_by_year_pivot
    .sort_values(by="total", ascending=False)
    .head(n=10)
)

created_year,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023,total
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
JavaScript,0.0,1.0,16.0,32.0,77.0,201.0,460.0,702.0,747.0,710.0,425.0,280.0,237.0,109.0,75.0,2.0,4074.0
HTML,0.0,0.0,5.0,2.0,16.0,26.0,62.0,223.0,294.0,360.0,252.0,159.0,152.0,90.0,96.0,2.0,1739.0
Python,2.0,9.0,24.0,56.0,83.0,114.0,159.0,229.0,170.0,171.0,114.0,119.0,67.0,55.0,61.0,1.0,1434.0
CSS,0.0,0.0,0.0,0.0,8.0,39.0,93.0,109.0,124.0,92.0,37.0,24.0,13.0,3.0,3.0,0.0,545.0
Go,0.0,0.0,0.0,0.0,0.0,2.0,9.0,52.0,120.0,77.0,32.0,17.0,16.0,16.0,13.0,1.0,355.0
PHP,0.0,0.0,20.0,19.0,15.0,15.0,38.0,48.0,53.0,53.0,45.0,16.0,11.0,7.0,7.0,0.0,347.0
Jupyter Notebook,0.0,0.0,0.0,0.0,0.0,1.0,4.0,18.0,47.0,48.0,65.0,59.0,35.0,21.0,41.0,0.0,339.0
Scala,0.0,0.0,4.0,11.0,12.0,15.0,34.0,35.0,48.0,71.0,36.0,31.0,17.0,10.0,14.0,1.0,339.0
Ruby,0.0,11.0,10.0,15.0,24.0,36.0,39.0,56.0,40.0,23.0,20.0,5.0,6.0,4.0,3.0,0.0,292.0
TypeScript,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,11.0,33.0,37.0,42.0,73.0,42.0,49.0,0.0,288.0


In [97]:
# Keep the index in the CSV: it holds the language names
languages_by_year_pivot.to_csv("./output/languages-by-year.csv", index=True)