## Import

In [1]:
import os
import json
from pathlib import Path

from github import Github
from github import UnknownObjectException

import theme
import altair as alt
import pandas as pd

import requests
from markdown import markdown
from bs4 import BeautifulSoup

In [2]:
alt.themes.register('palewire', theme.theme)
alt.themes.enable('palewire')

ThemeRegistry.enable('palewire')

In [3]:
alt.data_transformers.disable_max_rows()

DataTransformerRegistry.enable('default')

## Download

Get a list of repositories from [@NewsNerdRepos](https://github.com/silva-shih/open-journalism)

In [4]:
r = requests.get("https://raw.githubusercontent.com/silva-shih/open-journalism/master/README.md")

In [5]:
m = markdown(r.text)

In [6]:
soup = BeautifulSoup(m, 'html.parser')

In [7]:
link_list = [a for a in soup.find_all("a") if 'github' in a['href']]

In [8]:
org_list = sorted([a['href'].split("github.com")[-1].replace("/", "").strip().lower() for a in link_list])

Add newer organizations missing from the list

In [9]:
extra_orgs = [
    "biglocalnews",
    "muckrock",
    "reuters-graphics",
    "the-markup",
    "usafacts",
    "teamtrace",
    "howard-center-investigations",
    "data-liberation-project",
    "the-baltimore-banner",
    "thecityny",
    "gatehousemedia",
    "worldcompany",
    "theeconomist",
    "lenfestlab",
    "srfdata",
]

In [10]:
org_list += extra_orgs

Write them out

In [11]:
df = pd.DataFrame(org_list).rename(columns={0: "org"})

In [12]:
df.to_csv("./orgs.csv", index=False)

Connect to GitHub API

In [13]:
g = Github(os.getenv("GITHUB_API_TOKEN"))

Download all the public repositories in each account

In [14]:
def get_repos(org, force: bool = False):
    # Skip it if we already have the file
    data_path = Path("./api") / f"{org}.json"
    if data_path.exists() and not force:
        return json.load(open(data_path, 'r'))
    
    # Try to download an org
    print(f"Downloading {org}")
    try:
        repo_list = g.get_organization(org).get_repos()
    except UnknownObjectException:
        try:
            # Try to download a user
            repo_list = g.get_user(org).get_repos()
        except UnknownObjectException:
            # Give up
            return []

    # Parse out each repo and when it was created    
    d_list = []
    for r in repo_list:
        d = dict(
            org=org,
            name=r.name,
            created_at=str(r.created_at)
        )
        d_list.append(d)

    # Write it out
    with open(data_path, "w") as fp:
        json.dump(d_list, fp, indent=2)
    
    # Return the data
    return d_list

In [15]:
data_list = []
for org in org_list:
    data_list += get_repos(org)

Downloading berlinermorgenpost
Downloading dailyherald
Downloading pjstar
Downloading sacbee
Downloading showcasesopen-journalism
Downloading virginianpilot


## Transform

Convert to a dataframe

In [16]:
df = pd.DataFrame(data_list)

Parse the dates

In [17]:
df.created_at = pd.to_datetime(df.created_at)

Take a look

In [18]:
df.head()

Unnamed: 0,org,name,created_at
0,abcnews,scriptroot,2014-11-21 01:06:49
1,abcnews,jquery-avoidOrphans,2015-02-05 01:31:15
2,abcnews,abcnews.github.io,2015-02-05 01:45:52
3,abcnews,jquery-accessibleSpinner,2015-02-05 03:26:06
4,abcnews,fetchamd,2015-04-27 06:14:38


In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15538 entries, 0 to 15537
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   org         15538 non-null  object        
 1   name        15538 non-null  object        
 2   created_at  15538 non-null  datetime64[ns]
dtypes: datetime64[ns](1), object(2)
memory usage: 364.3+ KB


Write it out

In [20]:
df.to_csv("repos.csv", index=False)

## Explore

Chart totals by month

In [21]:
alt.Chart(df).mark_bar(color="#aaa").encode(
    x=alt.X("created_at:T", timeUnit="yearmonth", title=None, axis=alt.Axis(format='%Y', grid=False)),
    y=alt.Y("count():Q", title="New open-source repos by month"),
).properties(title="News organizations on GitHub", width=400, height=300)

  for col_name, dtype in df.dtypes.iteritems():


In [25]:
created_by_month = df.set_index("created_at").groupby(pd.Grouper(freq="M")).size().rename("new_repos").reset_index()

In [26]:
created_by_month.to_csv("months.csv", index=False)