# Open Source News Analysis

By Ben Welsh and Scott Klein

This notebook reproduces the findings presented at the News Product Alliance conference on Oct. 24, 2025. It analyzes the decline in new open source news projects over the past decade, using data on public GitHub repositories published by news organizations.

## Import Python and data

In [1]:
from pathlib import Path

import pandas as pd
import altair as alt
from rich import print

In [2]:
# Switch Altair's data transformer to VegaFusion so charts can be built from large datasets.
# NOTE(review): presumably needed because org_repos_df exceeds Altair's default row limit; confirm.
alt.data_transformers.enable("vegafusion")

DataTransformerRegistry.enable('vegafusion')

In [3]:
# Load the repository table. All three timestamp columns are parsed up front so
# that later comparisons (for example, finding the latest updated_at per org)
# work on real datetimes instead of raw strings.
org_repos_df = pd.read_csv(
    Path(".") / "data" / "transformed" / "org-repos.csv",
    parse_dates=["created_at", "updated_at", "pushed_at"],
)

In [4]:
# Peek at the first row to check the available columns and how the values were loaded
org_repos_df.head(1)

Unnamed: 0,org,name,full_name,homepage,description,language,created_at,updated_at,pushed_at,stargazers_count,watchers_count,forks_count,open_issues_count,license,topics
0,abcnews,.github,abcnews/.github,,ABC public Github profile,,2023-01-31 03:49:26+00:00,2023-01-31 03:49:26+00:00,2023-02-02 03:37:20+00:00,0,0,0,0,,[]


## Count the organizations and repositories

In [5]:
# The row count is reported as the number of repos, i.e. each row is treated as one repository
print(f"{len(org_repos_df):,} public repos")

In [6]:
# Count the distinct values in the org column
print(f"Released by {org_repos_df.org.nunique()} news organizations")

In [7]:
# Report the first and last calendar years in which a repository was created
print(
    f"From {org_repos_df['created_at'].min().year} to {org_repos_df['created_at'].max().year}."
)

## Analyze the trend over time

In [8]:
# Bar chart of how many repositories were created in each calendar year.
# Altair does the yearly binning and counting itself via the encoding shorthands.
alt.Chart(org_repos_df).mark_bar().encode(
    alt.X("year(created_at):O", title=None),
    alt.Y("count():Q", title=None),
    tooltip=[
        alt.Tooltip("year(created_at):O", title="Year"),
        alt.Tooltip("count():Q", title="New repositories"),
    ],
).properties(title="New repositories by year", width=600, height=400)

In [9]:
# Tally the repositories created in each calendar year.
# The result has one row per year with columns created_year and repo_count.
repos_by_year = (
    org_repos_df.groupby(org_repos_df["created_at"].dt.year.rename("created_year"))
    .size()
    .reset_index(name="repo_count")
)

In [10]:
# Despite the names, both variables hold repo COUNTS, not calendar years:
# peak_year is the repo count of the busiest year in repos_by_year...
peak_year = repos_by_year.at[repos_by_year["repo_count"].idxmax(), "repo_count"]
# ...and last_year is the repo count for 2024 (raises IndexError if 2024 is absent).
last_year = repos_by_year.loc[
    repos_by_year["created_year"].eq(2024), "repo_count"
].iloc[0]

In [11]:
# peak_year and last_year hold repo counts (not calendar years), so this is the
# relative change in new repos; abs() reports the drop as a positive percentage.
print(
    f"{abs((last_year - peak_year) / peak_year * 100):.0f}% decline in new repos from the peak year to last year."
)

## Create a crosstab of organizations and repositories

In [12]:
# Count the number of new repos each org created in each year.
# Note: this adds a "year" column to org_repos_df in place.
org_repos_df["year"] = org_repos_df["created_at"].dt.year
# Long format: one row per (org, year) pair that has at least one repo
org_counts = org_repos_df.groupby(["org", "year"]).size().reset_index(name="count")

In [13]:
# Pivot that to be a wide format: one row per org (the index), one column per year.
# Org/year combinations with no repos are missing after the pivot and are filled with 0.
org_pivot = org_counts.pivot(index="org", columns="year", values="count").fillna(0)

In [14]:
# Calculate the total number of repos for each org.
# Only the year columns (the ones with numeric labels) are summed. Summing the
# whole frame would fold "total" itself, and any other derived column, back into
# the sum whenever this cell is re-run or run after later cells.
year_columns = [col for col in org_pivot.columns if pd.api.types.is_number(col)]
org_pivot["total"] = org_pivot[year_columns].sum(axis=1)

In [15]:
# Calculate the change from 2016, the peak year, to 2024, the most recent year.
# The value is stored as a fraction of the 2016 count (-0.5 means a 50% drop);
# it is not multiplied by 100.
# Orgs with no repos in 2016 have no baseline to compare against, so the
# denominator is masked to NaN for them. The result is then a missing value
# rather than the infinity that dividing by zero would write to the output.
org_pivot["percent_change_16to24"] = (org_pivot[2024] - org_pivot[2016]).div(
    org_pivot[2016].where(org_pivot[2016] != 0)
)

In [16]:
# Calculate some annual averages.
# .loc label slices include both endpoints, so these cover the columns from
# 2014 through 2023 and from 2013 through 2017 as they are ordered in the frame.
# NOTE(review): assumes every year in each range exists as a column and that the
# year columns are contiguous and in ascending order; confirm against the pivot.
org_pivot["annual_avg_14to23"] = org_pivot.loc[:, 2014:2023].mean(axis=1)
org_pivot["annual_avg_13to17"] = org_pivot.loc[:, 2013:2017].mean(axis=1)

In [17]:
# Go back to org_repos_df and find the most recent updated_at value for each org.
# The result has one row per org with columns org and latest_update.
latest_update = org_repos_df.groupby("org", as_index=False).agg(
    latest_update=("updated_at", "max")
)

In [18]:
# Merge that on to the pivot. "org" is the pivot's index level and a regular
# column in latest_update; how="left" keeps every org that is in the pivot.
# This rebinds org_pivot, so re-running the cell merges latest_update in again.
org_pivot = org_pivot.merge(latest_update, on="org", how="left")

In [19]:
# Write out the data. index=False leaves the row index out of the CSV.
# NOTE(review): assumes "org" is a regular column (not the index) after the
# merge; confirm it appears in the written file.
org_pivot.to_csv(Path(".") / "data" / "transformed" / "org-activity.csv", index=False)