Read raw events from our feather files:

In [1]:
import os
import numpy as np
import pandas as pd
from ruamel.yaml import YAML

# typ="safe" selects ruamel's safe loader for reading the config.
yaml = YAML(typ="safe")

# cfg.yaml is resolved relative to the current working directory; the parsed
# mapping is consulted later on (e.g. for the repo renames).
with open("cfg.yaml") as cfg_file:
    config = yaml.load(cfg_file)

In [4]:
from itertools import chain

def load_data():
    """Load every ``.feather`` file under the raw-data directories into one frame.

    Walks the three source directories recursively, reads each file whose name
    ends in ``.feather`` and concatenates the frames in traversal order.
    Sub-directories and files are visited in sorted order, so the row order of
    the result does not depend on the filesystem's listing order.

    Returns:
        pandas.DataFrame: concatenation of all frames that were read. The
        per-file index labels are kept as-is, so labels may repeat.

    Raises:
        ValueError: if no ``.feather`` file is found under any directory.
    """
    roots = (
        "data/id-type-created_at-repo.name-repo.url-actor.id-actor.login/",
        "data/old",
        "data/email",
    )
    dataframes = []
    for parent, dirs, files in chain.from_iterable(os.walk(root) for root in roots):
        # os.walk is top-down by default, so sorting `dirs` in place fixes the
        # order in which sub-directories are visited.
        dirs.sort()
        for fname in sorted(files):
            if fname.endswith(".feather"):
                dataframes.append(pd.read_feather(os.path.join(parent, fname)))
    if not dataframes:
        # pd.concat([]) would also raise ValueError, but with a generic
        # message; say which inputs were searched instead.
        raise ValueError(
            f"No .feather files found under any of: {', '.join(roots)}"
        )
    return pd.concat(dataframes)


# Read all raw events once; the cleaning cell below derives `df` from this frame.
raw_data = load_data()

Clean the data a bit:

Data in some older years has a different schema:

- repo_name is just `nbviewer` not `jupyter/nbviewer`
- only `repo_url` contains full org/name
- `repo_url` may be either `https://github.com/jupyter/nbviewer` or `https://api.github.com/repos/jupyter/nbviewer`
- older events may not have actor.id or event.id
- actor.id is float, despite being integer data
- event.id is str, despite being integer data
- some events appear to be double-reported

So:

- drop some duplicate events
- drop id field after removing duplicates
- make repo_name consistent across schema changes
- add org column from the first part of the repo_name
- track some repos across renames
- sort by date
- backfill missing actor_id
- fill still-missing actor_id with new unique values
- ignore bot-initiated events
- ignore fork/watch events

In [5]:
# Some events are reported multiple times in the raw data.
# De-duplicate on all columns: "id" alone is not usable as a key because it
# is undefined for old data.
raw_rows = len(raw_data)
df = raw_data.drop_duplicates()
without_dupes = len(df)
print(f"Dropped {raw_rows - without_dupes}/{raw_rows} duplicate events")
# drop the uninteresting event-id column after removing duplicates
df = df.drop(columns=["id"])

# drop bot events
known_bots = [
    "travisbot",
    "sourcegraphbot",
    "jupyterlab-bot",
    "npmcdn-to-unpkg-bot",
    "codetriage-readme-bot",
    "henchbot",
    "lektor-bot",
]
is_bot = df.actor_login.str.endswith("[bot]") | df.actor_login.isin(known_bots)
# Take an explicit copy of the filtered rows: the column assignments below
# would otherwise be made on a frame pandas tracks as a slice of another
# frame, which triggers SettingWithCopyWarning.
df = df[~is_bot].copy()
without_bots = len(df)

print(f"Dropped {without_dupes - without_bots} bot events")

# Rebuild repo_name from repo_url, stripping the host and the optional
# "repos/" prefix used by API-style URLs.
# expand=False returns a Series (not a one-column DataFrame) for the single
# capture group.
df["repo_name"] = df.repo_url.str.extract(
    r"https://[^/]+/(?:repos/)?(.+)", expand=False
)

# Apply repo renames one entry at a time, in config order, so a name produced
# by one rename can still be matched by a later entry.
for src, dest in config["renames"].items():
    df.loc[df.repo_name == src, "repo_name"] = dest

# add org column: the part of repo_name before the first "/"
df["org"] = df.repo_name.str.split("/", expand=True)[0]

# drop any repos not in our current config
# df = df[df.org.isin(config["orgs"])]

# sort by date
df = df.sort_values("created_at")

# cast integer id column to pandas' nullable integer dtype, which can hold
# the missing ids
df["actor_id"] = df["actor_id"].astype("Int64")

# more intuitive name for created_at
df["date"] = df["created_at"]
df = df.drop(columns=["repo_url", "created_at"])

Dropped 270619/900298 duplicate events
Dropped 5203 bot events


In [6]:
# Number of remaining events per event type
df["type"].value_counts()

IssueCommentEvent                235480
WatchEvent                        98047
PullRequestEvent                  63690
IssuesEvent                       56023
PushEvent                         53909
PullRequestReviewCommentEvent     35574
ForkEvent                         35061
EmailEvent                        25191
PullRequestCommentEvent            8600
CreateEvent                        4720
GollumEvent                        2409
DeleteEvent                        2178
IssueEvent                         1468
PullRequestReviewEvent              748
CommitCommentEvent                  617
MemberEvent                         465
ReleaseEvent                        269
PublicEvent                          16
TeamAddEvent                         11
Name: type, dtype: int64

Back-fill actor ids from events with matching login that do have an id

In [7]:
# Report how many events have no actor_id before any back-filling is done
print(f"{df.actor_id.isna().sum()}/{len(df)} events lacking actor id")

103306/624476 events lacking actor id


In [8]:
# Lookup of login -> actor_id, built only from rows that carry an id; for a
# login seen with several ids, the first one in row order is used.
login_id_map = (
    df.dropna(subset=["actor_id"]).groupby("actor_login")["actor_id"].first()
)

# Fill only the rows whose actor_id is missing; logins that are absent from
# the lookup stay missing.
lacks_id = df["actor_id"].isna()
df.loc[lacks_id, "actor_id"] = df.loc[lacks_id, "actor_login"].map(login_id_map)

In [9]:
# Report how many events are still without an actor_id after the back-fill
print(f"{df.actor_id.isna().sum()}/{len(df)} events still lacking actor id")

31965/624476 events still lacking actor id


Finally, drop the common but less interesting fork/watch events.
Do this after back-filling actor ids, since many of these events could be
a source of actor_login:actor_id mappings.

In [10]:
# Remember the row count so the report can show how many rows were removed.
before = len(df)
ignore_events = ["ForkEvent", "WatchEvent"]
# Copy the filtered rows: actor_id is assigned into this frame afterwards, and
# assigning into a frame pandas tracks as a slice of another frame triggers
# SettingWithCopyWarning.
df = df[~df.type.isin(ignore_events)].copy()

print(f"Dropped {before-len(df)}/{before} fork/watch events")
print(f"{df.actor_id.isna().sum()}/{len(df)} events still lacking actor id")

Dropped 133108/624476 fork/watch events
26352/491368 events still lacking actor id


Find actor_logins still without any actor_id and assign them new, unique ids,
using a counter starting just above the max existing actor_id.

In [11]:
# Rows that still have no actor_id, and the distinct logins behind them.
actor_id_missing = df.actor_id.isna()
logins_without_id = df.actor_login[actor_id_missing].unique()
max_actor_id = df.actor_id.dropna().max()

# One consecutive synthetic id per login, starting just above the largest id
# present in the data so the new ids cannot collide with an existing one.
new_actor_ids = np.arange(max_actor_id + 1, max_actor_id + 1 + len(logins_without_id))

new_actor_id_map = pd.Series(new_actor_ids, index=logins_without_id)
df.loc[actor_id_missing, "actor_id"] = df.actor_login[actor_id_missing].map(new_actor_id_map)

# Later analysis keys on actor_id, so fail loudly if any row is still
# without one instead of silently computing and discarding the count.
still_missing = df.actor_id.isna().sum()
assert still_missing == 0, f"{still_missing} events still lack an actor_id"
print(f"Filled out remaining {actor_id_missing.sum()} actor_ids")

Filled out remaining 26352 actor_ids


Now we have actor ids for everyone and can use actor_id for all analysis
instead of the non-unique actor_login (accounts can be renamed)

We may have a small number of double-counts for some of the 6000 logins without actor id who may have renamed
to a later login with an id.

We also don't try to map EmailEvents onto their GitHub counterparts, so active users on the mailing list (ML) may be double-counted.

In [12]:
# Count the distinct logins recorded for each actor_id.
groupby_login = df.dropna(subset=["actor_id"]).groupby("actor_id")["actor_login"]
login_counts = groupby_login.nunique()
# Keep only the ids that appear under more than two different logins.
multiple_logins = login_counts.loc[login_counts.gt(2)]

# Show the logins recorded for each of those ids.
has_many_logins = df["actor_id"].isin(multiple_logins.index)
df.loc[has_many_logins].groupby("actor_id")["actor_login"].unique()

actor_id
890156          [jankatins, JanSchulz, janschulz]
5635139      [MaximilianR, maxim-lian, max-sixty]
10365377    [nottaanibot, thethomask, nymoorland]
26246495      [NicolaiRiis, nicolairiis, nabriis]
28781481            [soodooo, CandleSense, fsksf]
Name: actor_login, dtype: object

Now we can save the post-processed data for analysis

In [15]:
# drop=True discards the old row labels instead of keeping them as a column;
# the renumbered frame is what gets written to disk.
processed = df.reset_index(drop=True)
processed.to_feather("data/processed.feather")