In [2]:
import datetime
from dotenv import load_dotenv
import os
import pandas as pd
from githubkit import GitHub
from pyoso import Client

# Credentials are read from the process environment before any client is built.
# NOTE(review): load_dotenv() presumably populates os.environ from a local
# .env file — confirm that file exists wherever this notebook is run.
load_dotenv()
# os.environ[...] raises KeyError when the variable is absent, so a missing
# key fails in this cell rather than on the first API call.
OSO_API_KEY = os.environ['OSO_API_KEY']
# `client` is the pyoso handle used by the warehouse queries in later cells.
client = Client(api_key=OSO_API_KEY)

GITHUB_API_KEY = os.environ['GITHUB_API_KEY']
# `github` is the githubkit handle used later to list commits from GitHub.
github = GitHub(GITHUB_API_KEY)

# Check OSO data (from GitHub Archive)

In [4]:
# COMMIT_CODE events whose target artifact is rotki/rotki, returning the event
# time and the GitHub username joined from int_github_users. Both time bounds
# are midnight timestamps, so the window ends at the start of 2025-01-31.
# NOTE(review): if an artifact can appear under more than one project in
# artifacts_by_project_v1, this join would repeat each event once per
# project — confirm against the table before trusting raw row counts.
OSO_COMMIT_EVENTS_SQL = """
SELECT
    e.time,
    u.github_username
FROM int_events_to_project__github AS e
JOIN artifacts_by_project_v1 abp ON abp.artifact_id = e.to_artifact_id
JOIN int_github_users u ON u.user_id = e.from_artifact_id
WHERE
    abp.artifact_namespace = 'rotki'
    AND abp.artifact_name = 'rotki'
    AND event_type = 'COMMIT_CODE'
    AND e.time >= TIMESTAMP '2023-08-01'
    AND e.time <= TIMESTAMP '2025-01-31'
"""

df_oso = client.to_pandas(OSO_COMMIT_EVENTS_SQL)

# Peek at the last rows to sanity-check the result.
df_oso.tail()

Unnamed: 0,time,github_username
11827,2024-09-17 06:24:20.000,lukicenturi
11828,2024-04-19 10:21:36.000,lukicenturi
11829,2024-04-19 10:21:36.000,lukicenturi
11830,2024-04-19 10:21:36.000,lukicenturi
11831,2024-04-19 10:21:36.000,lukicenturi


In [60]:
# Number of distinct event timestamps per GitHub username, smallest first.
(
    df_oso
    .groupby('github_username')['time']
    .nunique()
    .sort_values()
)

github_username
nicholasyoder         1
tewshi                1
OjusWiZard            2
prettyirrelevant      2
cleanerzkp            3
yabirgb             118
lukicenturi         192
kelsos              579
LefterisJP          846
Name: time, dtype: int64

In [5]:
# Every column of stg_github__commits for repository rotki/rotki, limited to
# rows whose created_at falls between the two midnight timestamps below.
STG_COMMITS_SQL = """
SELECT *
FROM stg_github__commits
WHERE
    repository_name = 'rotki/rotki'
    AND created_at >= TIMESTAMP '2023-08-01'
    AND created_at <= TIMESTAMP '2025-01-31'
"""

df_commits = client.to_pandas(STG_COMMITS_SQL)

# Peek at the last rows to sanity-check the result.
df_commits.tail()

Unnamed: 0,created_at,repository_id,repository_name,push_id,ref,actor_id,actor_login,sha,author_email,author_name,is_distinct,api_url
5465,2023-10-23 18:42:18.000,123909654,rotki/rotki,15533365471,refs/heads/develop,2269732,kelsos,b95746d6bea78e0098da92a00c79b8be74312714,895381bfd7ef6bc2d7782122f7f071712cd4460f@gmail...,Konstantinos Paparas,True,https://api.github.com/repos/rotki/rotki/commi...
5466,2023-10-23 20:09:28.000,123909654,rotki/rotki,15534385732,refs/heads/develop,1658405,LefterisJP,efb37893b6a7456520243aea8f86a8791dcb036f,2cf27834f29876756eeaa6141c0926d6523380ac@proto...,Yabir Benchakhtir,True,https://api.github.com/repos/rotki/rotki/commi...
5467,2023-10-23 17:38:56.000,123909654,rotki/rotki,15532595277,refs/heads/develop,2269732,kelsos,57a0cc790b538bd3c780b26b564740ab5a056300,65bffb90fa1ff9b0e6a7199590f2d2923258d557@gmail...,lukicenturi,True,https://api.github.com/repos/rotki/rotki/commi...
5468,2023-10-23 17:38:56.000,123909654,rotki/rotki,15532595277,refs/heads/develop,2269732,kelsos,a8555de6733e95fca43768cdae95bf7cb2f00956,65bffb90fa1ff9b0e6a7199590f2d2923258d557@gmail...,lukicenturi,True,https://api.github.com/repos/rotki/rotki/commi...
5469,2023-10-23 17:38:56.000,123909654,rotki/rotki,15532595277,refs/heads/develop,2269732,kelsos,b8c82e9d7b1cc1b04e93dd26a1b1663a533997eb,895381bfd7ef6bc2d7782122f7f071712cd4460f@gmail...,Konstantinos Paparas,True,https://api.github.com/repos/rotki/rotki/commi...


In [63]:
# Number of distinct created_at timestamps per (actor_id, actor_login) pair,
# smallest first.
(
    df_commits
    .groupby(['actor_id', 'actor_login'])['created_at']
    .nunique()
    .sort_values()
)

actor_id   actor_login     
49699333   dependabot[bot]        1
145849440  nicholasyoder          1
55619686   OjusWiZard             2
72208758   prettyirrelevant       3
5625068    tewshi                 5
104455005  cleanerzkp            11
5068010    yabirgb              216
26648140   lukicenturi          250
2269732    kelsos               762
1658405    LefterisJP          1211
Name: created_at, dtype: int64

In [64]:
# Same count keyed by the commit's author_name instead of the pushing actor:
# distinct created_at timestamps per author_name, smallest first.
(
    df_commits
    .groupby('author_name')['created_at']
    .nunique()
    .sort_values()
)

author_name
NetScr1be                 1
Kirk                      1
Kenny Rachuonyo           1
Prajjwal Yadav            1
Rafael Matias             1
                       ... 
Luki Centuri            308
lukicenturi             366
Yabir Benchakhtir       501
Konstantinos Paparas    534
Lefteris Karapetsas     652
Name: created_at, Length: 72, dtype: int64

# Check data directly from GitHub

In [6]:
# Collect everything github.paginate yields for the rotki/rotki commit listing
# into an in-memory list. No date filter is passed here; the date window is
# applied afterwards, when the per-commit records are built.
# NOTE(review): the REST "list commits" endpoint accepts since/until filters
# that could avoid downloading the full history — confirm which commit date
# they filter on before adopting them.
all_commits = [
    commit
    for commit in github.paginate(
        github.rest.repos.list_commits,
        owner="rotki",
        repo="rotki",
    )
]

In [65]:
def _login_of(user):
    """Return the lower-cased ``login`` of a GitHub user object, or ``None``.

    ``user`` may be ``None`` or an object without a usable ``login``
    attribute (a commit whose author/committer is not linked to a GitHub
    account); both cases yield ``None`` instead of raising AttributeError.
    """
    login = getattr(user, 'login', None)
    return login.lower() if login else None


# Flatten the API commit objects into plain dicts with the keys
# date / author / committer / sha, keeping only commits inside the window.
data = []
for commit in all_commits:
    author = _login_of(commit.author)
    if author is None:
        # Commits whose author has no GitHub login are left out.
        continue

    # The nested git author can be absent; read the date defensively.
    date = getattr(commit.commit.author, 'date', None)
    if not date:
        continue

    # Plain string comparison: the dates shown in this notebook are ISO-8601
    # strings such as '2023-08-02T09:25:58Z', for which lexical order equals
    # chronological order. '2025-01-31T...' sorts after '2025-01-31', so the
    # window ends at the start of that day, like the TIMESTAMP '2025-01-31'
    # bound used in the SQL cells.
    if '2023-08-01' <= date <= '2025-01-31':
        data.append({
            "date": date,
            "author": author,
            # None when the committer is not linked to a GitHub account.
            "committer": _login_of(commit.committer),
            "sha": commit.sha
        })

In [66]:
# Build the frame with an explicit column list so that an empty `data` still
# produces the date / author / committer / sha columns that the later cells
# index by name. For non-empty `data` the result is the same frame, because
# every record carries exactly these keys in this order.
df_all_commits = pd.DataFrame(
    data,
    columns=["date", "author", "committer", "sha"],
)

# Peek at the last rows to sanity-check the result.
df_all_commits.tail()

Unnamed: 0,date,author,committer,sha
3255,2023-08-02T09:25:58Z,lefterisjp,lefterisjp,b8e660a88215a54e6430f1243c1b1eb35d166130
3256,2023-08-01T15:26:47Z,lefterisjp,lefterisjp,52f3cc129718c76c291b6f19a0f6b738740b0d15
3257,2023-08-01T09:08:04Z,ianmichaelharper,web-flow,a480ccafd902ab3f8f9dfebbc1adb55d2cf4cc97
3258,2023-08-01T08:42:55Z,lukicenturi,web-flow,36da630045c6b246180704666128612f6910f8ed
3259,2023-08-01T08:36:07Z,kelsos,web-flow,850ad1e5ea552707ba6300c926aa8fa9421c5e91


In [68]:
# Number of distinct author-date values per (lower-cased) author login,
# smallest first.
(
    df_all_commits
    .groupby('author')['date']
    .nunique()
    .sort_values()
)

author
0saurabh0         1
jiangmencity      1
zanieb            1
mdqst             1
netscr1be         1
               ... 
tewshi          149
lukicenturi     551
kelsos          558
yabirgb         621
lefterisjp      821
Name: date, Length: 66, dtype: int64

# See where the datasets differ

In [69]:
# SHAs present in the GitHub API frame but absent from the warehouse frame.
missing_shas = list(
    set(df_all_commits['sha']) - set(df_commits['sha'])
)

# Fraction of the GitHub API rows that those missing SHAs represent.
len(missing_shas) / len(df_all_commits)

0.05184049079754601

In [70]:
# Fraction of warehouse SHAs that the GitHub API listing does not contain.
# Numerator and denominator are both counted in unique SHAs. The warehouse
# frame carries push_id and ref columns, so the same SHA may sit on several
# rows; dividing by the row count would then understate the fraction.
warehouse_shas = set(df_commits['sha'])
len(warehouse_shas.difference(set(df_all_commits['sha']))) / len(warehouse_shas)

0.038391224862888484

In [71]:
# Rows of the GitHub API frame whose SHA is absent from the warehouse frame.
df_all_commits.loc[
    df_all_commits['sha'].isin(missing_shas)
]

Unnamed: 0,date,author,committer,sha
0,2025-01-27T10:35:47Z,lukicenturi,kelsos,6c48e7a53bfbcdd15c6af8628e089deca691b9ea
1,2025-01-23T21:59:47Z,nicholasyoder,lefterisjp,c6ae0af25060dd637c240104817e944c7aaf2949
2,2025-01-25T10:57:34Z,jiangmencity,lefterisjp,7cf8a3e4b45bdb0ed1465994af415764405167ba
3,2025-01-30T13:37:47Z,prettyirrelevant,lefterisjp,daf90087939ec69cd5bbd8f925e9abbaa4f1ae04
5,2025-01-30T18:30:14Z,nicholasyoder,lefterisjp,e656bae27cf9bcb20ae0608aabad963ec26a986f
...,...,...,...,...
3190,2023-08-09T15:33:15Z,chamalis,lefterisjp,8f4a42843a6c2b51a8b63848b7a7ec41eca44903
3192,2023-08-14T08:49:17Z,lefterisjp,lefterisjp,a7f8a359321df09d321165da91afd7d1cca36f10
3193,2023-08-14T08:43:07Z,lefterisjp,lefterisjp,981e75327f7a50062d2569ca1bf1cdb987ef1de9
3194,2023-08-14T08:37:12Z,lefterisjp,lefterisjp,1392f386ca5d0a524a9db5e2a5b4cd92e3b7dcde
