In [None]:
from pathlib import Path
import bz2

import orjson
from tqdm.auto import tqdm
import pandas as pd

In [None]:
def load_data(file_path: Path):
    """Read a bz2-compressed JSON file and return its parsed content.

    Args:
        file_path: Location of the ``.json.bz2`` file to read.

    Returns:
        Whatever ``orjson.loads`` produces for the decompressed bytes.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist (propagated from ``open``).
    """
    with open(file_path, 'rb') as compressed_file:
        compressed_bytes = compressed_file.read()
    # The whole payload is decompressed in memory before parsing.
    json_bytes = bz2.decompress(compressed_bytes)
    return orjson.loads(json_bytes)

In [None]:
def extract_login(data):
    """Return the login found under the ``actor`` and/or ``user`` entry of ``data``.

    Args:
        data: Mapping that may carry an ``actor`` and/or a ``user`` entry,
            each of which is expected to be a mapping with a ``login`` key.

    Returns:
        The login when exactly one of the two entries yields one, or when both
        yield the same value. If neither entry is present the result is ``None``.

    Raises:
        Exception: If both entries yield logins that differ from each other.
    """
    actor_login = data.get('actor')
    actor_login = actor_login.get('login') if actor_login else actor_login

    user_login = data.get('user')
    user_login = user_login.get('login') if user_login else user_login

    # Both sides agree (this includes the case where both are missing).
    if user_login == actor_login:
        return actor_login
    # Exactly one side carries a usable login.
    if user_login and not actor_login:
        return user_login
    if actor_login and not user_login:
        return actor_login
    raise Exception(f"Couldn't load login from {data}")

In [None]:
events = []
file_endpoints = {}
repositories = {}
for pull_file in tqdm(list(Path('Spotify').glob('repos/*/*/pulls.json.bz2'))):
    pull_file_path = Path(pull_file)
    repo_owner = pull_file_path.parts[-3]
    repo_name = pull_file_path.parts[-2]
    for pull in load_data(pull_file_path):
        pull_number = pull['number']
        pull_id = str(pull['id'])

        repositories[pull_id] = f'{repo_owner}/{repo_name}'

        events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(pull), pull['created_at'], 'created',  None)]

        timeline_path = pull_file_path.parent / f'timelines/{pull["number"]}.json.bz2'
        try:
            time_line_items = [item for item in load_data(timeline_path) if item['event'] != 'committed'] # we exclude commit events since the user data is not mapped to the GitHub datascheme
        except FileNotFoundError:
            time_line_items = []

        for event in time_line_items:
            event_type = event['event']
            match event_type:
                case 'reviewed':
                    events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(event), event['submitted_at'], event_type, None)]
                case 'commit-commented' | 'line-commented':
                    for comment in event['comments']:
                        events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(comment), comment['created_at'], event_type, None)]
                        events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(comment), comment['updated_at'], event_type, None)]
                case 'created' | 'closed' | 'commented' | 'reopened':
                    events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(event), event['created_at'], event_type, None)]
                case 'cross-referenced':
                    referencing_issue = str(event['source']['issue']['id'])
                    events += [(repo_owner, repo_name, pull_number, pull_id, extract_login(event), event['created_at'], event_type, referencing_issue)]
                    break
                case _:
                    pass

In [None]:
# Values of the `username` column of spotifiers.csv, held as a set for membership checks.
spotifiers = pd.read_csv('raw_data/spotifiers.csv')
humans = set(spotifiers.username)

In [None]:
# Build the event table in one chain: de-duplicate the raw records, then derive columns.
# NOTE(review): `source` holds issue ids taken from cross-reference events while `pr_id`
# holds ids taken from pulls.json; confirm both share one id space, otherwise
# `source_is_pr` can never be True.
events_df = (
    pd.DataFrame(
        events,
        columns=['owner', 'repo', 'pull', 'pr_id', 'user_id', 'timestamp', 'event', 'source'],
    )
    .drop_duplicates()
    .assign(
        # Parse the timestamps and strip the timezone information.
        timestamp=lambda frame: pd.to_datetime(frame['timestamp']).dt.tz_localize(None),
        # True when the acting login appears in `humans`.
        human=lambda frame: frame['user_id'].isin(humans),
        # True when the referencing source id is itself one of the collected pull ids.
        source_is_pr=lambda frame: frame['source'].isin(set(frame['pr_id'].unique())),
    )
)

In [None]:
# Persist the assembled event table as a pickle file.
events_pickle_path = 'raw_data/events.pickle'
events_df.to_pickle(events_pickle_path)