In [None]:
from pathlib import Path
import hashlib
import bz2
from datetime import datetime

try:
    import orjson as json
except ImportError:
    import json

import pandas as pd
from tqdm.auto import tqdm

In [None]:
# Load your users' locations here: one row per user and month, which captures
# how user locations change over time.
user_location = pd.DataFrame([], columns=['user_id', 'month', 'country'])

# If no time information is available, user_location can instead be a dict mapping
# each user to a country (static; does not capture user locations over time).
# NOTE: as written, this assignment replaces the DataFrame above; keep only the
# variant that matches your data.
user_location = {
    'c8b39e436e0d96f0c8f7c66908a02d15': 'Germany',
    '3425a6f8362416088ea186018a0f5d71': 'Sweden'
}

In [None]:
# Set of user ids that have a location entry.
# isinstance (instead of an exact type comparison) also accepts dict subclasses
# such as OrderedDict or defaultdict, which would otherwise fall through to the
# DataFrame branch and fail on `.user_id`.
if isinstance(user_location, dict):
    humans = set(user_location)  # iterating a dict yields its keys, i.e. the user ids
else:
    humans = set(user_location.user_id)

In [None]:
def hide(a_string: str):
    """Return the hexadecimal MD5 digest of ``a_string`` (encoded as UTF-8).

    Falsy input (``None`` or the empty string) is not hashed; ``None`` is
    returned instead.
    """
    if not a_string:
        return None
    digest = hashlib.md5(a_string.encode('utf-8'))
    return digest.hexdigest()

def load_data(file_path: Path):
    """Read a bz2-compressed JSON file and return the parsed content.

    The whole file is read into memory, decompressed in one go and passed to
    ``json.loads``. A missing file surfaces as the ``FileNotFoundError`` raised
    by ``open``.
    """
    with open(file_path, 'rb') as compressed_file:
        compressed = compressed_file.read()
    return json.loads(bz2.decompress(compressed))

def remove_fields(data, fields_to_remove):
    """Recursively drop every key listed in ``fields_to_remove`` from nested data.

    Dicts and lists are rebuilt, so the input itself is not modified. Values of
    any other type are returned unchanged.
    """
    if isinstance(data, list):
        return [remove_fields(element, fields_to_remove) for element in data]
    if not isinstance(data, dict):
        return data
    cleaned = {}
    for key, value in data.items():
        if key in fields_to_remove:
            continue
        cleaned[key] = remove_fields(value, fields_to_remove)
    return cleaned

def anonymize_user_fields(data):
    """Recursively replace user objects by the hash of their login.

    Any dict containing a ``'login'`` key is treated as a user object and is
    replaced as a whole by ``hide(login)``. Other dicts and lists are rebuilt
    with their members processed the same way; all remaining values are
    returned unchanged.
    """
    if isinstance(data, list):
        return [anonymize_user_fields(element) for element in data]
    if not isinstance(data, dict):
        return data
    if 'login' in data:  # a user dict: collapse it into a single hashed id
        return hide(data['login'])
    return {key: anonymize_user_fields(value) for key, value in data.items()}

In [None]:
# Root directory of the input data; pull request files are looked up below it
# via the pattern `repos/*/*/pulls.json.bz2`. Change if needed.
data_dir = Path('./data')

In [None]:
# Collect every pull request together with a reduced, anonymized copy of its timeline.
pull_fields = ('id', 'number', 'state', 'created_at', 'closed_at', 'merged_at', 'author_association', 'user')
timeline_fields_to_drop = ('performed_via_github_app', 'label', 'reactions', 'commit_id', 'labels', 'repository', 'assignee', 'assignees', 'milestone', 'diff_hunk', 'path')

pulls = []
for pull_file in tqdm(list(data_dir.glob('repos/*/*/pulls.json.bz2'))):
    pull_file_path = Path(pull_file)
    for full_pull in load_data(pull_file_path):
        # keep only the fields needed later and replace the user object by its hash
        pull = anonymize_user_fields({field: full_pull[field] for field in pull_fields})

        # timelines are stored per pull request number next to the pulls file
        timeline_path = pull_file_path.parent / f'timelines/{pull["number"]}.json.bz2'
        try:
            raw_timeline = load_data(timeline_path)
        except FileNotFoundError:
            print(f'{timeline_path} seems to be missing')
            time_line_items = []
        else:
            # commit events are excluded since their user data is not mapped to the GitHub data scheme
            time_line_items = [item for item in raw_timeline if item['event'] != 'committed']
        time_line_items = anonymize_user_fields(remove_fields(time_line_items, timeline_fields_to_drop))

        pull['timeline'] = time_line_items
        pulls.append(pull)

In [None]:
# Flatten all pull requests into (user_id, timestamp, action, pr_id) tuples.
events = []
for pull in tqdm(pulls):
    pull_id = pull['id']
    # opening the pull request is recorded as a 'created' activity of its author
    events.append((pull['user'], pull['created_at'], 'created', pull_id))
    for event in pull['timeline']:
        event_type = event['event']
        if event_type == 'reviewed':
            events.append((event['user'], event['submitted_at'], event_type, pull_id))
        elif event_type in ('commit-commented', 'line-commented'):
            # these events bundle several comments; each comment becomes its own activity
            events.extend(
                (comment['user'], comment['updated_at'], event_type, pull_id)
                for comment in event['comments']
            )
        elif event_type in ('created', 'closed', 'commented', 'reopened'):
            events.append((event['actor'], event['created_at'], event_type, pull_id))
        # all other event types are ignored

# keep only activities of users that have a location entry
events = [event for event in events if event[0] in humans]

In [None]:
# One row per activity; rows with any missing value are dropped.
event_columns = ['user_id', 'timestamp', 'action', 'pr_id']
activities = pd.DataFrame(events, columns=event_columns).dropna()

# parse the timestamps and strip the time zone information
naive_timestamps = pd.to_datetime(activities['timestamp']).dt.tz_localize(None)
activities['timestamp'] = naive_timestamps
# truncate each timestamp to its month
activities['month'] = naive_timestamps.to_numpy().astype('datetime64[M]')

In [None]:
# Attach a `country` column to every activity.
if isinstance(user_location, dict):
    # Static mapping user_id -> country. `map` is a plain lookup that yields NaN
    # for ids missing from the mapping, whereas `replace` would silently keep
    # the raw user id as the "country" value.
    activities['country'] = activities.user_id.map(user_location)
else:
    # Time-dependent mapping: look up the country per user and month. Dropping a
    # `country` column left over from a previous run keeps this cell re-runnable
    # (otherwise the merge would produce `country_x` / `country_y`).
    activities = (
        activities
        .drop(columns='country', errors='ignore')
        .merge(user_location, how='left', on=['user_id', 'month'], validate='m:1')
    )

In [None]:
# Please find more information on modelling code review as communication channels here: https://dl.acm.org/doi/abs/10.1145/3544902.3546254

# Each pull request is one communication channel, described by its first and last
# activity and by the number of distinct countries of its participants.
activities_by_pr = activities.groupby('pr_id')

start = activities_by_pr.timestamp.min().rename('start')
end = activities_by_pr.timestamp.max().rename('end')
# lower bound: distinct known countries per pull request (missing countries are not counted)
countries = activities_by_pr.country.nunique(dropna=True).rename('countries')
# number of activities per pull request whose country is unknown
# NOTE(review): this counts activity rows, not distinct users - confirm that this is the intended upper bound
unclear = activities.country.isnull().groupby(activities.pr_id).sum().rename('unclear')
countries_max = (countries + unclear).rename('countries_max')

communication_channels = pd.concat([start, end, countries, countries_max], axis=1)

In [None]:
# Change time frame as needed
sample_freq = '1M'
sample_timeframe_1 = (datetime(2017,1,1)<=communication_channels.end) & (communication_channels.end<=datetime(2023,1,1))


def _channels_per_period(selection):
    """Count the selected communication channels per `sample_freq` bin of their `end` timestamp."""
    selected = communication_channels[selection]
    return selected.resample(sample_freq, on='end').start.count()


# all channels, channels with more than one known country, and channels that
# may span more than one country once unknown locations are taken into account
all_communication_channels = _channels_per_period(sample_timeframe_1)
crossborder_communication_channels = _channels_per_period(
    sample_timeframe_1 & (communication_channels.countries>1)
)
crossborder_max_communication_channels = _channels_per_period(
    sample_timeframe_1 & (communication_channels.countries_max>1)
)

In [None]:
# Share of cross-border communication channels per period (lower and upper bound).
# The cross-border series only span the periods between their own first and last
# channel, so they are aligned to the index of all channels first: periods without
# any cross-border channel then count as 0 instead of becoming NaN in the division.
plot_window = slice('2018-12-01', '2023-01-01')

crossborder_aligned = crossborder_communication_channels.reindex(all_communication_channels.index, fill_value=0)
crossborder_max_aligned = crossborder_max_communication_channels.reindex(all_communication_channels.index, fill_value=0)

lower_y = (crossborder_aligned/all_communication_channels).rename('lower_crossborder').loc[plot_window]
upper_y = (crossborder_max_aligned/all_communication_channels).rename('upper_crossborder').loc[plot_window]

In [None]:
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

# Share of cross-border code reviews over time: the black line is the lower bound
# (known locations only), the red band reaches up to the upper bound.
fig, ax = plt.subplots(figsize=(12, 6), dpi=300)

x = lower_y.index
ax.plot(x, lower_y, c='black', label='Clear location')
ax.fill_between(x, lower_y, upper_y, color='red', alpha=0.25, label='Unclear location')

ax.set_ylabel('Cross-border code reviews')
ax.set_xlabel('')

# date locators and formatters; only the month locator is applied below
years = mdates.YearLocator()
months = mdates.MonthLocator()
monthsFmt = mdates.DateFormatter('%b')
yearsFmt = mdates.DateFormatter('%Y')

ax.xaxis.set_minor_locator(months)
ax.set_xbound(pd.Timestamp('2019-01-01'), pd.Timestamp('2023-01-01'))

ax.grid(which='major')
ax.legend(loc='upper left')
fig.tight_layout()