In [293]:
from tqdm.notebook import tqdm
import json
import collections
import pandas as pd
import numpy as np

## Load JSON files

In [294]:
with open('./../data/activity_facts.json', 'rb') as f:
    activity_facts_dict = json.load(f)
activities = pd.DataFrame(activity_facts_dict).drop_duplicates()

In [295]:
with open('./../data/analyses.json', 'r') as f:
    analyses_dict = json.load(f)

analyses_dict_cleaned = []
for d in analyses_dict:
    d.pop('factoids', None)
    d.pop('languages', None)
    analyses_dict_cleaned += [d]

analyses = pd.DataFrame(analyses_dict_cleaned).drop_duplicates()

In [296]:
with open('./../data/html_pages.json', 'r') as f:
    html_pages_dict = json.load(f)
html_pages = pd.DataFrame(html_pages_dict).drop_duplicates()

# Fix data types

In [297]:
activities.month = pd.to_datetime(activities.month).dt.tz_localize(None)
for column in set(activities.columns) - {'month', 'project_id'}:
    activity_facts[column] = activities[column].astype(int)
activities.dtypes

month               datetime64[ns]
code_added                  object
code_removed                object
comments_added              object
comments_removed            object
blanks_added                object
blanks_removed              object
commits                     object
contributors                object
project_id                  object
dtype: object

In [338]:
analyses.min_month = pd.to_datetime(analyses.min_month).dt.tz_localize(None)
analyses.max_month = pd.to_datetime(analyses.max_month).dt.tz_localize(None)
analyses.updated_at = pd.to_datetime(analyses.updated_at).dt.tz_localize(None)
analyses.oldest_code_set_time = pd.to_datetime(analyses.oldest_code_set_time).dt.tz_localize(None)

projects = analyses.groupby('project_id')[['updated_at', 'min_month', 'max_month']].max()

projects.dtypes

updated_at    datetime64[ns]
min_month     datetime64[ns]
max_month     datetime64[ns]
dtype: object

# Trim

In [360]:
trimmed_activities = activities[(activities.month.dt.year>=1991) & (activities.month.dt.year<=2020)]

In [356]:
available_projects = set(trimmed_activities.project_id)
duplicates = set(html_pages[html_pages.duplicate_link != ''].project_id)

In [365]:
len(available_projects)

173036

In [357]:
duplicates

{'494454'}

In [358]:
valid_projects = projects[projects.index.isin(available_projects-duplicates)]

In [359]:
valid_projects.to_hdf('../data/openhub.h5', key='valid_projects')

# Fill all activities

In [347]:
def fill(project_id, df, last_seen):
    df = df.set_index('month')
    
    if last_seen:
        ts = min(last_seen.to_datetime64(), np.datetime64('2020-12-01T00:00:00.000000000'))
        try:
            df = df.append(new_entry, verify_integrity=True, sort=False).sort_index() 
        except: # entry (month) does already exist
            pass
    df = df.asfreq('MS', fill_value=0)
    df['project_id'] = project_id
    return df

In [348]:
last_seen_dict = dict((valid_projects.updated_at.astype('datetime64[M]')) - pd.DateOffset(months=1))

In [349]:
filled_activities = [fill(*group, last_seen_dict.get(group[0])) for group in tqdm(groups)]

  0%|          | 0/168510 [00:00<?, ?it/s]

In [352]:
final = pd.concat(filled_activities).reset_index().rename(columns={'index': 'month'})

In [353]:
final.set_index(['month', 'project_id']).sort_index().to_hdf('../data/openhub.h5', key='filled_activities')