In [1]:
from tqdm.notebook import tqdm
import json
import collections
import pandas as pd
import numpy as np
import glob

# Load JSON files

In [41]:
# List every JSON dump that sits one directory level below ../data, sorted by path.
json_dump_paths = glob.glob('./../data/*/*.json')
sorted(json_dump_paths)

['./../data/activity_facts/2021-06-07 13:00:40.json',
 './../data/analyses/2021-06-07 13:01:54.json',
 './../data/html_pages/2021-06-07 13:13:26.json',
 './../data/projects/project_ids.json']

In [42]:
# Parse the activity-facts dump, then build a DataFrame without repeated rows.
with open('./../data/activity_facts/2021-06-07 13:00:40.json', 'rb') as activity_file:
    activity_facts_dict = json.load(activity_file)
activities = pd.DataFrame(activity_facts_dict)
activities = activities.drop_duplicates()

In [43]:
# Parse the analyses dump, then build a DataFrame without repeated rows.
with open('./../data/analyses/2021-06-07 13:01:54.json', 'r') as analyses_file:
    analyses_dict = json.load(analyses_file)
analyses = pd.DataFrame(analyses_dict)
analyses = analyses.drop_duplicates()

In [44]:
# Parse the HTML-page dump; the resulting frame is de-duplicated and keyed by project_id.
with open('./../data/html_pages/2021-06-07 13:13:26.json', 'r') as html_pages_file:
    html_pages_dict = json.load(html_pages_file)
html_pages = pd.DataFrame(html_pages_dict)
html_pages = html_pages.drop_duplicates()
html_pages = html_pages.set_index('project_id')

# Fix data types

In [45]:
# `month` becomes a timezone-naive datetime; every column other than `month`
# and `project_id` is cast to int.
activities['month'] = pd.to_datetime(activities['month']).dt.tz_localize(None)
count_columns = [name for name in activities.columns if name not in ('month', 'project_id')]
for name in count_columns:
    activities[name] = activities[name].astype(int)
activities.dtypes

month               datetime64[ns]
code_added                   int64
code_removed                 int64
comments_added               int64
comments_removed             int64
blanks_added                 int64
blanks_removed               int64
commits                      int64
contributors                 int64
project_id                  object
dtype: object

In [46]:
# Convert each timestamp column of `analyses` to a timezone-naive datetime.
for name in ('min_month', 'max_month', 'updated_at', 'oldest_code_set_time'):
    analyses[name] = pd.to_datetime(analyses[name]).dt.tz_localize(None)

# One row per project: the maximum of each date column over that project's analyses.
per_project = analyses.groupby('project_id')
projects = per_project[['updated_at', 'min_month', 'max_month']].max()

projects.dtypes

updated_at    datetime64[ns]
min_month     datetime64[ns]
max_month     datetime64[ns]
dtype: object

# Trim & filter

In [47]:
# Keep only activity rows whose month falls in the years 1991 through 2020 (both inclusive).
activity_year = activities['month'].dt.year
trimmed_activities = activities[activity_year.between(1991, 2020)]

In [48]:
# Rows of `html_pages` that have a non-null `original_project_name`.
has_original = html_pages['original_project_name'].notna()
duplicate_projects = html_pages.loc[has_original]

In [49]:
# Drop the activity rows of every project listed in `duplicate_projects`.
duplicate_ids = set(duplicate_projects.index)
is_duplicate = trimmed_activities['project_id'].isin(duplicate_ids)
trimmed_activities_no_duplicates = trimmed_activities[~is_duplicate]

In [50]:
# Project ids that still have at least one activity row after trimming and duplicate removal.
valid_project_ids = set(trimmed_activities_no_duplicates['project_id'])

In [62]:
# Restrict the per-project table to the projects that survived filtering.
is_valid = projects.index.isin(valid_project_ids)
valid_projects = projects.loc[is_valid]

In [63]:
# Persist the filtered per-project table to the HDF5 file under the 'valid_projects' key.
valid_projects.to_hdf('../data/openhub.h5', key='valid_projects')

# Fill all activities

In [64]:
def fill(project_id, df, last_seen):
    """Expand one project's activity rows into a gap-free monthly series.

    Parameters
    ----------
    project_id
        Identifier written into the ``project_id`` column of every returned row.
    df : pandas.DataFrame
        Activity rows of a single project. Must contain a ``month`` column;
        its values are assumed to be unique month-start timestamps.
    last_seen : pandas.Timestamp or None
        Month up to which the series should be extended. When it is given
        (and is not NaT), an all-zero row is added for that month unless one
        already exists. The month is capped at December 2020.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``month`` at month-start ('MS') frequency. Months without
        a recorded row contain 0 in every column except ``project_id``.
    """
    df = df.set_index('month')

    # Explicit None/NaT check: a missing dict entry yields None, and a missing
    # timestamp (NaT) must not be used as an end month.
    if last_seen is not None and not pd.isna(last_seen):
        ts = min(pd.Timestamp(last_seen), pd.Timestamp('2020-12-01'))
        # Only add the month when it is absent, so existing data is never
        # overwritten and the index stays unique for `asfreq`.
        if ts not in df.index:
            zero_row = pd.DataFrame(
                0,
                index=pd.DatetimeIndex([ts], name=df.index.name),
                columns=df.columns,
            )
            df = pd.concat([df, zero_row], sort=False).sort_index()
    # Insert a zero row for every missing month between the first and last month.
    df = df.asfreq('MS', fill_value=0)
    # The zero rows carry 0 as project_id; restore the real id everywhere.
    df['project_id'] = project_id
    return df

In [65]:
# project_id -> first day of the month before the project's `updated_at` month.
# Period arithmetic replaces `astype('datetime64[M]')`, which pandas >= 2.0
# rejects for Series (only s/ms/us/ns resolutions are supported); the resulting
# month-start timestamps are the same.
last_seen_dict = (
    (valid_projects['updated_at'].dt.to_period('M') - 1)
    .dt.to_timestamp()
    .to_dict()
)

In [69]:
# Materialise the (project_id, rows) pairs so the progress bar knows the total.
groups = [
    (pid, frame)
    for pid, frame in trimmed_activities_no_duplicates.groupby('project_id')
]

In [70]:
# Fill each project's series; projects missing from `last_seen_dict` pass None.
filled_activities = [
    fill(pid, frame, last_seen_dict.get(pid))
    for pid, frame in tqdm(groups)
]

  0%|          | 0/172833 [00:00<?, ?it/s]

In [71]:
# Stack all per-project frames and turn the month index back into a column
# (named 'month' even if the index name was lost and came out as 'index').
stacked = pd.concat(filled_activities)
final = stacked.reset_index().rename(columns={'index': 'month'})

In [72]:
# Store the gap-filled activities, indexed and sorted by (month, project_id),
# in the HDF5 file under the 'filled_activities' key.
final.set_index(['month', 'project_id']).sort_index().to_hdf('../data/openhub.h5', key='filled_activities')

# Overview

## Projects

In [73]:
# Number of distinct project ids in the per-project table built from `analyses`.
projects.index.nunique()

173305

In [74]:
# Number of distinct project ids that appear in the raw activity rows.
activities['project_id'].nunique()

173305

In [75]:
# Distinct project ids left after restricting activity to 1991-2020.
trimmed_activities['project_id'].nunique()

173265

In [76]:
# Distinct project ids left after also removing duplicate projects.
trimmed_activities_no_duplicates['project_id'].nunique()

172833

## Duplicates

In [77]:
# How many projects the duplicate filter removed from the trimmed activities.
projects_before = trimmed_activities['project_id'].nunique()
projects_after = trimmed_activities_no_duplicates['project_id'].nunique()
projects_before - projects_after

432

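The Linux kernel's project ID is `'e206a54e97690cce50cc872dd70ee896'`; the cell below counts how many projects list it as their `original_project_name`.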

In [78]:
# Count the pages whose `original_project_name` equals the Linux kernel's id.
linux_kernel_id = 'e206a54e97690cce50cc872dd70ee896'
points_to_linux = html_pages['original_project_name'] == linux_kernel_id
html_pages.loc[points_to_linux, 'original_project_name'].count()

72

In [82]:
# (rows, columns) of the per-project table kept after trimming and duplicate removal.
valid_projects.shape

(172833, 3)