In [1]:
import numpy as np
import pprint
pp = pprint.PrettyPrinter(depth = 2)
import re
from utils import *
import pickle

Load in the data

In [2]:
# Deserialize the bundled dataset object from data/datar.pickle.
# NOTE(review): pickle.load can execute arbitrary code -- only use with a trusted file.
datar_path = os.path.join("data", "datar.pickle")
with open(datar_path, "rb") as pickle_file:
    datar = pickle.load(pickle_file)

In [3]:
# Pull the individual collections off the loaded container so later cells
# can refer to them by short names.
(
    github_metadata,
    google_play_metadata,
    github_names,
    all_release_metadata,
    all_reviews_from_github,
) = (
    datar.github_metadata,
    datar.google_play_metadata,
    datar.github_names,
    datar.all_release_metadata,
    datar.all_reviews_from_github,
)

Some statistics

In [4]:
# Report summary counts for the two metadata collections.
# NOTE(review): print_statistics is not defined in this notebook; presumably it
# comes from `utils` via the star import -- confirm.
print_statistics(github_metadata, google_play_metadata)

There are 1388 GitHub repositories with metadata.
There are 1419 Google Play repositories with metadata.
There are 1107 applications with reviews.
There are 1363 applications with release-related metadata.


In [11]:
# Versions mentioned in reviews, per repository.
# The None check must guard the iterable itself: a filter clause placed after
# `for review in reviews` is only evaluated once iteration has already begun,
# so a None value would raise TypeError before the filter could help.
review_versions = {
    github_name: {
        review["reviewCreatedVersion"]
        for review in (() if reviews is None else reviews)
    }
    for github_name, reviews in all_reviews_from_github.items()
}

# Versions found in the release metadata, per repository (same None handling).
github_versions = {
    github_name: {
        get_version(release)
        for release in (() if release_metadata is None else release_metadata)
    }
    for github_name, release_metadata in all_release_metadata.items()
}

# Versions that appear on exactly one of the two sides.
symmetric_difference = {
    github_name: review_versions[github_name].symmetric_difference(github_versions[github_name])
    for github_name in github_names
}

# Versions referenced by reviews that have no matching release entry.
review_minus_github_versions = {
    github_name: review_versions[github_name].difference(github_versions[github_name])
    for github_name in github_names
}

In [41]:
# Repositories for which every review version (ignoring None) is also a known
# release version.
github_names_wo_missing_versions_with_letters = np.array([
    name
    for name, missing in review_minus_github_versions.items()
    if not any(version is not None for version in missing)
])

# Same as above, but unmatched versions containing an ASCII letter are
# disregarded; only letter-free unmatched versions disqualify a repository.
github_names_wo_missing_versions_ignoring_letters = np.array([
    name
    for name, missing in review_minus_github_versions.items()
    if not any(
        version is not None and re.search('[a-zA-Z]', version) is None
        for version in missing
    )
])

In [42]:
# Dump each list of repository names to its own text file, one name per line.
for out_path, names in (
    ("github_names_wo_missing_versions_with_letters.txt",
     github_names_wo_missing_versions_with_letters),
    ("github_names_wo_missing_versions_ignoring_letters.txt",
     github_names_wo_missing_versions_ignoring_letters),
):
    with open(out_path, "w") as out_file:
        out_file.write("\n".join(names))

In [47]:
def save_subset(datar, github_names : list, saveName : str):
    """Pickle a copy of ``datar`` restricted to the given repositories.

    Parameters
    ----------
    datar
        Container exposing ``github_metadata``, ``google_play_metadata``,
        ``all_release_metadata`` and ``all_reviews_from_github``.
    github_names
        Repository names to keep. Any iterable of hashable names works
        (the notebook passes NumPy arrays of strings).
    saveName
        File stem; the result is written to ``data/<saveName>.pickle``.

    Notes
    -----
    The two ``all_*`` collections are indexed by every requested name, so a
    name absent from either one raises (``KeyError`` for a plain dict).
    """
    # Build the lookup set once; testing membership against the raw
    # list/array would rescan it for every metadata record.
    wanted = set(github_names)

    def metadata_transform(records):
        # Keep only the records whose "github_name" was requested.
        return np.array([rec for rec in records if rec["github_name"] in wanted])

    def dict_transform(mapping):
        # Re-key in the order of ``github_names``.
        return {name: mapping[name] for name in github_names}

    github_metadata            = metadata_transform(datar.github_metadata)
    google_play_metadata       = metadata_transform(datar.google_play_metadata)
    all_release_metadata       = dict_transform(datar.all_release_metadata)
    all_reviews_from_github    = dict_transform(datar.all_reviews_from_github)

    # ``github_names`` is stored exactly as passed in.
    datar_subset = DATAR(github_metadata,
                         google_play_metadata,
                         github_names,
                         all_release_metadata,
                         all_reviews_from_github)

    with open(os.path.join("data", saveName + ".pickle"), "wb") as f:
        pickle.dump(datar_subset, f, protocol=pickle.HIGHEST_PROTOCOL)

In [48]:
# Persist both filtered variants of the dataset under data/.
for subset_names, subset_file in (
    (github_names_wo_missing_versions_with_letters, "data_clean"),
    (github_names_wo_missing_versions_ignoring_letters, "data_clean_no_letters"),
):
    save_subset(datar, subset_names, subset_file)

In [21]:
# Repositories that have at least one review version recorded.
github_repos_with_reviews = [
    name for name, versions in review_versions.items() if len(versions) > 0
]
print(f"There are {len(github_repos_with_reviews)} ({len(github_repos_with_reviews) / len(github_names) * 100:.1f}%) GitHub repositories with reviews.")

There are 1084 (78.1%) GitHub repositories with reviews.
