In [270]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import spsolve
import random
import gzip
import json
import pdb

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

In [275]:
# load toy example
# File from: wget http://data.gharchive.org/2018-07-16-15.json.gz
star_events = []

for date in range(16, 23):
    
    # format date string so that 1 is encoded as '01', etc.
    my_date = str(date) if date > 9 else '0'+ str(date)

    for hour in range(0, 23):
        with gzip.open('/Users/amy/git/MetaGithub/data/2018-07-%s-%d.json.gz' %(my_date, hour)) as f:
            for line in f:
                fields = json.loads(line.strip())
                if fields['type'] == 'WatchEvent':
                    star_events.append(fields)
                
len(star_events)

641396

In [276]:
# count how many times each repo was starred during this time
# maps repo name to count of star events
repo2count = {}

for event in star_events:
    
    repo_name = event['repo']['name']
    
    if repo_name not in repo2count:
        repo2count[repo_name] = 1
    else:
        repo2count[repo_name] += 1

len(repo2count)

207329

In [435]:
# record the names of repos that were starred more than k times during this time
# i.e. get popular repos during this time period
# we can change this code to find the top xx repos instead, etc.

k = 50 # this was chosen arbitrarily 
popular_repos = [repo for repo, count in repo2count.items() if count > k]
     
len(popular_repos)

1165

In [440]:
#popular_repos
df = pd.Series(popular_repos)
df.to_csv("/Users/amy/git/MetaGithub/popular_repos.csv")

In [390]:
# user2repo: user to list of repo they've starred
# only process users who have interacted with the popular repos

# aggregate starred repos for each user
user2repo = {} # maps user name to list of repo names they've starred

for event in star_events:
    
    repo_name = event['repo']['name']
    user_name = event['actor']['login']

    # skip star event if the repo is not popular
    if repo_name not in popular_repos:
        continue
    
    # for new user, initialize repo list
    if user_name not in user2repo:
        user2repo[user_name] = [repo_name]
    else:
        user2repo.get(user_name).append(repo_name)
        

len(user2repo)

94654

In [391]:
# filter out users who starred < min_n or > max_n repos
# i.e. filter out users that star too few or too many things

min_n = 3
max_n = 20

good_users = set([user for user, repo_list in user2repo.items() if len(repo_list) > min_n and len(repo_list) < max_n])

len(good_users)

5860

In [392]:
# make dataframe: user, pop_repo, n_star
users = []
repos = []

for user, repo_list in user2repo.items():
    
    # skip if not in good users
    if user not in good_users:
        continue


    repos += [repo for repo in repo_list]
    users += [user for i in range(0, len(repo_list))]


stars = [1 for i in range(0, len(users))]

len(users)

33019

In [393]:
raw_data = pd.DataFrame({'user':users, 'repo':repos, 'star':stars})

raw_data.head()

data = raw_data.dropna()
data = data.copy()

# Create a numeric user_id and repo_id column
data['user'] = data['user'].astype("category")
data['repo'] = data['repo'].astype("category")
data['user_id'] = data['user'].cat.codes
data['repo_id'] = data['repo'].cat.codes

item_user_data = sp.csr_matrix((data['star'].astype(float), (data['repo_id'], data['user_id'])))
item_user_data.shape

(1160, 5860)

In [356]:
data.head()

Unnamed: 0,repo,star,user,user_id,repo_id
0,laravel/framework,1,imamriyadi,694,430
1,Microsoft/vscode,1,imamriyadi,694,72
2,golang/go,1,imamriyadi,694,324
3,kubernetes/kubernetes,1,imamriyadi,694,426
4,godotengine/godot,1,imamriyadi,694,319


In [340]:
#import pickle
#pickle.dump(item_user_data, open("/Users/amy/git/MetaGithub/item_user_matrix.5hrs.csr", "wb"))
#pickle.dump(data, open("/Users/amy/git/MetaGithub/item_user_matrix.5hrs.df", "wb"))

In [433]:
# initialize model
# number of latent factors
model = implicit.als.AlternatingLeastSquares(factors=50, regularization=0.01, iterations=50)

# Calculate the confidence by multiplying it by our alpha value.
alpha_val = 15
data_conf = (item_user_data * alpha_val).astype('double')

# Fit the model
model.fit(data_conf)

100%|██████████| 50.0/50 [00:00<00:00, 82.62it/s]


In [434]:
# find similar repos
similar = model.similar_items(15)
min_score = .5

for item in similar:
    idx, score = item
    if score > min_score:
        print((data.repo[data.repo_id == idx].iloc[0], score))



('Bigkoo/Android-PickerView', 1.0000001)
('CymChad/BaseRecyclerViewAdapterHelper', 0.93985856)
('PhilJay/MPAndroidChart', 0.93609178)
('bumptech/glide', 0.87561345)
('codepath/android_guides', 0.85858792)
('ReactiveX/RxAndroid', 0.85261893)
('JessYanCoding/MVPArms', 0.83324397)
('google/dagger', 0.83246356)
('googlesamples/android-architecture', 0.83187306)
('JakeWharton/butterknife', 0.82694513)
