In [100]:
import sys
import pandas as pd
import numpy as np
import scipy.sparse as sp
from scipy.sparse.linalg import spsolve
import random
import gzip
import json
import pdb

from sklearn.preprocessing import MinMaxScaler

import implicit # The Cython library

In [117]:
# load toy example
# Files come from GH Archive, one gzip-compressed file per hour, e.g.:
#   wget http://data.gharchive.org/2018-07-16-15.json.gz

# Where the hourly dumps live and which ones to read.
data_dir = '/Users/amy/git/MetaGithub/data'
archive_day = '2018-07-16'
hours = range(10, 16)  # hours 10..15 inclusive, i.e. six hourly files

# Each line of a dump is parsed as one JSON document; all of them are
# accumulated into a single flat list.
events = []
for hour in hours:
    archive_path = '%s/%s-%d.json.gz' % (data_dir, archive_day, hour)
    with gzip.open(archive_path) as f:
        for line in f:
            line = line.strip()
            # A blank line is not a JSON document; skip it instead of
            # letting json.loads raise on empty input.
            if not line:
                continue
            events.append(json.loads(line))

In [118]:
# get all watch events
# The WatchEvent corresponds to starring a repository, not watching.
star_events = list(filter(lambda ev: ev['type'] == 'WatchEvent', events))

# tally how often each repo was starred during the loaded period
# maps repo name to count of star events
repo2count = {}
for star in star_events:
    name = star['repo']['name']
    # a repo seen for the first time starts from zero
    repo2count[name] = repo2count.get(name, 0) + 1



In [136]:
# keep the names of repos that were starred strictly more than k times
# i.e. the popular repos of this time period
# (this could be swapped for a "top N repos" selection instead)

k = 10 # this was chosen arbitrarily

popular_repos = [
    name
    for name, n_stars in repo2count.items()
    if n_stars > k
]

len(popular_repos)

239

In [137]:
# user2repo: user to list of repos they've starred
# only star events on popular repos are taken into account

user2repo = {} # maps user name to list of repo names they've starred
repos = set() # set of all repos that were starred

# set membership makes the popularity check below cheap
popular_repos = set(popular_repos)

for event in star_events:
    repo_name = event['repo']['name']
    user_name = event['actor']['login']

    # only star events on popular repos are recorded
    if repo_name in popular_repos:
        # first sighting of a user creates their (empty) repo list
        user2repo.setdefault(user_name, []).append(repo_name)

len(user2repo)

3562

In [138]:
# keep only users whose number of recorded stars lies in [min_n, max_n]

min_n = 2
max_n = 10

good_users = {
    user
    for user, repo_list in user2repo.items()
    if min_n <= len(repo_list) <= max_n
}

len(good_users)

407

In [139]:
# build the parallel columns for the dataframe: user, pop_repo, n_star
users = []
repos = []

for user in user2repo:
    # only good users contribute rows
    if user in good_users:
        starred = user2repo[user]
        repos.extend(starred)
        # repeat the user name once per starred repo so both lists stay aligned
        users.extend([user] * len(starred))

# every row is a single star
stars = [1] * len(users)

len(users)

1026

In [140]:
# Assemble the interaction table: one row per (user, repo) star.
raw_data = pd.DataFrame({'user':users, 'repo':repos, 'star':stars})

# Work on an independent copy with incomplete rows removed, so the
# column assignments below never touch raw_data.
data = raw_data.dropna().copy()

# Create a numeric user_id and repo_id column
data['user'] = data['user'].astype("category")
data['repo'] = data['repo'].astype("category")
data['user_id'] = data['user'].cat.codes
data['repo_id'] = data['repo'].cat.codes

# Sparse matrix with one row per repo and one column per user.
# The shape is passed explicitly (number of distinct repos x number of
# distinct users) instead of being inferred from the largest index, so the
# construction also works when there are no rows at all.
# Per the scipy documentation, repeated (repo_id, user_id) pairs are summed.
n_repos = len(data['repo'].cat.categories)
n_users = len(data['user'].cat.categories)
item_user_data = sp.csr_matrix(
    (data['star'].astype(float), (data['repo_id'], data['user_id'])),
    shape=(n_repos, n_users),
)
item_user_data.shape

(104, 407)

In [148]:
# preview the first five rows of the interaction table
data.head(n=5)

Unnamed: 0,repo,star,user,user_id,repo_id
0,phobal/ivideo,1,springlo,322,73
1,BradLarson/GPUImage3,1,springlo,322,0
2,kamranahmedse/developer-roadmap,1,i23591326,177,62
3,adam-golab/react-developer-roadmap,1,i23591326,177,15
4,getify/You-Dont-Know-JS,1,i23591326,177,47


In [145]:
import pickle

# Persist the sparse item-user matrix and the DataFrame it was built from.
# The files are opened in `with` blocks so they are flushed and closed as
# soon as each dump finishes, instead of leaving the handles open.
# NOTE(review): the file names say "5hrs" while the load cell iterates over
# range(10, 16), i.e. six hourly files -- confirm which one is intended.
with open("/Users/amy/git/MetaGithub/item_user_matrix.5hrs.csr", "wb") as matrix_file:
    pickle.dump(item_user_data, matrix_file)

with open("/Users/amy/git/MetaGithub/item_user_matrix.5hrs.df", "wb") as frame_file:
    pickle.dump(data, frame_file)

In [127]:
# Confidence values: the item-user matrix scaled by alpha, stored as doubles.
alpha_val = 40
data_conf = (item_user_data * alpha_val).astype('double')

# Set up the ALS model
model = implicit.als.AlternatingLeastSquares(
    factors=50,
    regularization=0.01,
    iterations=50,
)

# Train on the confidence matrix
# NOTE(review): presumably the installed implicit version expects an
# item x user matrix in fit() -- confirm against the library version in use.
model.fit(data_conf)

100%|██████████| 50.0/50 [00:00<00:00, 793.62it/s]


In [141]:
# look up the repos most similar to the repo with id 0
similar = model.similar_items(0)

for idx, _score in similar:
    # translate the numeric repo id back to its name via any matching row
    matching_names = data.loc[data.repo_id == idx, 'repo']
    print(matching_names.iloc[0])



BradLarson/GPUImage3
google/compare_gan
tpn/pdfs
serhii-londar/open-source-mac-os-apps
xiaqunfeng/machine-learning-yearning
hnes/libaco
andymass/vim-tradewinds
polachok/toykio
nhnent/tui.image-editor
ant-design/ant-design
