In [1]:
import boto3
import pandas as pd

In [2]:
train_file = pd.read_csv('data/train.csv', index_col=None, sep=',') \
    .drop_duplicates()
train_file.columns = ['uid', 'repoid', 'uname', 'reponame', 'date']
train_file = train_file.drop_duplicates()
train_file.head()

Unnamed: 0,uid,repoid,uname,reponame,date
0,24466870,59996401,zzkkui,yapi,2018-03-13T01:07:02Z
1,24466870,5239185,zzkkui,quill,2018-03-13T01:07:02Z
2,24466870,76567547,zzkkui,vue-quill-editor,2018-03-13T01:07:02Z
3,24466870,105479936,zzkkui,react-email-editor,2018-03-13T01:07:02Z
4,24466870,16179237,zzkkui,virtual-dom,2018-03-13T01:07:02Z


In [3]:
user_file = pd.read_csv('data/user.csv', index_col=None, sep=',') \
    .drop_duplicates()
user_file.columns = ['uid', 'repoid', 'uname', 'reponame', 'date']
user_file = user_file.drop_duplicates()
user_file.head()

Unnamed: 0,uid,repoid,uname,reponame,date
0,6669386,1181927,daimingzhong,bitcoin,Wed Mar 14 2018 23:06:44 GMT-0400 (EDT)
1,6669386,113401553,daimingzhong,music_back,Wed Mar 14 2018 23:06:44 GMT-0400 (EDT)
2,6669386,21942759,daimingzhong,r4intellij,Wed Mar 14 2018 23:06:44 GMT-0400 (EDT)
3,6669386,73776538,daimingzhong,R-Data-Structures-and-Algorithms,Wed Mar 14 2018 23:06:44 GMT-0400 (EDT)
4,6669386,3749321,daimingzhong,Unblock-Youku,Wed Mar 14 2018 23:06:44 GMT-0400 (EDT)


In [4]:
# take only uid / repoid
train_file = train_file[['uid', 'repoid']]
train_file.head()

Unnamed: 0,uid,repoid
0,24466870,59996401
1,24466870,5239185
2,24466870,76567547
3,24466870,105479936
4,24466870,16179237


In [5]:
# same thing for user file
user_file = user_file[['uid', 'repoid']]
user_file.head()

Unnamed: 0,uid,repoid
0,6669386,1181927
1,6669386,113401553
2,6669386,21942759
3,6669386,73776538
4,6669386,3749321


In [6]:
import findspark
findspark.init()

from pyspark import SparkContext
from pyspark.sql import SQLContext

from pyspark.mllib.recommendation import ALS

sc = SparkContext(appName="RR")
sqlContext = SQLContext(sc)

In [7]:
def first_two_column(row):
    return (int(row[0]), int(row[1]), 1.0)

In [8]:
training_rdd = sqlContext.createDataFrame(train_file).rdd
training_rdd = training_rdd \
    .map(first_two_column)

In [9]:
training_rdd.take(5)

[(24466870, 59996401, 1.0),
 (24466870, 5239185, 1.0),
 (24466870, 76567547, 1.0),
 (24466870, 105479936, 1.0),
 (24466870, 16179237, 1.0)]

In [10]:
# this takes a short while too
model = ALS.trainImplicit( \
    training_rdd,
    rank=16,
    iterations=10,
    lambda_=0.1,
    alpha=80.0
)

In [11]:
user_rdd = sqlContext.createDataFrame(user_file).rdd
user_rdd.take(5)

[Row(uid=6669386, repoid=1181927),
 Row(uid=6669386, repoid=113401553),
 Row(uid=6669386, repoid=21942759),
 Row(uid=6669386, repoid=73776538),
 Row(uid=6669386, repoid=3749321)]

In [12]:
# append user to training set and train again
training_rdd = training_rdd.union(user_rdd.map(first_two_column))
# this takes a short while too
model = ALS.trainImplicit( \
    training_rdd,
    rank=16,
    iterations=10,
    lambda_=0.1,
    alpha=80.0
)

In [13]:
# prediction input is (to_be_predict_uid, all_repo_id)
user_id = user_rdd.take(1)[0].uid
predict_input_rdd = training_rdd \
    .map(lambda row: (user_id, row[1])) \
    .distinct()
predict_input_rdd.take(5)

[(6669386, 105479936),
 (6669386, 67274736),
 (6669386, 48804792),
 (6669386, 15637960),
 (6669386, 9595928)]

In [14]:
predictions = model.predictAll(predict_input_rdd).sortBy(lambda row: -row.rating)

In [15]:
# remove repos user already starred
# this is top 10 recommendations for user
predictions.map(lambda row: (row.product, row)) \
    .subtractByKey(user_rdd.map(lambda row: (row[1], 0))) \
    .map(lambda row: row[0]) \
    .take(10)


[102523304,
 67186968,
 75830968,
 52677592,
 21872392,
 83844720,
 724712,
 63484632,
 45457072,
 11981144]