In [1]:
import numpy as np
import pandas as pd
from surprise import Reader
from surprise import Dataset
from surprise import SVD
from surprise.model_selection import GridSearchCV
from surprise import BaselineOnly
from surprise import KNNBaseline
from sklearn.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score
from surprise import SlopeOne
import random

In [2]:
def load_data():
    data_df = pd.read_csv("./data/de_data_train.csv")
    data_df['user'] = data_df['Id'].str.split('_').str[0].apply(lambda x: int(x[1:]))
    data_df['item'] = data_df['Id'].str.split('_').str[1].apply(lambda x: int(x[1:]))
    data_df = data_df.rename(columns={'Prediction':'rating'})
    data_df = data_df[['user','item','rating']]
    return data_df
def create_submission(model):
    print("Predicting...")
    out = open("submission.csv","w")
    out.write('Id,Prediction\n')
    with open('./data/submission_rows') as samples:
        for i, sample in enumerate(samples):
            if i == 0:
                continue
            if i == 10000:
                print(i)
            tmp = sample.split('_')
            item = int(tmp[0][1:].strip())
            user = int(tmp[1][1:].strip())
            p = model.predict(item, user)
            p_string = "r{}_c{},{}\n".format(item, user, p)
            out.write(p_string)        
    out.close()
def get_submission_rows():
    users = []
    items = []
    with open('./data/submission_rows') as samples:
        for i, sample in enumerate(samples):
            if i == 0:
                continue
            tmp = sample.split('_')
            item = int(tmp[0][1:].strip())
            user = int(tmp[1][1:].strip())
            users.append(user)
            items.append(item)
    return users, items

In [3]:
data = load_data()
print(data.head())

   user  item  rating
0    44     1       4
1    61     1       3
2    67     1       4
3    72     1       3
4    86     1       5


In [6]:
def grid_search(data, param_grid, algo, folds):
    gs = GridSearchCV(algo, 
                  param_grid, 
                  measures=['rmse', 'mae'], 
                  return_train_measures=True,
                  cv=folds, 
                  joblib_verbose=1)
    gs.fit(data)
    return gs

## SVD SGD

In [19]:
param_grid = {'n_epochs': [30],# 10, 15], 
              'n_factors': [10, 20, 30, 40],# 10, 15, 20, 25, 30], 
              'reg_all': [0.1, 0.2],# 0.01, 0.001], 
              'lr_all': [0.01]}#, 0.01, 0.001]}
reader = Reader(rating_scale=(1, 5))
surprise_data = Dataset.load_from_df(data, reader=reader)
gs = grid_search(data=surprise_data, param_grid=param_grid, algo=SVD,folds=3)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  24 out of  24 | elapsed: 24.9min finished


In [20]:
pd.DataFrame.from_dict(gs.cv_results)

Unnamed: 0,split0_test_rmse,split0_train_rmse,split1_test_rmse,split1_train_rmse,split2_test_rmse,split2_train_rmse,mean_test_rmse,std_test_rmse,mean_train_rmse,std_train_rmse,...,rank_test_mae,mean_fit_time,std_fit_time,mean_test_time,std_test_time,params,param_n_epochs,param_n_factors,param_reg_all,param_lr_all
0,1.004753,0.982623,1.004574,0.981093,1.004562,0.98084,1.00463,8.7e-05,0.981519,0.000788,...,4,29.696937,0.280583,5.099243,0.500935,"{'n_epochs': 30, 'n_factors': 10, 'reg_all': 0...",30,10,0.1,0.01
1,1.013797,1.003183,1.013966,1.001992,1.014345,1.002397,1.014036,0.000229,1.002524,0.000494,...,8,29.376848,0.950957,5.155132,0.714383,"{'n_epochs': 30, 'n_factors': 10, 'reg_all': 0...",30,10,0.2,0.01
2,1.002719,0.974249,1.002611,0.971679,1.004633,0.975813,1.003321,0.000929,0.973914,0.001704,...,3,34.636455,0.762781,5.358722,0.613559,"{'n_epochs': 30, 'n_factors': 20, 'reg_all': 0...",30,20,0.1,0.01
3,1.013796,1.00317,1.013965,1.001979,1.014345,1.002386,1.014035,0.00023,1.002511,0.000494,...,7,39.812804,1.49278,5.867776,2.000901,"{'n_epochs': 30, 'n_factors': 20, 'reg_all': 0...",30,20,0.2,0.01
4,1.001658,0.967305,1.001722,0.965635,1.002474,0.967466,1.001951,0.000371,0.966802,0.000828,...,2,45.022645,2.179515,5.9712,0.501501,"{'n_epochs': 30, 'n_factors': 30, 'reg_all': 0...",30,30,0.1,0.01
5,1.013795,1.003158,1.013965,1.001966,1.014344,1.002372,1.014034,0.000229,1.002499,0.000495,...,6,49.393922,2.666508,6.304444,0.635659,"{'n_epochs': 30, 'n_factors': 30, 'reg_all': 0...",30,30,0.2,0.01
6,1.001015,0.961391,1.001477,0.960546,1.001364,0.960584,1.001285,0.000196,0.96084,0.00039,...,1,48.691504,1.955575,4.526827,0.200685,"{'n_epochs': 30, 'n_factors': 40, 'reg_all': 0...",30,40,0.1,0.01
7,1.013794,1.003144,1.013963,1.001955,1.014343,1.00236,1.014033,0.000229,1.002486,0.000494,...,5,63.067683,4.793341,7.632247,2.932391,"{'n_epochs': 30, 'n_factors': 40, 'reg_all': 0...",30,40,0.2,0.01


In [17]:
gs.best_score['rmse']

1.0033069244262036

In [21]:
pd.DataFrame.from_dict(gs.cv_results).to_csv('svd_train_v4.csv')

## SVD ALS

In [5]:
from pyspark.mllib.recommendation import Rating
from pyspark.ml.recommendation import ALS
from pyspark.mllib.recommendation import MatrixFactorizationModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.types import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

In [6]:
schema = StructType([ StructField("user", IntegerType(), True)\
                       ,StructField("item", IntegerType(), True)\
                       ,StructField("rating", IntegerType(), True)])
ratings = spark.createDataFrame(data,schema=schema)
sc.setCheckpointDir('checkpoint/')
ratings.printSchema()

root
 |-- user: integer (nullable = true)
 |-- item: integer (nullable = true)
 |-- rating: integer (nullable = true)



In [51]:
class ALS_Classifier: 
    def __init__(self):
        self.clf = ALS(rank=8, 
              maxIter=20, 
              regParam=0.06,
              userCol="user", 
              itemCol="item", 
              ratingCol="rating",
              checkpointInterval=10, 
              seed=100,
              intermediateStorageLevel="MEMORY_AND_DISK", 
              finalStorageLevel="MEMORY_AND_DISK")
        self.evaluator = RegressionEvaluator(metricName="rmse", labelCol="rating", predictionCol="prediction")

    def crossValidate(self, data):
        schema = StructType([ StructField("user", IntegerType(), True)\
                       ,StructField("item", IntegerType(), True)\
                       ,StructField("rating", IntegerType(), True)])
        ratings = spark.createDataFrame(data, schema=schema)
        self.model = self.clf.fit(ratings)
    def predict(self, test):
        schema = StructType([ StructField("user", IntegerType(), True)\
                       ,StructField("item", IntegerType(), True)\
                       ,StructField("rating", IntegerType(), True)])
        ratings = spark.createDataFrame(test,schema=schema)
        p = self.model.transform(ratings).toPandas()
        pred = p.rename(columns={'user': 'uid', 'item': 'iid', 'rating': 'r_ui', 'prediction': 'est'})
        return pred
    def crossValidate_normal(self, data):
        
        #pipeline = Pipeline(stages=[self.clf])

        #paramGrid = ParamGridBuilder() \
        #    .addGrid(self.clf.regParam, [0.06]) \
        #    .addGrid(self.clf.rank, [8]) \
        #     .addGrid(self.clf.maxIter, [50]) \
        #     .build()
        
        #crossval = CrossValidator(estimator=pipeline,
        #                  estimatorParamMaps=paramGrid,
        #                  evaluator=self.evaluator,
        #                  numFolds=5)
        
        #schema = StructType([ StructField("user", IntegerType(), True)\
        #               ,StructField("item", IntegerType(), True)\
        #               ,StructField("rating", IntegerType(), True)])
        #ratings = spark.createDataFrame(data, schema=schema)
        self.model = self.clf.fit(data, seed=10)
    def predict_normal(self, test):
        p = self.model.transform(test)
        return p
    def rmse(self, predictions):
        return self.evaluator.evaluate(predictions)
    def predict_value(self, user, item):
        return 

In [52]:
(train, test) = ratings.randomSplit([0.8, 0.2])
als_classifier = ALS_Classifier()
als_classifier.crossValidate_normal(train)

TypeError: fit() got an unexpected keyword argument 'seed'

In [50]:
p = als_classifier.predict_normal(test)
als_classifier.rmse(p) # 0.9921....65290783296

KeyboardInterrupt: 

In [50]:
pred_sub = p.toPandas()['prediction'].head()

In [63]:
test.head()

Row(user=1, item=84, rating=4)

In [73]:
als_classifier.predict_value(0, 0)

AttributeError: 'ALSModel' object has no attribute 'predict'

In [7]:
from pyspark.mllib.recommendation import ALS
rank = 8
numIterations = 30
regParam = 0.06
model = ALS.train(ratings, rank, numIterations, regParam, seed=0)

In [8]:
ratings.show()

+----+----+------+
|user|item|rating|
+----+----+------+
|  44|   1|     4|
|  61|   1|     3|
|  67|   1|     4|
|  72|   1|     3|
|  86|   1|     5|
|  90|   1|     4|
| 108|   1|     3|
| 114|   1|     3|
| 120|   1|     2|
| 135|   1|     5|
| 152|   1|     4|
| 165|   1|     3|
| 182|   1|     3|
| 310|   1|     3|
| 318|   1|     1|
| 333|   1|     3|
| 355|   1|     2|
| 390|   1|     4|
| 401|   1|     4|
| 410|   1|     2|
+----+----+------+
only showing top 20 rows



In [9]:
users, items = get_submission_rows()

In [10]:
d = dict({'user': items, 'item': users })
panda = pd.DataFrame.from_dict(d)
print(panda.shape)


(1176952, 2)


In [11]:
schema = StructType([ StructField("user", IntegerType(), True)\
                       ,StructField("item", IntegerType(), True)])
final = spark.createDataFrame(panda, schema=schema)
final.count()

1176952

In [12]:
pred_df = model.predictAll(final.rdd.map(lambda x: (x[0], x[1])))\
    .map(lambda r: ((r[0], r[1]), r[2]))\
    .toDF()\
    .toPandas()

In [13]:
pred_df.shape

(1176952, 2)

In [14]:
pred_df.shape

(1176952, 2)

In [15]:
# Post processing database
pred_df['User'] = pred_df['_1'].apply(lambda x: x['_1'])
pred_df['Movie'] = pred_df['_1'].apply(lambda x: x['_2'])
pred_df['Rating'] = pred_df['_2']
pred_df = pred_df.drop(['_1', '_2'], axis=1)
pred_df = pred_df.sort_values(by=['Movie', 'User'])
pred_df.index = range(len(pred_df))

In [16]:
pred_df.tail()

Unnamed: 0,User,Movie,Rating
1176947,9974,1000,3.152257
1176948,9977,1000,3.45579
1176949,9978,1000,2.89307
1176950,9982,1000,3.322231
1176951,9996,1000,3.768126


In [22]:
out = open("submission.csv","w")
out.write('Id,Prediction\n')
for p in pred_df.iterrows():
    rating = int(np.rint(p[1]['Rating']))
    rating = 5 if rating == 6 else rating
    rating = 1 if rating == 0 else rating
    p_string = "r{}_c{},{}\n".format(int(p[1]['User']), int(p[1]['Movie']), rating)
    out.write(p_string)        
out.close()

In [28]:
create_submission(model)

Predicting...


KeyboardInterrupt: 

## BASELINE

In [36]:
class Baseline_Classifier: 
    def __init__(self):
        self.clf = BaselineOnly()
    def crossValidate(self, train):
        reader = Reader(rating_scale=(1, 5))
        train = Dataset.load_from_df(train, reader=reader)
        train = train.build_full_trainset()
        self.clf.fit(train)
    def predict(self, test):
        reader = Reader(rating_scale=(1, 5))
        test = Dataset.load_from_df(test, reader=reader)
        test = test.build_full_trainset().build_testset()
        test_pred = self.clf.test(test)
        return pd.DataFrame(test_pred)
    def rmse(self, predictions):
        return accuracy.rmse(predictions, verbose=True)
baseline_classifier = Baseline_Classifier()

In [37]:
class KNN_Classifier: 
    def __init__(self):
        self.clf = KNNBaseline(k=20, sim_options={'name': 'pearson_baseline', 'user_based': False})
    def crossValidate(self, train):
        reader = Reader(rating_scale=(1, 5))
        train = Dataset.load_from_df(train, reader=reader)
        train = train.build_full_trainset()
        self.clf.fit(train)
    def predict(self, test):
        reader = Reader(rating_scale=(1, 5))
        test = Dataset.load_from_df(test, reader=reader)
        test = test.build_full_trainset().build_testset()
        test_pred = self.clf.test(test)
        return pd.DataFrame(test_pred)
    def rmse(self, predictions):
        return accuracy.rmse(predictions, verbose=True)
knn_classifier = KNN_Classifier()

In [38]:
class SlopeOne_Classifier: 
    def __init__(self):
        self.clf = SlopeOne()
    def crossValidate(self, train):
        reader = Reader(rating_scale=(1, 5))
        train = Dataset.load_from_df(train, reader=reader)
        train = train.build_full_trainset()
        self.clf.fit(train)
    def predict(self, test):
        reader = Reader(rating_scale=(1, 5))
        test = Dataset.load_from_df(test, reader=reader)
        test = test.build_full_trainset().build_testset()
        test_pred = self.clf.test(test)
        return pd.DataFrame(test_pred)
    def rmse(self, predictions):
        return accuracy.rmse(predictions, verbose=True)
slope_classifier = SlopeOne_Classifier()

## Blending

In [83]:
stage_2_train.head()
stage_2_test.head()

Unnamed: 0,item,user,est,est.1,est.2,r_ui
0,27,7375,3.5385,4.188634,3.908736,5.0
1,472,1494,4.038831,3.686864,3.895126,3.0
2,168,2568,4.297392,3.45043,4.656998,3.0
3,364,1308,4.381731,4.324887,4.124789,4.0
4,889,2782,3.896324,2.955418,3.874351,3.0


In [56]:
def blend(models, data):
    tmp, test = train_test_split(data, test_size=0.1)
    train, val = train_test_split(tmp, test_size=0.12)
    stage_2_train = pd.DataFrame()
    stage_2_y = pd.DataFrame()
    stage_2_test = pd.DataFrame()
    stage_2_z = pd.DataFrame()

    for i, model in enumerate(models):
        print("Starting training model...")
        model.crossValidate(train)
        
        val_pred = model.predict(val)
        #print(val_pred)
        val_pred = val_pred.sort_values(['uid','iid'],ascending=[True, True])
        #val_pred = val_pred.rename(columns={'est': 'est' + str(i)})
        stage_2_train = pd.concat([stage_2_train, val_pred['est']], axis=1)
        stage_2_y = pd.concat([stage_2_y, val_pred['r_ui']], axis=1)
        
        test_pred = model.predict(test)
        test_pred = test_pred.sort_values(['uid','iid'],ascending=[True, True])
        #test_pred = test_pred.rename(columns={'est': 'est' + str(i)})
        stage_2_test = pd.concat([stage_2_test, test_pred['est']], axis=1)
        stage_2_z = pd.concat([stage_2_z, test_pred['r_ui']], axis=1)
        
    #print(np.sum(stage_2_y.iloc[:, 0] - stage_2_y.iloc[:, 1]))
    
    #print(np.sum(stage_2_z.iloc[:, 0] - stage_2_z.iloc[:, 1]))


    stage_2_train['r_ui'] = stage_2_y.iloc[:,0]
    stage_2_train = stage_2_train.sample(frac=1)
    stage_2_test['r_ui'] = stage_2_z.iloc[:,0]
    stage_2_test = stage_2_test.sample(frac=1)
    
    print(val.shape)
    print(stage_2_train.shape)
    print(pd.concat([val, stage_2_train], axis=1).shape)
    
    stage_2_train.reset_index(drop=True, inplace=True)
    val.reset_index(drop=True, inplace=True)
    
    stage_2_test.reset_index(drop=True, inplace=True)
    test.reset_index(drop=True, inplace=True)
    return pd.concat([val[['item', 'user']], stage_2_train], axis=1), pd.concat([test[['item', 'user']], stage_2_test], axis=1)
 
#blend([knn_classifier, baseline_classifier, als_classifier, slope_classifier], data)
als_classifier = ALS_Classifier()

stage_2_train, stage_2_test = blend([als_classifier, knn_classifier], data)

Starting training model...
Starting training model...
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
(127111, 3)
(127111, 3)
(240561, 6)


In [59]:
clf = Ridge()
clf.fit(stage_2_train.drop(columns={'r_ui'}), stage_2_train['r_ui'])
pred = clf.predict(stage_2_test.drop(columns={'r_ui'}))
np.sqrt(mean_squared_error(pred, stage_2_test['r_ui']))

0.9843103617228288

In [21]:
def train_and_predict(algo, train, val, test):
    algo.fit(train)
    val_pred = algo.test(val.build_testset())
    test_pred = algo.test(test.build_testset())
    return pd.Series(list(map(lambda x: x.est, val_pred))), pd.Series(list(map(lambda x: x.est, test_pred)))


models = [#SVD(), 
          #KNNWithMeans(k=20, sim_options={'name': 'pearson_baseline', 'user_based': False}), 
          #SlopeOne(),
          #NMF(),
          #CoClustering(),
          #BaselineOnly(),
          baseline_classifier
         ]
tmp, test = train_test_split(data, test_size=0.1)
print(tmp.head())
train, val = train_test_split(tmp, test_size=0.12)
X_train = pd.DataFrame()
X_test = pd.DataFrame()
for model in models:
    model.crossValidate(train)
    val_pred = model.predict(val)
    test_pred = model.predict(test)
    X_train = pd.concat([X_train, val_pred], axis=1)
    X_test = pd.concat([X_test, test_pred], axis=1)
    
    clf = Ridge()
    clf.fit(X_train, val['rating'])
    pred = clf.predict(X_test)
    print(np.sqrt(mean_squared_error(pred, test['rating'])))

    
"""    
y_train = []
for r in val_surprise.all_ratings():
    y_train.append(r[2])
clf = Ridge()
clf.fit(X_train, y_train)

y_test = []
for r in test_surprise.all_ratings():
    y_test.append(r[2])
pred = clf.predict(X_test)

print(pred)

np.sqrt(mean_squared_error(pred, y_test))

"""

         user  item  rating
961305   5661   768       4
14745    9445     8       2
902735   8336   705       3
259053   8377   214       5
1035924  2611   820       4
Estimating biases using als...
1.119333014551237


'    \ny_train = []\nfor r in val_surprise.all_ratings():\n    y_train.append(r[2])\nclf = Ridge()\nclf.fit(X_train, y_train)\n\ny_test = []\nfor r in test_surprise.all_ratings():\n    y_test.append(r[2])\npred = clf.predict(X_test)\n\nprint(pred)\n\nnp.sqrt(mean_squared_error(pred, y_test))\n\n'