In [80]:
from data_formatting import *
from surprise import *
import pandas as pd
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import *
from surprise.prediction_algorithms import *
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score

In [19]:
PATH_CLEAN = 'csv/data_clean.csv'
PATH_SAMPLE = 'csv/sampleSubmission.csv'
PATH_SUBMISSION = 'csv/submission.csv'

In [20]:
df = pd.read_csv(PATH_CLEAN)
df.head(3)

Unnamed: 0,User,Item,Rating
0,44,1,4
1,61,1,3
2,67,1,4


In [22]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['User', 'Item', 'Rating']], reader)

In [23]:
trainset, testset = train_test_split(data, test_size=0.1, random_state=2018)

### Stage 1 learning

In [146]:
algos = [NormalPredictor(),
        BaselineOnly(bsl_options={'reg_i': 10, 'reg_u': 15, 'n_epochs': 10}, verbose=False),
        KNNBasic(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
        KNNWithMeans(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
        KNNWithZScore(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
        KNNBaseline(k=40, min_k=1, sim_options={'user_based': True}, verbose=False),
        SVD(n_epochs=20,n_factors=20, biased=True, lr_all=0.002, reg_all=0.02, random_state=2018),
        NMF(n_factors=15, n_epochs=50, biased=False, reg_pu=0.06, reg_qi=0.06, reg_bu=0.02, reg_bi=0.02, lr_bu=0.005, lr_bi=0.005, init_low=0, init_high=1, random_state=2018),
        SlopeOne(),
        CoClustering(n_cltr_u=3, n_cltr_i=3, n_epochs=20, random_state=2018)]

for i in range(len(algos)):
    algos[i].fit(trainset)
    print('Algo ' + repr(i) + ' learnt!')

Algo 0 learnt!
Algo 1 learnt!
Algo 2 learnt!
Algo 3 learnt!
Algo 4 learnt!
Algo 5 learnt!
Algo 6 learnt!
Algo 7 learnt!
Algo 8 learnt!
Algo 9 learnt!


In [148]:
estimations = [[prediction.est for prediction in algo.test(testset)] for algo in algos]

In [149]:
estimation_series = [pd.Series(estimation) for estimation in estimations]

In [150]:
df_test = pd.DataFrame(testset, columns=['User', 'Item', 'Rating'])
feature_cols = []
for i in range(len(algos)):
    column_name = 'Algo' + repr(i)
    feature_cols.append(column_name)
    df_test[column_name] = estimation_series[i]
               
X = df_test[feature_cols]
y = df_test['Rating']

In [151]:
logistic = LinearRegression()  # create the model
logistic.fit(X, y)  # train it

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [152]:
logistic.coef_

array([-2.18789943e-04, -4.86612655e-01,  1.97393786e-01, -8.49066424e-01,
        6.30090009e-01,  2.68895329e-01,  1.72988617e-01,  6.06363682e-02,
        9.26079668e-01,  3.67577867e-02])

In [153]:
df_test.head(3)

Unnamed: 0,User,Item,Rating,Algo0,Algo1,Algo2,Algo3,Algo4,Algo5,Algo6,Algo7,Algo8,Algo9
0,1474,785,5.0,2.840598,4.479789,4.237436,4.391744,4.351547,4.383548,4.431765,4.667434,4.565653,4.444841
1,8103,249,5.0,3.273476,4.267553,4.274363,4.077115,4.077271,4.352364,4.191837,4.25795,4.335988,4.23571
2,9961,645,5.0,5.0,4.456925,4.347954,4.138787,4.204443,4.438642,4.421002,4.463796,4.489497,4.347694


In [160]:
def estimate(algos, users, items):
    return [[algo.predict(x[0], x[1]).est for algo in algos] for x in zip(users, items)]

In [155]:
def predict(logistic, algos, users, items):
    estimations = estimate(algos, users, items)
    print("estimations done")
    return logistic.predict(estimations)

In [161]:
def predict(logistic, algos, users, items):
    print('Creating array filled with zeros')
    estimations = [][]
    for i in range(len(algos)):
        print('Estimating using algo ' + repr(i))
        for j in range(len(items)):
            estimations[i][j] = algo.predict(x[0], x[1]).est
    
    return logistic.predict(estimations)

SyntaxError: invalid syntax (<ipython-input-161-f6a59c508f5b>, line 3)

In [156]:
def predictions_from_path(logistic, algos, path_sample):
    users, items, _ = read_original_csv(path_sample)
    return predict(logistic, algos, users, items)

In [157]:
predictions = predictions_from_path(logistic, algos, PATH_SAMPLE)

In [158]:
def submit(predictions, path_sample, path_submission):
    ids = read_ids_ratings(path_sample)[0]
    create_csv_submission(ids, predictions, path_submission)

In [159]:
submit(predictions, PATH_SAMPLE, PATH_SUBMISSION)

Old method for estimation using a single algo

In [84]:
def predictions_from_path_simple(algo, path_sample):
    users, items, _ = read_original_csv(path_sample)
    predictions = np.zeros(len(items))
    for i in range(len(predictions)):
        predictions[i] = algo.predict(users[i], items[i], None, True, False).est
    return predictions

In [19]:
def predict_and_submit(algo, path_sample, path_submission):
    ids = read_ids_ratings(path_sample)[0]
    create_csv_submission(ids, predict(algo, path_sample), path_submission)

In [23]:
predict_and_submit(algo, PATH_SAMPLE, PATH_SUBMISSION)