# In [None]:
import pandas as pd

from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNWithZScore
from surprise import KNNBaseline
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from sklearn.metrics import r2_score

def percentCorrect(prediction, ground_truth, bound):
    """Return the fraction of predictions that lie within ``bound`` of the truth.

    The prediction at position ``i`` counts as correct when it falls inside
    the closed interval
    ``[ground_truth[i] - bound, ground_truth[i] + bound]``.

    Args:
        prediction: Sized iterable of predicted values.
        ground_truth: Container indexable by ``0 .. len(prediction) - 1``
            holding the corresponding true values.
        bound: Maximum allowed deviation (inclusive on both sides).

    Returns:
        The share of in-bound predictions (``count / len(prediction)``),
        or ``0.0`` when ``prediction`` is empty.
    """
    total = len(prediction)
    if total == 0:
        # Nothing to score; returning 0.0 avoids a ZeroDivisionError.
        return 0.0
    in_bound = sum(
        1
        for index, x in enumerate(prediction)
        if ground_truth[index] - bound <= x <= ground_truth[index] + bound
    )
    return in_bound / total

# Load the user/sentence action log and shuffle it: sampling with frac=1
# returns every row in random order, and the index is then renumbered.
rec_df = (
    pd.read_csv('sentences-user-actions.csv')
    .sample(frac=1)
    .reset_index(drop=True)
)

rec_df.head()

# A Reader is still needed for DataFrame input, but only its rating_scale
# parameter is required.
reader = Reader(rating_scale=(0, 1))

# The selected columns must be user id, item id and rating, in that order.
data = Dataset.load_from_df(
    df=rec_df[['user_id', 'sentence_id', 'relative_difficulty']],
    reader=reader,
)

# Algorithms to benchmark, each instantiated with its default parameters.
# Deliberately left out:
#   NMF          - predicts 0 most of the time
#   CoClustering - predicts all 0, the RMSE is 0.0252
algos = [
    algo_class()
    for algo_class in (
        SVD,
        SVDpp,
        KNNBasic,
        KNNWithMeans,
        KNNWithZScore,
        KNNBaseline,
        NormalPredictor,
        BaselineOnly,
        SlopeOne,
    )
]

# Split once so that every algorithm is trained and scored on the same
# train/test partition; drawing a fresh random split per algorithm would make
# the printed metrics incomparable.
trainset, testset = train_test_split(data, test_size=.25)

# Map (user_id, sentence_id) -> (unique_word_count, unknown_count) once, so
# each prediction is resolved with a dict lookup instead of a full scan of
# rec_df. setdefault keeps the first row seen for a duplicated pair, which is
# the row the earlier boolean-mask + ``.iloc[0]`` lookup selected.
word_counts = {}
for uid, iid, unique_words, unknown_words in zip(
        rec_df['user_id'], rec_df['sentence_id'],
        rec_df['unique_word_count'], rec_df['unknown_count']):
    word_counts.setdefault((uid, iid), (unique_words, unknown_words))

for algo in algos:
    # Label the output so each metric can be attributed to its algorithm.
    print(type(algo).__name__)
    algo.fit(trainset)
    predictions = algo.test(testset)
    # Both accuracy helpers print the metric they compute.
    accuracy.rmse(predictions)
    accuracy.fcp(predictions)
    truth = []
    preds = []
    truth_count = []
    preds_count = []
    for pred in predictions:
        unique_words, unknown_words = word_counts[(pred.uid, pred.iid)]
        # Convert the estimated rating into a word count by scaling with the
        # sentence's unique word count.
        # NOTE(review): presumably relative_difficulty is
        # unknown_count / unique_word_count - confirm against the data source.
        preds_count.append(unique_words * pred.est)
        truth_count.append(unknown_words)
        truth.append(pred.r_ui)
        preds.append(pred.est)
    # r2_score expects (y_true, y_pred); R^2 is not symmetric in its arguments.
    r2 = r2_score(truth, preds)
    # Share of predicted counts within +/- 1 of the true unknown-word count.
    interval = percentCorrect(preds_count, truth_count, 1)
    print('percent interval: ' + str(interval))
#     print('coefficient of determination: ' + str(r2))