# Model Training

In [1]:
import pandas as pd
from surprise import KNNWithMeans
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
import operator
from surprise import dump

In [2]:
df = pd.read_csv("dataset.csv",index_col="Unnamed: 0",dtype={'userId': int, 'movieId': int})
df = df.drop(["timestamp"],axis=1)
df

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0
...,...,...,...
100831,610,166534,4.0
100832,610,168248,5.0
100833,610,168250,5.0
100834,610,168252,5.0


## Load data into suprise module supported

In [3]:
reader = Reader( rating_scale=(0.5, 5))
data = Dataset.load_from_df(df, reader)

In [54]:
from surprise import KNNWithMeans

# To use item-based cosine similarity
sim_options = {
    "name": "cosine",
    "user_based": False,  # Compute  similarities between items
}
algo = KNNWithMeans(sim_options=sim_options)

In [58]:
trainset, testset  = train_test_split(data,test_size=0.20)
algo.fit(trainset)

predictions = algo.test(testset)
accuracy.rmse(predictions)



Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9127


0.9127064986759298

In [87]:
#Save algor
dump.dump('./app/weight/recommend_algor', predictions, algo)

In [61]:
for uid, iid, rating in testset[:5]:
    print(f"User {uid} rated item {iid} with a rating of {rating}")

User 555 rated item 633 with a rating of 5.0
User 89 rated item 46578 with a rating of 4.0
User 230 rated item 55553 with a rating of 1.0
User 233 rated item 2324 with a rating of 4.0
User 599 rated item 2846 with a rating of 3.5


In [62]:
for prediction in predictions[0:5]:
    print(prediction)

user: 555        item: 633        r_ui = 5.00   est = 3.50   {'was_impossible': True, 'reason': 'User and/or item is unknown.'}
user: 89         item: 46578      r_ui = 4.00   est = 3.83   {'actual_k': 40, 'was_impossible': False}
user: 230        item: 55553      r_ui = 1.00   est = 2.11   {'actual_k': 40, 'was_impossible': False}
user: 233        item: 2324       r_ui = 4.00   est = 3.72   {'actual_k': 40, 'was_impossible': False}
user: 599        item: 2846       r_ui = 3.50   est = 3.59   {'actual_k': 40, 'was_impossible': False}


In [63]:
def get_Iu(uid):
    """ return the number of items rated by given user
    args: 
      uid: the id of the user
    returns: 
      the number of items rated by the user
    """
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError: # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """ return number of users that have rated given item
    args:
      iid: the raw id of the item
    returns:
      the number of users that have rated the item.
    """
    try: 
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:
        return 0
    
predict_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
predict_df['Iu'] = predict_df.uid.apply(get_Iu)
predict_df['Ui'] = predict_df.iid.apply(get_Ui)
predict_df['err'] = abs(predict_df.est - predict_df.rui)
best_predictions = predict_df.sort_values(by='err')[:10]
worst_predictions = predict_df.sort_values(by='err')[-10:]

In [64]:
best_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
1141,93,2028,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",81,144,0.0
15657,171,318,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",64,243,0.0
16888,1,3578,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",180,121,0.0
9419,74,6270,5.0,5.0,"{'actual_k': 3, 'was_impossible': False}",143,1,0.0
4938,371,1199,5.0,5.0,"{'actual_k': 31, 'was_impossible': False}",33,41,0.0
13679,586,4993,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",171,148,0.0
10122,122,3089,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",225,12,0.0
6915,1,1198,5.0,5.0,"{'actual_k': 40, 'was_impossible': False}",180,155,0.0
8154,53,916,5.0,5.0,"{'actual_k': 16, 'was_impossible': False}",16,21,0.0
14926,435,318,5.0,5.0,"{'actual_k': 35, 'was_impossible': False}",35,243,0.0


In [65]:
worst_predictions

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
9439,380,49932,1.0,5.0,"{'actual_k': 40, 'was_impossible': False}",985,1,4.0
13560,509,84847,4.5,0.5,"{'actual_k': 40, 'was_impossible': False}",364,1,4.0
2847,260,7564,4.5,0.5,"{'actual_k': 1, 'was_impossible': False}",121,1,4.0
9663,603,3223,5.0,0.896837,"{'actual_k': 40, 'was_impossible': False}",780,1,4.103163
12896,256,7099,0.5,4.784838,"{'actual_k': 40, 'was_impossible': False}",137,17,4.284838
8074,488,4703,5.0,0.694399,"{'actual_k': 28, 'was_impossible': False}",85,1,4.305601
14460,594,7982,0.5,5.0,"{'actual_k': 40, 'was_impossible': False}",182,3,4.5
6854,594,4794,0.5,5.0,"{'actual_k': 40, 'was_impossible': False}",182,2,4.5
9073,482,2068,0.5,5.0,"{'actual_k': 14, 'was_impossible': False}",104,1,4.5
6870,441,527,0.5,5.0,"{'actual_k': 28, 'was_impossible': False}",29,177,4.5


# Test prediction and show additional results

In [66]:
movie=pd.read_csv('./movielens-small/movies.csv',dtype={'userId': int, 'movieId': int})
mapping = dict(zip(movie['movieId'],movie['title']))


In [67]:
users=list(set(best_predictions["uid"]))
print(users[0:5])

[1, 74, 171, 586, 371]


In [68]:

# items which the user not yet evaluate
items = testset

for user in users[0:5]:
    user_items = list(filter(lambda x: x[0] == user, items))
    print()
    print(user,len(user_items))
    # generate recommendation

    recommendations = algo.test(user_items)

    recommendations.sort(key=operator.itemgetter(3), reverse=True)
    print(f"User {user} recommendations:")
    for r in recommendations[0:5]:
        print(f"[iid] {r[1]} [Title] {mapping[r[1]]}, [Estimated Rating] {round(r[3],3)}")




1 52
User 1 recommendations:
[iid] 1196 [Title] Star Wars: Episode V - The Empire Strikes Back (1980), [Estimated Rating] 5
[iid] 593 [Title] Silence of the Lambs, The (1991), [Estimated Rating] 5
[iid] 296 [Title] Pulp Fiction (1994), [Estimated Rating] 5
[iid] 1198 [Title] Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981), [Estimated Rating] 5
[iid] 2028 [Title] Saving Private Ryan (1998), [Estimated Rating] 5

74 34
User 74 recommendations:
[iid] 6270 [Title] Akira Kurosawa's Dreams (Dreams) (1990), [Estimated Rating] 5
[iid] 858 [Title] Godfather, The (1972), [Estimated Rating] 4.534
[iid] 3019 [Title] Drugstore Cowboy (1989), [Estimated Rating] 4.526
[iid] 7234 [Title] Strada, La (1954), [Estimated Rating] 4.519
[iid] 48698 [Title] Deliver Us from Evil (2006), [Estimated Rating] 4.489

171 18
User 171 recommendations:
[iid] 1208 [Title] Apocalypse Now (1979), [Estimated Rating] 5
[iid] 318 [Title] Shawshank Redemption, The (1994), [Estimated Rating] 5

In [69]:
#https://surprise.readthedocs.io/en/stable/dump.html

## load model

In [79]:
saved_predict_result,algo = dump.load('recommend_algor')
algo

<surprise.prediction_algorithms.knns.KNNWithMeans at 0x149f97b20>

## load movie info

In [80]:
my_movie=pd.read_csv('./movielens-small/movies.csv',dtype={'userId': int, 'movieId': int})
mapping = { row["movieId"] :{ "title" : row["title"],"genres" : row["genres"] } for _,row in my_movie.iterrows() }
mapping

{1: {'title': 'Toy Story (1995)',
  'genres': 'Adventure|Animation|Children|Comedy|Fantasy'},
 2: {'title': 'Jumanji (1995)', 'genres': 'Adventure|Children|Fantasy'},
 3: {'title': 'Grumpier Old Men (1995)', 'genres': 'Comedy|Romance'},
 4: {'title': 'Waiting to Exhale (1995)', 'genres': 'Comedy|Drama|Romance'},
 5: {'title': 'Father of the Bride Part II (1995)', 'genres': 'Comedy'},
 6: {'title': 'Heat (1995)', 'genres': 'Action|Crime|Thriller'},
 7: {'title': 'Sabrina (1995)', 'genres': 'Comedy|Romance'},
 8: {'title': 'Tom and Huck (1995)', 'genres': 'Adventure|Children'},
 9: {'title': 'Sudden Death (1995)', 'genres': 'Action'},
 10: {'title': 'GoldenEye (1995)', 'genres': 'Action|Adventure|Thriller'},
 11: {'title': 'American President, The (1995)',
  'genres': 'Comedy|Drama|Romance'},
 12: {'title': 'Dracula: Dead and Loving It (1995)',
  'genres': 'Comedy|Horror'},
 13: {'title': 'Balto (1995)', 'genres': 'Adventure|Animation|Children'},
 14: {'title': 'Nixon (1995)', 'genres': 

## Show recommendation results from output model

In [72]:
user_items = [ (val["userId"],int(val["movieId"]),val["rating"] )for _,val in df[df.userId == users[0]].iterrows()]
print()
print(user,len(user_items))
# generate recommendation

recommendations = algo.test(user_items)

recommendations.sort(key=operator.itemgetter(3), reverse=True)
print(f"User {user} recommendations:")
# for r in recommendations[0:5]:
#     print(f"[iid] {r[1]}")

# Only id
[{"id": i[1]} for i in recommendations[0:5]]




371 232
User 371 recommendations:


[{'id': 50}, {'id': 260}, {'id': 296}, {'id': 593}, {'id': 608}]

## get only features from history

In [82]:
#Get features
[str(t[1]) for t in user_items]

['1196',
 '673',
 '2000',
 '593',
 '296',
 '1265',
 '592',
 '423',
 '1777',
 '1023',
 '2949',
 '2143',
 '3176',
 '2094',
 '1198',
 '3062',
 '2700',
 '1219',
 '1092',
 '3052',
 '110',
 '1256',
 '1030',
 '2028',
 '1029',
 '2991',
 '2571',
 '1348',
 '2078',
 '2115',
 '2648',
 '1197',
 '804',
 '1473',
 '2012',
 '2406',
 '1213',
 '2253',
 '2761',
 '736',
 '3386',
 '1793',
 '3578',
 '1573',
 '661',
 '2640',
 '2105',
 '2161',
 '1258',
 '2644',
 '3147',
 '2692']

## result recommendation full details

In [83]:
user_items = list(filter(lambda x: x[0] == users[0], items))
print()
print(user,len(user_items))
# generate recommendation

recommendations = algo.test(user_items)

recommendations.sort(key=operator.itemgetter(3), reverse=True)
print(f"User {user} recommendations:")
# for r in recommendations[0:5]:
#     print(f"[iid] {r[1]}")

# Only id
temp = [{"id": i[1], "title":mapping[i[1]]["title"] , "genres" : mapping[i[1]]["genres"] } for i in recommendations[0:5] ]
temp




371 52
User 371 recommendations:


[{'id': 1196,
  'title': 'Star Wars: Episode V - The Empire Strikes Back (1980)',
  'genres': 'Action|Adventure|Sci-Fi'},
 {'id': 593,
  'title': 'Silence of the Lambs, The (1991)',
  'genres': 'Crime|Horror|Thriller'},
 {'id': 296,
  'title': 'Pulp Fiction (1994)',
  'genres': 'Comedy|Crime|Drama|Thriller'},
 {'id': 1198,
  'title': 'Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)',
  'genres': 'Action|Adventure'},
 {'id': 2028,
  'title': 'Saving Private Ryan (1998)',
  'genres': 'Action|Drama|War'}]