In [6]:
# import statements
import pandas as pd
import numpy as np
from surprise.prediction_algorithms.knns import KNNWithMeans as knn
from surprise import Dataset
from surprise import evaluate
from sklearn.metrics.pairwise import cosine_similarity

## Reading the data

In [8]:
col_name_rating = ['user_id', 'item_id', 'rating', 'timestamp']
rating = pd.read_csv('u.data', sep='\t', names = col_name_rating)

col_name_user = ['user_id', 'age', 'gender', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names = col_name_user)

col_name_movies = ['movie_id', 'movie_title', 'release_date', 'video_release_date', 'IMDb_URL', 'unknown', 'Action', 
                   'Adventure','Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy', 
                   'Film-Noir', 'Horror','Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
movies = pd.read_csv('u.item', sep='|', names = col_name_movies, encoding='latin-1')

## KNN from scratch

In [52]:
n_users = rating.user_id.unique().shape[0]
n_items = rating.item_id.unique().shape[0]
print('Number of users = ' + str(n_users) + ' | Number of movies = ' + str(n_items)) 

Number of users = 943 | Number of movies = 1682


In [53]:
user_data_matrix = np.zeros((n_users, n_items))
for line in rating.itertuples():
    user_data_matrix[line[1]-1, line[2]-1] = line[3]

## Normalising the dataset

In [54]:
mean_user_rating = np.nanmean(user_data_matrix,axis = 1)

In [55]:
ratings_diff = (user_data_matrix - mean_user_rating[:, np.newaxis]) 
rating_diff_new = np.nan_to_num(ratings_diff)

## Training and test dataset

In [56]:
train = rating_diff_new[:-1]
test = rating_diff_new[-1]

## User-user similarity

In [63]:
user_similarity =  cosine_similarity(train, test).T



## Recommending movies for one user

In [64]:
users = list(user_similarity[0])
sorted_users = sorted(users,reverse=True)
list_top_users  = sorted_users[:5]

In [50]:
pred = np.zeros(train[0].shape)
for i in list_top_users:
    index = users.index(i)
    pred += i*train[index]

In [18]:
sorted_pred = sorted(pred,reverse=True)
top_pred = sorted_pred[:15]
indexes = []
pred_list = list(pred)
for i in top_pred:
    if (pred_list.index(i) not in indexes):
        indexes.append(pred_list.index(i))

In [19]:
movies_watched_index = []
for i in range(len(test)):
    if (test[i]>0):
        movies_watched_index.append(i+1)

In [45]:
## Movies the user has watched
movies_watched=[]
for i in movies_watched_index:
    movies_watched.append(movies['movie_title'][movies['movie_id'] == i])

## Movies the users can watch (recommended movies)
movie_to_watch = []
for i in (indexes):
    if (i not in movies_watched_index):
        movie_to_watch.extend(movies['movie_title'][movies['movie_id'] == i+1])

In [46]:
if (len(movie_to_watch) > len(movies_watched)):
    movie_watched.extend(['']*(len(movie_to_watch) - len(movies_watched)))
else:
    movie_to_watch.extend(['']*(len(movies_watched) - len(movie_to_watch)))

In [47]:
recommended_movies = pd.DataFrame({"Watched Movies": movies_watched,
                         "Recommended Movies":movie_to_watch})

In [48]:
recommended_movies

Unnamed: 0,Recommended Movies,Watched Movies
0,GoldenEye (1995),"1 GoldenEye (1995) Name: movie_title, dtype..."
1,,8 Dead Man Walking (1995) Name: movie_title...
2,,"10 Seven (Se7en) (1995) Name: movie_title, ..."
3,,"11 Usual Suspects, The (1995) Name: movie_t..."
4,,"21 Braveheart (1995) Name: movie_title, dty..."
5,,"22 Taxi Driver (1976) Name: movie_title, dt..."
6,,23 Rumble in the Bronx (1995) Name: movie_t...
7,,"26 Bad Boys (1995) Name: movie_title, dtype..."
8,,"27 Apollo 13 (1995) Name: movie_title, dtyp..."
9,,"30 Crimson Tide (1995) Name: movie_title, d..."


## KNN using Surprise Package

In [25]:
movie_data = Dataset.load_builtin('ml-100k')
movie_data.split(n_folds=3)

In [26]:
knn_algorithm = knn(k=10, min_k=1)

In [28]:
for train_set, test_set in movie_data.folds():
    knn_algorithm.train(train_set)
    pred = knn_algorithm.test(test_set)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


In [29]:
evaluation = evaluate(knn_algorithm, movie_data, measures=['RMSE', 'MAE'])

Evaluating RMSE, MAE of algorithm KNNWithMeans.

------------
Fold 1
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9819
MAE:  0.7763
------------
Fold 2
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9828
MAE:  0.7750
------------
Fold 3
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.9767
MAE:  0.7709
------------
------------
Mean RMSE: 0.9805
Mean MAE : 0.7741
------------
------------


## Recommending movies for one user

In [30]:
predict = []
for i in range(1682):
    predict.append(knn_algorithm.predict('330', str(i), r_ui = test_set).est)

In [35]:
top_movies = sorted(predict, reverse=True)[:5]

In [44]:
## Function to get the index of the movie
get_index = lambda x, xs: [i for (y, i) in zip(xs, range(len(xs))) if x == y]

indices = []
predict = list(predict)
count = 1
for i in top_movies:
    if predict.index(i) in indexes:
        indices.append(get_index(i, predict)[count])
        count += 1
    else:
        indexes.append(predict.index(i))
indices

[15, 22, 25, 50, 56]

In [42]:
for i in indices:
    recommended_movie = movies.movie_title[movies['movie_id'] == i+1]
    print(recommended_movie)

15    French Twist (Gazon maudit) (1995)
Name: movie_title, dtype: object
22    Taxi Driver (1976)
Name: movie_title, dtype: object
25    Brothers McMullen, The (1995)
Name: movie_title, dtype: object
50    Legends of the Fall (1994)
Name: movie_title, dtype: object
56    Priest (1994)
Name: movie_title, dtype: object
