<img src="../Pics/MLSb-T.png" width="160">
<br><br>
<center><u><H1>Recommender Systems with KNN</H1></u></center>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Column names for the ratings file, which is read as headerless, tab-separated text.
header = ['user_id', 'movie_id', 'rating', 'timestamp']
df_ratings = pd.read_csv(
    '../Data/ml-100k/u.data',
    sep='\t',
    header=None,
    names=header,
)
df_ratings.head()

In [None]:
# Per-movie number of ratings and mean rating.
# The aggregations are named by string: handing NumPy callables (np.size, np.mean)
# to .agg() is deprecated in recent pandas and emits a FutureWarning.
movie_details = df_ratings.groupby('movie_id').agg({'rating': ['size', 'mean']})
movie_details.head()

In [None]:
# Only the first two pipe-separated fields of the item file are read: id and title.
cols = ['movie_id', 'title']
movies = pd.read_csv(
    '../Data/ml-100k/u.item',
    sep='|',
    names=cols,
    usecols=[0, 1],
    encoding='latin-1',
)
# Attach titles to the ratings through the shared movie_id column, highest ratings first.
ratings_titles = movies.merge(df_ratings, on='movie_id').sort_values('rating', ascending=False)
ratings_titles.head()

## Similarity matrix

In [None]:
# User x movie table of ratings; user/movie pairs without a rating are filled with 0.
ratings_ = (
    df_ratings
    .pivot(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)
ratings_.head()

In [None]:
# DataFrame.as_matrix() was removed in pandas 1.0; to_numpy() yields the same 2-D array.
matrix = ratings_.to_numpy()

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the rows of `matrix` as a test set; the fixed random_state makes the split repeatable.
ratings_train, ratings_test = train_test_split(matrix, test_size=0.3, random_state=42)

In [None]:
# (rows, columns) of the training split.
ratings_train.shape

In [None]:
# (rows, columns) of the held-out test split.
ratings_test.shape

## Finding the top N nearest neighbors

In [None]:
from sklearn.neighbors import NearestNeighbors
# Number of neighbours requested from the NearestNeighbors model built below.
k = 5

In [None]:
# Pass the arguments by keyword: positionally, the second slot of NearestNeighbors
# is `radius`, so 'cosine' never reached `metric` (and current scikit-learn
# versions reject positional arguments here altogether).
knn = NearestNeighbors(n_neighbors=k, metric='cosine')

In [None]:
# Fit the neighbour index on the training rows; later queries search among these rows.
knn.fit(ratings_train)

In [None]:
# Query with the very rows the model was fitted on: for each training row this
# returns the distances to, and row indices of, its neighbours in ratings_train.
# NOTE(review): since query == fitted data, each row is expected to come back as
# its own nearest neighbour (distance ~0) — account for that downstream.
top_k_dist, top_k_users = knn.kneighbors(ratings_train, return_distance=True)

In [None]:
# Show both shapes as one tuple: a cell only displays its last expression, so
# putting them on separate lines silently drops the first.
top_k_dist.shape, top_k_users.shape

## Top 5 most similar users to the first user in the training set:

In [None]:
# Neighbour row indices (into ratings_train) for the first training row.
top_k_users[0, :5]

In [None]:
# Distances from the first training row to those same neighbours.
top_k_dist[0, :5]

## Prediction matrix for rating movies:
### For each user, select the 5 most similar users and predict that user's
### ratings as the weighted sum of those neighbours' ratings.

In [None]:
# Predict every training user's ratings as a similarity-weighted average of the
# ratings given by that user's nearest neighbours.
#   * Weights are similarities (1 - distance); weighting by the raw distance
#     would give the closest neighbours the smallest say.
#   * Only row i's neighbours are gathered, instead of materialising the
#     (users, k, movies) neighbour-rating array on every iteration.
#   * A zero weight total leaves the row at 0 rather than dividing by zero.
# NOTE(review): 1 - distance assumes top_k_dist holds cosine distances — confirm
# the metric the neighbours model was actually built with.
user_pred = np.zeros(ratings_train.shape)
for i in range(ratings_train.shape[0]):
    # The neighbours were queried with the fitted rows themselves, so user i
    # appears in their own neighbour list; drop them so a user's own ratings
    # are not fed back as their prediction.
    is_other = top_k_users[i] != i
    neighbour_rows = top_k_users[i][is_other]
    neighbour_weights = 1.0 - top_k_dist[i][is_other]
    weight_total = np.abs(neighbour_weights).sum()
    if weight_total > 0:
        user_pred[i, :] = neighbour_weights.dot(ratings_train[neighbour_rows]) / weight_total

user_pred.shape

In [None]:
# Wrap the user-based predictions in a DataFrame whose columns carry the movie
# labels of the pivoted ratings table; rows follow the order of ratings_train.
df = pd.DataFrame(user_pred, columns=ratings_.columns)
df.head(10)

## Evaluating the model:

In [None]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [None]:
def rmse(pred, actual):
    """Root-mean-squared error between `pred` and `actual`.

    Only the positions where `actual` is non-zero contribute; `pred` is read
    at exactly those positions. Returns a Python float.
    """
    rated = actual.nonzero()
    predicted_scores = pred[rated].ravel()
    observed_scores = actual[rated].ravel()
    return sqrt(mean_squared_error(predicted_scores, observed_scores))

In [None]:
# Error of the user-based predictions on the non-zero entries of the training split.
rmse(user_pred, ratings_train)

In [None]:
# user_pred holds one row per *training* user, so scoring it against ratings_test
# (different users, different row count) would compare unrelated rows. Predict
# for the test users from their nearest neighbours in the training set instead.
test_dist, test_neighbours = knn.kneighbors(ratings_test, return_distance=True)
user_pred_test = np.zeros(ratings_test.shape)
for test_row in range(ratings_test.shape[0]):
    # NOTE(review): 1 - distance assumes cosine distances — confirm knn's metric.
    test_weights = 1.0 - test_dist[test_row]
    test_weight_total = np.abs(test_weights).sum()
    if test_weight_total > 0:  # otherwise leave the row at 0 instead of dividing by zero
        user_pred_test[test_row, :] = (
            test_weights.dot(ratings_train[test_neighbours[test_row]]) / test_weight_total
        )
rmse(user_pred_test, ratings_test)

## Item-based recommendations

In [None]:
# Ask for as many neighbours as there are movies: k_ is the column count of the
# training matrix.
k_ = ratings_train.shape[1]
k_

In [None]:
# Pass the arguments by keyword: positionally, the second slot of NearestNeighbors
# is `radius`, so 'cosine' never reached `metric` (and current scikit-learn
# versions reject positional arguments here altogether).
neigh_ = NearestNeighbors(n_neighbors=k_, metric='cosine')

In [None]:
# Transposed, so each row handed to the model is one column of ratings_train (a movie).
neigh_.fit(ratings_train.T)

In [None]:
# Query with the fitted rows themselves. Because k_ equals the number of movies,
# every row of the result lists all movies.
# NOTE(review): kneighbors orders each row's neighbours by distance, so column j
# is "the j-th nearest movie", not "movie j" — use top_k_movies_ to map back.
top_k_dist_, top_k_movies_ = neigh_.kneighbors(ratings_train.T, return_distance=True)

## Similar movies to movie 1:

In [None]:
# Neighbour indices returned for the first row of ratings_train.T (the first movie column).
top_k_movies_[0]

In [None]:
# Distances from that first movie to each of the neighbours listed in top_k_movies_[0].
top_k_dist_[0]

## Predicting the movie ratings

In [None]:
# Item-based prediction: for user u and movie j, a similarity-weighted average of
# u's ratings over the other movies.
#
# kneighbors() orders each movie's neighbours by distance, so column j of
# top_k_dist_ is "the j-th nearest movie", not "movie j". The values are first
# scattered back into movie order (via top_k_movies_) to obtain a proper
# movie x movie matrix, and distances are turned into similarities so that
# closer movies weigh more, not less.
# NOTE(review): 1 - distance assumes top_k_dist_ holds cosine distances — confirm
# the metric the neighbours model was actually built with.
n_movies = ratings_train.shape[1]
movie_index = np.arange(n_movies)
movie_sim = np.zeros((n_movies, n_movies))
movie_sim[movie_index[:, None], top_k_movies_] = 1.0 - top_k_dist_
# A movie must not vote for itself, otherwise the user's own rating of movie j
# leaks into the prediction for movie j.
movie_sim[movie_index, movie_index] = 0.0
sim_totals = np.abs(movie_sim).sum(axis=0)
sim_totals[sim_totals == 0] = 1.0  # movies similar to nothing predict 0 instead of 0/0
item_predictions = ratings_train.dot(movie_sim) / sim_totals

In [None]:
# Wrap the item-based predictions in a DataFrame whose columns carry the movie
# labels of the pivoted ratings table; rows follow the order of ratings_train.
df_predictions = pd.DataFrame(item_predictions, columns=ratings_.columns)
df_predictions.head(10)
# Note on the neighbour search above: because the query set matches the training
# set, the nearest neighbor of each point is the point itself, at a distance of zero.

In [None]:
# (rows, columns) of the item-based prediction table.
df_predictions.shape

## Evaluating the model:

In [None]:
# Error of the item-based predictions on the non-zero entries of the training split.
rmse(item_predictions, ratings_train)

In [None]:
# item_predictions holds one row per *training* user, so scoring it against
# ratings_test (different users, different row count) would compare unrelated
# rows. Predict for the test users from their own ratings instead, using
# movie-to-movie similarities taken from the training-set neighbour search.
#
# kneighbors() orders neighbours by distance, so the values are scattered back
# into movie order via top_k_movies_ before being used as a movie x movie matrix.
# NOTE(review): 1 - distance assumes top_k_dist_ holds cosine distances — confirm
# the metric the neighbours model was actually built with.
n_movies_eval = ratings_test.shape[1]
movie_index_eval = np.arange(n_movies_eval)
movie_sim_eval = np.zeros((n_movies_eval, n_movies_eval))
movie_sim_eval[movie_index_eval[:, None], top_k_movies_] = 1.0 - top_k_dist_
# Zero self-similarity so the rating being scored is not one of its own inputs.
movie_sim_eval[movie_index_eval, movie_index_eval] = 0.0
sim_totals_eval = np.abs(movie_sim_eval).sum(axis=0)
sim_totals_eval[sim_totals_eval == 0] = 1.0  # avoid 0/0 for movies similar to nothing
item_predictions_test = ratings_test.dot(movie_sim_eval) / sim_totals_eval
rmse(item_predictions_test, ratings_test)

## Reference:

[scikit-learn user guide: Nearest Neighbors](https://scikit-learn.org/stable/modules/neighbors.html)