In [1]:
from skmultilearn.adapt import MLkNN
import pandas as pd
import numpy as np

u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')

i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols, encoding='latin-1')

items = items.drop(columns=["IMDb URL","release date","unknown","video release date"])

r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]

data_matrix = np.zeros((n_users, n_items))
for line in ratings.itertuples():
    data_matrix[line[1]-1, line[2]-1] = line[3]

from sklearn.metrics.pairwise import pairwise_distances 

user_similarity = pairwise_distances(data_matrix, metric='cosine')
item_similarity = pairwise_distances(data_matrix.T, metric='cosine')

def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

user_prediction = predict(data_matrix, user_similarity, type='user')
item_prediction = predict(data_matrix, item_similarity, type='item')
user_prediction

array([[ 2.06532606,  0.73430275,  0.62992381, ...,  0.39359041,
         0.39304874,  0.3927712 ],
       [ 1.76308836,  0.38404019,  0.19617889, ..., -0.08837789,
        -0.0869183 , -0.08671183],
       [ 1.79590398,  0.32904733,  0.15882885, ..., -0.13699223,
        -0.13496852, -0.13476488],
       ...,
       [ 1.59151513,  0.27526889,  0.10219534, ..., -0.16735162,
        -0.16657451, -0.16641377],
       [ 1.81036267,  0.40479877,  0.27545013, ..., -0.00907358,
        -0.00846587, -0.00804858],
       [ 1.8384313 ,  0.47964837,  0.38496292, ...,  0.14686675,
         0.14629808,  0.14641455]])

In [7]:
from sklearn.neighbors import NearestNeighbors

knn = NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine',n_jobs=-1).fit(data_matrix)
distances, indices= knn.kneighbors(data_matrix)
raw_recommends =sorted(list(zip(indices.squeeze().tolist(),distances.squeeze().tolist())),
                               key=lambda x: x[1])[:0:-1]
raw_recommends

[([0,
   915,
   863,
   267,
   91,
   434,
   456,
   737,
   428,
   302,
   275,
   888,
   822,
   386,
   513,
   292,
   681,
   885,
   726,
   300],
  [2.220446049250313e-15,
   0.43093426847201255,
   0.4524517378059171,
   0.45792295247989423,
   0.4594664388157663,
   0.46133546811462456,
   0.46152402496061107,
   0.47296892649889033,
   0.4740500732819177,
   0.4742822659150133,
   0.4754774770279343,
   0.478262685728188,
   0.47914981075905316,
   0.4830223544430169,
   0.4855632756339002,
   0.4892011486147828,
   0.4894203452997239,
   0.4896569366077418,
   0.49170151293014386,
   0.4955698806192357]),
 ([863,
   377,
   221,
   456,
   91,
   428,
   275,
   300,
   895,
   617,
   302,
   434,
   193,
   93,
   55,
   748,
   486,
   576,
   177,
   726],
  [1.5543122344752192e-15,
   0.36885158043106603,
   0.3690524475859207,
   0.38008853111879837,
   0.38252792481760267,
   0.3918354710149823,
   0.3944264632962814,
   0.39626311540895987,
   0.3980583771010705