In [1]:
import pandas as pd
import numpy as np
import zipfile

zf = zipfile.ZipFile("ml-100k.zip")


def _readTable(memberName, separator, columnNames):
    """Read one delimited member of the opened archive `zf` into a DataFrame.

    The member handle is closed as soon as pandas has consumed it; the
    archive object itself stays open.

    Args:
        memberName: path of the file inside the archive.
        separator: field delimiter used by that file.
        columnNames: names given to the columns that are kept (passed as both
            `names` and `usecols`).

    Returns:
        A DataFrame with exactly the columns in `columnNames`.
    """
    with zf.open(memberName) as handle:
        return pd.read_csv(handle, encoding="latin-1", sep=separator, header=None,
                           names=columnNames, usecols=columnNames)


users = _readTable("ml-100k/u.user", "|", ["user_id", "age", "gender", "occupation"])
movies = _readTable("ml-100k/u.item", "|", ["movie_id", "movie_title"])
data = _readTable("ml-100k/u.data", "\t", ["user_id", "movie_id", "rating", "timestamp"])

numOfUsers = data["user_id"].unique().shape[0]
numOfMovies = data["movie_id"].unique().shape[0]
print("numOfUsers=%d, numOfMovies=%d" % (numOfUsers, numOfMovies))

# matrix[userId, movieId] holds the rating, 0 meaning "not rated".
# Ids are used directly as indices, so row 0 and column 0 are padding that
# stays at zero as long as ids start at 1.
matrix = np.zeros((numOfUsers + 1, numOfMovies + 1))

# Fill the whole matrix with one vectorised assignment. Duplicated
# (user, movie) pairs are collapsed to their last occurrence first, because
# NumPy does not define which value survives when an index pair is repeated
# inside a single advanced assignment.
_lastRatings = data.drop_duplicates(subset=["user_id", "movie_id"], keep="last")
matrix[_lastRatings["user_id"].values, _lastRatings["movie_id"].values] = _lastRatings["rating"].values

numOfUsers=943, numOfMovies=1682


In [2]:
from sklearn.metrics.pairwise import pairwise_distances

# Cosine distance between the rating rows of every pair of users:
#   0 means the two rows point in the same direction (same taste),
#   1 means the users have nothing in common.
# userDistances[userA][userB] is "distance between users A and B".
userDistances = pairwise_distances(matrix, metric='cosine')

# Similarity is the complement of the distance:
#   0 means the users have nothing in common,
#   1 means the users are (as good as) the same user.
# userSimilarities[userA][userB] is "similarity of users A and B".
userSimilarities = 1.0 - userDistances

# How well each movie suits each user:
#   recommendations[userA][movieX]
#       = sum over users i of similarity(A, i) * rating(i, movieX)
# At this point the scores also cover movies the user has already rated.
recommendations = userSimilarities.dot(matrix)

# Movies a user has already rated would dominate their own scores, and we do
# not want to recommend them again, so blank them out with NaN.
# A single boolean-mask assignment replaces a Python loop over every cell.
recommendations[matrix != 0] = np.nan

def findUserByUserId(userId):
    """Return the row of `users` for a 1-based user id.

    The lookup is positional: user id N is taken from row N-1 of `users`.

    Args:
        userId: user id, starting at 1.

    Returns:
        The matching row of `users` as a Series.

    Raises:
        IndexError: if `userId` is below 1 (which would otherwise wrap around
            to a row counted from the end of the table) or past the last row.
    """
    if userId < 1:
        raise IndexError("userId must be >= 1, got %r" % (userId,))
    return users.iloc[userId - 1]

def findUsersByUserIds(userIds):
    """Return the rows of `users` for a sequence of user ids, in the given order.

    Each id is turned into a row position by subtracting 1. An id of 0
    therefore becomes position -1, which `iloc` resolves to the last row.
    """
    positions = [userId - 1 for userId in userIds]
    return users.iloc[positions]

def findMoviesByMovieIds(movieIds):
    """Return the rows of `movies` for a sequence of movie ids, in the given order.

    Each id is turned into a row position by subtracting 1. An id of 0
    therefore becomes position -1, which `iloc` resolves to the last row.
    """
    positions = [movieId - 1 for movieId in movieIds]
    return movies.iloc[positions]

def findRecommendedMoviesByUserId(userId):
    """Return up to five best-scoring, not-yet-rated movies for a user.

    Args:
        userId: row index into `recommendations` (the user's id).

    Returns:
        A copy of the matching `movies` rows, best score first, with an extra
        "rating" column holding the recommendation score. Fewer than five rows
        are returned when fewer candidates exist.
    """
    movieRatings = recommendations[userId]
    # argsort puts NaN (already rated) entries at the end.
    rankedMovieIds = (-movieRatings).argsort()
    # Drop the padding column 0 (the id-1 lookup would turn it into the last
    # movie) and any NaN entries, so neither can leak into a short result.
    isCandidate = (rankedMovieIds != 0) & ~np.isnan(movieRatings[rankedMovieIds])
    # Cut to five before the lookup so only the needed rows are fetched.
    topMovieIds = rankedMovieIds[isCandidate][:5]
    recommendedMovies = findMoviesByMovieIds(topMovieIds).copy()
    recommendedMovies["rating"] = movieRatings[topMovieIds]
    return recommendedMovies

def findSimilarUsersByUserId(userId):
    """Return up to five users most similar to the given user.

    Args:
        userId: row index into `userSimilarities` (the user's id).

    Returns:
        A copy of the matching `users` rows, most similar first, with an extra
        "similarity" column. The user themself is never included.
    """
    thisUserSimilarities = userSimilarities[userId]
    rankedUserIds = (-thisUserSimilarities).argsort()
    # Remove the user by id rather than by assuming they sort first: another
    # user can tie at the top similarity. Also remove the padding row 0,
    # which the id-1 lookup would turn into the last user.
    isCandidate = (rankedUserIds != userId) & (rankedUserIds != 0)
    topUserIds = rankedUserIds[isCandidate][:5]
    similarUsers = findUsersByUserIds(topUserIds).copy()
    similarUsers["similarity"] = thisUserSimilarities[topUserIds]
    return similarUsers
    
def findWatchedMoviesByUserId(userId):
    """Return up to twenty movies the user has rated, highest rating first.

    Args:
        userId: row index into `matrix` (the user's id).

    Returns:
        A copy of the matching `movies` rows with an extra "rating" column.
        Only movies with a non-zero rating are included, so users with fewer
        than twenty ratings get a shorter table.
    """
    movieRatings = matrix[userId]
    rankedMovieIds = (-movieRatings).argsort()
    # A rating of 0 means "not rated"; keep only rated movies, and never the
    # padding column 0 (the id-1 lookup would turn it into the last movie).
    isWatched = (movieRatings[rankedMovieIds] != 0) & (rankedMovieIds != 0)
    watchedMovieIds = rankedMovieIds[isWatched][:20]
    watchedMovies = findMoviesByMovieIds(watchedMovieIds).copy()
    watchedMovies["rating"] = movieRatings[watchedMovieIds]
    return watchedMovies

# The user whose profile, history, recommendations and neighbours are reported.
userOfInterest = 17

# Each report section is a heading, the table, then a blank line.
user = findUserByUserId(userOfInterest)
print("USER #%d" % (userOfInterest,), user, "", sep="\n")

watchedMovies = findWatchedMoviesByUserId(userOfInterest)
print("MOVIES WATCHED BY USER #%d" % (userOfInterest,), watchedMovies, "", sep="\n")

recommendedMovies = findRecommendedMoviesByUserId(userOfInterest)
print("MOVIES RECOMMENDED TO USER #%d" % (userOfInterest,), recommendedMovies, "", sep="\n")

similarUsers = findSimilarUsersByUserId(userOfInterest)
print("USERS SIMILAR TO USER #%d" % (userOfInterest,), similarUsers, "", sep="\n")

# Show what the closest neighbour has watched.
topSimilarUser = similarUsers.iloc[0]
topSimilarUserId = topSimilarUser["user_id"]
moviesWatchedByTopSimilarUser = findWatchedMoviesByUserId(topSimilarUserId)
print("MOVIES WATCHED BY THE TOP SIMILAR USER (#%d)" % (topSimilarUserId,),
      moviesWatchedByTopSimilarUser, "", sep="\n")

USER #17
user_id               17
age                   30
gender                 M
occupation    programmer
Name: 16, dtype: object

MOVIES WATCHED BY USER #17
     movie_id                                   movie_title  rating
149       150                               Swingers (1996)     5.0
125       126                    Spitfire Grill, The (1996)     4.0
268       269                        Full Monty, The (1997)     4.0
293       294                              Liar Liar (1997)     4.0
99        100                                  Fargo (1996)     4.0
150       151  Willy Wonka and the Chocolate Factory (1971)     4.0
474       475                          Trainspotting (1996)     4.0
275       276                      Leaving Las Vegas (1995)     4.0
918       919             City of Lost Children, The (1995)     4.0
136       137                              Big Night (1996)     4.0
0           1                              Toy Story (1995)     4.0
6           7          