In [68]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm
import numpy as np

In [69]:
user_columns = ['user_id', 'age', 'sex', 'occupation', 'zip code']
users = pd.read_csv('ml-100k/u.user', sep = '|', names=user_columns, encoding='latin-1')
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [70]:
movie_columns = ['movie_id', 'title', 'release_date', 'video_release_date', 'imdb_url']
movies = pd.read_csv('ml-100k/u.item', sep = '|', names=movie_columns, encoding='latin-1', usecols = range(5))
movies = movies.drop('video_release_date', axis = 1)
movies.head()

Unnamed: 0,movie_id,title,release_date,imdb_url
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [71]:
ratings_columns = ['user_id','movie_id','rating']
ratings = pd.read_csv('ml-100k/u.data', sep = '\t', names=ratings_columns, encoding='latin-1', usecols = range(3))
ratings.head()

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [72]:
# Two approaches
# 1. similar characteristics of the movies
# 2. other users that like the same movies - use correlation
# We only have the data to do #2 here

users.to_csv('data/users.csv', sep=',', encoding='utf-8')
movies.to_csv('data/movies.csv', sep=',', encoding='utf-8')
ratings.to_csv('data/ratings.csv', sep=',', encoding='utf-8')

In [73]:
movies = pd.read_csv('data/movies.csv')
movies.head()

Unnamed: 0.1,Unnamed: 0,movie_id,title,release_date,imdb_url
0,0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...
1,1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...
2,2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%...
3,3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%...
4,4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)


In [74]:
def make_clickable(val):
    return '<a href="{}">{}</a>'.format(val, val)


In [75]:
# testing
make_clickable('www.wvu.edu')

'<a href="www.wvu.edu">www.wvu.edu</a>'

In [76]:
movies.style.format({'imdb_url':make_clickable})

Unnamed: 0.1,Unnamed: 0,movie_id,title,release_date,imdb_url
0,0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%20(1995)
1,1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(1995)
2,2,3,Four Rooms (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Four%20Rooms%20(1995)
3,3,4,Get Shorty (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Get%20Shorty%20(1995)
4,4,5,Copycat (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Copycat%20(1995)
5,5,6,Shanghai Triad (Yao a yao yao dao waipo qiao) (1995),01-Jan-1995,http://us.imdb.com/Title?Yao+a+yao+yao+dao+waipo+qiao+(1995)
6,6,7,Twelve Monkeys (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Twelve%20Monkeys%20(1995)
7,7,8,Babe (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Babe%20(1995)
8,8,9,Dead Man Walking (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Dead%20Man%20Walking%20(1995)
9,9,10,Richard III (1995),22-Jan-1996,http://us.imdb.com/M/title-exact?Richard%20III%20(1995)


In [77]:
my_ratings = ratings.where(ratings.user_id==196).dropna()
# my_ratings.count()

test_ratings = my_ratings[:15]

my_ratings = my_ratings[~my_ratings.movie_id.isin(test_ratings.movie_id)]
my_ratings.count()
my_ratings.head()

Unnamed: 0,user_id,movie_id,rating
17102,196.0,8.0,5.0
17830,196.0,428.0,4.0
18853,196.0,1118.0,4.0
21605,196.0,70.0,3.0
22271,196.0,66.0,3.0


In [78]:
# Find all the users who have seen at least 10 movies in common
import numpy as np

userPool = []
for movie in list(my_ratings.movie_id):
    otherUserRatingsForThisMovie = ratings.where(ratings.movie_id==movie).dropna()
    otherUsers = otherUserRatingsForThisMovie.user_id
    userPool.extend(otherUsers)

userPool = np.unique(userPool)



In [79]:

userRatings = ratings.where(ratings.user_id==3).dropna()
list(set(userRatings.movie_id) & set(my_ratings.movie_id))
len(userPool)

818

In [80]:
usersWithMoviesInCommon = []
for i,user in enumerate(userPool):
    userRatings = ratings.where(ratings.user_id==user).dropna()
    moviesInCommon = list(set(userRatings.movie_id) & set(my_ratings.movie_id))
    if len(moviesInCommon) > 10:
        usersWithMoviesInCommon.append(user)
print(usersWithMoviesInCommon)

[1.0, 13.0, 14.0, 18.0, 43.0, 59.0, 62.0, 85.0, 92.0, 94.0, 125.0, 144.0, 151.0, 178.0, 184.0, 196.0, 207.0, 234.0, 244.0, 268.0, 269.0, 271.0, 276.0, 279.0, 286.0, 297.0, 303.0, 305.0, 308.0, 327.0, 334.0, 336.0, 345.0, 354.0, 378.0, 389.0, 393.0, 406.0, 416.0, 417.0, 429.0, 437.0, 450.0, 457.0, 500.0, 523.0, 537.0, 561.0, 592.0, 642.0, 655.0, 666.0, 707.0, 711.0, 747.0, 758.0, 790.0, 805.0, 847.0, 854.0, 864.0, 880.0, 883.0, 889.0, 936.0]


In [81]:
from sklearn.metrics.pairwise import euclidean_distances
userDistances = []
for user in usersWithMoviesInCommon:
    userRatings = ratings.where(ratings.user_id==user).dropna()
    moviesInCommon = list(set(userRatings.movie_id) & set(my_ratings.movie_id))
    myRatingVector = []
    userRatingVector = []
    for movie in moviesInCommon:
        myRatingVector.append(float(my_ratings.where(my_ratings.movie_id==movie).dropna().rating))
        userRatingVector.append(float(userRatings.where(userRatings.movie_id==movie).dropna().rating))
    distance = float(euclidean_distances(myRatingVector, userRatingVector)[0][0])
    userDistances.append((user, distance))











In [82]:
userDistances = sorted(userDistances, key = lambda x: x[1])
mostSimilarUsers = [each[0] for each in userDistances][1:]

In [83]:
predictions = []
for movie in list(test_ratings.movie_id):
    otherRatings = []
    for user in mostSimilarUsers[:20]:
        userRatings = ratings.where(ratings.user_id==user).dropna()
        if movie in list(userRatings.movie_id):
            rat = list(userRatings.where(userRatings.movie_id==movie).dropna().rating)[0]
            otherRatings.append(rat)
    prediction = int(np.mean(otherRatings))
    predictions.append(prediction)

test_ratings['predictions'] = pd.Series(predictions, index = test_ratings.index)
test_ratings

Unnamed: 0,user_id,movie_id,rating,predictions
0,196.0,242.0,3.0,4
940,196.0,393.0,4.0,3
1133,196.0,381.0,4.0,3
1812,196.0,251.0,3.0,4
1896,196.0,655.0,5.0,3
2374,196.0,67.0,5.0,3
6910,196.0,306.0,4.0,5
7517,196.0,238.0,4.0,3
7842,196.0,663.0,5.0,4
10017,196.0,111.0,4.0,3
