In [1]:
#Dataset url: https://grouplens.org/datasets/movielens/latest/

import pandas as pd
import numpy as np
from random import seed
from random import randrange
from csv import reader
from math import sqrt

In [2]:
movies_df = pd.read_csv('/Users/navyasogi/Desktop/ProgrammingAssignment_2/movies.csv',usecols=['movieId','title'],dtype={'movieId': 'int32', 'title': 'str'})
rating_df=pd.read_csv('/Users/navyasogi/Desktop/ProgrammingAssignment_2/ratings.csv',usecols=['userId', 'movieId', 'rating'],dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})

#movies_df.head()
movies_arr = movies_df.values
movies_arr
movies_df.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,2,Jumanji (1995)
2,3,Grumpier Old Men (1995)
3,4,Waiting to Exhale (1995)
4,5,Father of the Bride Part II (1995)


In [3]:
rating_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [4]:
df = pd.merge(rating_df,movies_df,on='movieId')
df

Unnamed: 0,userId,movieId,rating,title
0,1,2,3.5,Jumanji (1995)
1,5,2,3.0,Jumanji (1995)
2,13,2,3.0,Jumanji (1995)
3,29,2,3.0,Jumanji (1995)
4,34,2,3.0,Jumanji (1995)
...,...,...,...,...
20000258,138301,121017,3.5,The Gentleman from Epsom (1962)
20000259,138301,121019,4.5,The Great Spy Chase (1964)
20000260,138301,121021,4.5,Taxi for Tobruk (1961)
20000261,138406,110167,4.5,"Judge and the Assassin, The (Juge et l'assassi..."


In [5]:
combine_movie_rating = df.dropna(axis = 0, subset = ['title'])
movie_ratingCount = (combine_movie_rating.
     groupby(by = ['title'])['rating'].
     count().
     reset_index().
     rename(columns = {'rating': 'totalRatingCount'})
     [['title', 'totalRatingCount']]
    )
movie_ratingCount.head()

Unnamed: 0,title,totalRatingCount
0,"""Great Performances"" Cats (1998)",155
1,#chicagoGirl: The Social Network Takes on a Di...,3
2,$ (Dollars) (1971),24
3,$5 a Day (2008),39
4,$9.99 (2008),55


In [6]:
rating_with_totalRatingCount = combine_movie_rating.merge(movie_ratingCount, left_on = 'title', right_on = 'title', how = 'left')
rating_with_totalRatingCount.head()

pd.set_option('display.float_format', lambda x: '%.3f' % x)
print(movie_ratingCount['totalRatingCount'].describe())

count   26729.000
mean      748.261
std      3086.673
min         1.000
25%         3.000
50%        18.000
75%       205.000
max     67310.000
Name: totalRatingCount, dtype: float64


In [7]:
popularity_threshold = 20000
rating_popular_movie= rating_with_totalRatingCount.query('totalRatingCount >= @popularity_threshold')
rating_popular_movie.head()

Unnamed: 0,userId,movieId,rating,title,totalRatingCount
0,1,2,3.5,Jumanji (1995),22243
1,5,2,3.0,Jumanji (1995),22243
2,13,2,3.0,Jumanji (1995),22243
3,29,2,3.0,Jumanji (1995),22243
4,34,2,3.0,Jumanji (1995),22243


In [8]:
## First lets create a Pivot matrix

movie_features_df=rating_popular_movie.pivot_table(index='movieId',columns='userId',values='rating').fillna(0)
#movie_features_df.head()

#Convert pivot table to dataframe and then to array of vectors
flattened = pd.DataFrame(movie_features_df.to_records())
dataset = flattened.values #array of vectors
#dataset
flattened.shape

(160, 135726)

In [9]:
flattened.head()

Unnamed: 0,movieId,1,2,3,4,5,6,7,8,9,...,138484,138485,138486,138487,138488,138489,138490,138491,138492,138493
0,1,0.0,0.0,4.0,0.0,0.0,5.0,0.0,4.0,0.0,...,0.0,0.0,5.0,0.0,3.0,0.0,0.0,2.0,0.0,3.5
1,2,3.5,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,4.0
2,6,0.0,0.0,0.0,3.0,0.0,0.0,0.0,3.0,0.0,...,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,10,0.0,0.0,0.0,4.0,0.0,0.0,0.0,4.0,0.0,...,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,17,0.0,0.0,0.0,0.0,3.0,5.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0


In [11]:
# Locate the most similar neighbors
def get_neighbors(train, test_row, num_neighbors):
	distances = list()
	for train_row in train:
		dist = cosineSimilarity(test_row, train_row)
		distances.append((train_row, dist))
	distances.sort(key=lambda tup: tup[1])
	neighbors = []
	for i in range(num_neighbors):
		neighbors.append(distances[i][0])
	return neighbors 
        

In [12]:
#Calculate cosine similarity between two vectors
def cosineSimilarity(row1, row2):
    sumv1, sumv2, sumv1v2 = 0, 0, 0
    for i in range(len(row1)-1):
        x = row1[i]
        y = row2[i]
        sumv1 += x * x
        sumv2 += y * y
        sumv1v2 += x * y
    return -(sumv1v2 / (sqrt(sumv1) * sqrt(sumv2)))
    

In [13]:
# Calculate the Euclidean distance between two vectors
def euclidean_distance(row1, row2):
	distance = 0.0
	for i in range(len(row1)-1):
		distance += (row1[i] - row2[i])**2
	return sqrt(distance)

In [87]:
#Calculate the Manhattan distance between two vectors
def manhattan_distance(row1, row2):
    mdistance = 0.0
    for i in range(len(row1)-1):
        mdistance += abs(row1[i] - row2[i])
    return mdistance

In [14]:
#Add random movie index selector from dataset code
rand_index = np.random.choice(len(dataset))

print("Movies similar to " + movies_arr[(int(dataset[rand_index][0]) - 1)][1] + " are :-")

similarity = get_neighbors(dataset, dataset[rand_index], 10)
movieindices = list()
for movie in similarity:
    movieid = int(movie[0] - 1)
    movieindices.append(movieid + 1)
    print(movies_arr[movieid][1])

Movies similar to Shawshank Redemption, The (1994) are :-
Shawshank Redemption, The (1994)
Village of the Damned (1995)
Pretty Woman (1990)
Wyatt Earp (1994)
Cops and Robbersons (1994)
Perez Family, The (1995)
White Man's Burden (1995)
King of the Hill (1993)
Crooklyn (1994)
Snow White and the Seven Dwarfs (1937)


In [225]:
similarity = get_neighbors(dataset, dataset[10], 10)
print(similarity[0][1])

3.5


In [15]:
#Calculating similarity scores for Star Wars movie recommendations
df = df.iloc[:7000000,:]
movieRatings = df.pivot_table(index=['userId'],columns=['title'],values='rating')
movieRatings.head()

title,1984 (Nineteen Eighty-Four) (1984),2001: A Space Odyssey (1968),2010: The Year We Make Contact (1984),28 Days (2000),28 Days Later (2002),"7th Voyage of Sinbad, The (1958)",8MM (1999),Abbott and Costello Meet Frankenstein (1948),"Abyss, The (1989)",Ace Ventura: When Nature Calls (1995),...,Witness (1985),"Wizard of Oz, The (1939)","Wolf Man, The (1941)","X-Files: Fight the Future, The (1998)",X2: X-Men United (2003),Yellow Submarine (1968),Yojimbo (1961),You've Got Mail (1998),Young Frankenstein (1974),¡Three Amigos! (1986)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.5,,,3.5,4.0,,,,,...,,3.5,,,4.0,,3.0,,4.0,
2,,5.0,,3.0,,,,5.0,,,...,,,,,,,,,,
3,5.0,5.0,4.0,,,,4.0,,3.0,,...,4.0,4.0,,5.0,,3.0,,,5.0,
4,,,,,,,,,,3.0,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [17]:
starWarsRatings = movieRatings['Star Wars: Episode IV - A New Hope (1977)']
starWarsRatings

userId
1        4.000
2        5.000
3        5.000
4          nan
5        5.000
          ... 
138489     nan
138490     nan
138491     nan
138492     nan
138493   4.500
Name: Star Wars: Episode IV - A New Hope (1977), Length: 138348, dtype: float32

In [18]:
similarMovies = movieRatings.corrwith(starWarsRatings) # pairwise correlation of Star Wars vector of user rating with every other movie

In [19]:
similarMovies = similarMovies.dropna() # Drop any results that have no data

In [20]:
data = pd.DataFrame(similarMovies) # Construct a new Dataframe of movies and their correlation score to Star Wars
data.head(10)

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
1984 (Nineteen Eighty-Four) (1984),0.059
2001: A Space Odyssey (1968),0.132
2010: The Year We Make Contact (1984),0.196
28 Days (2000),0.047
28 Days Later (2002),0.124
"7th Voyage of Sinbad, The (1958)",0.195
8MM (1999),0.013
Abbott and Costello Meet Frankenstein (1948),0.224
"Abyss, The (1989)",0.207
Ace Ventura: When Nature Calls (1995),0.129


In [21]:
similarMovies.sort_values(ascending=False)

title
Star Wars: Episode IV - A New Hope (1977)                                         1.000
Star Wars: Episode V - The Empire Strikes Back (1980)                             0.752
Star Wars: Episode VI - Return of the Jedi (1983)                                 0.687
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    0.473
Star Wars: Episode I - The Phantom Menace (1999)                                  0.399
                                                                                  ...  
Losing Isaiah (1995)                                                             -0.008
Thin Red Line, The (1998)                                                        -0.015
Monkey Shines (1988)                                                             -0.017
Godsend (2004)                                                                   -0.039
Tumbleweeds (1999)                                                               -0.040
Length: 573, dtype: float6

In [22]:
movieStats = df.groupby('title').agg({'rating': [np.size, np.mean]})
movieStats.head()

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
1984 (Nineteen Eighty-Four) (1984),3407.0,3.637
2001: A Space Odyssey (1968),25253.0,3.956
2010: The Year We Make Contact (1984),4143.0,3.388
28 Days (2000),4670.0,3.092
28 Days Later (2002),12170.0,3.752


In [23]:
popularMovies = movieStats['rating']['size'] >= 100 # Ignore movies rated by less than 100 people
movieStats[popularMovies].sort_values([('rating', 'mean')], ascending=False)[:15]

Unnamed: 0_level_0,rating,rating
Unnamed: 0_level_1,size,mean
title,Unnamed: 1_level_2,Unnamed: 2_level_2
"Shawshank Redemption, The (1994)",63366.0,4.447
"Godfather, The (1972)",41355.0,4.365
"Usual Suspects, The (1995)",47006.0,4.334
"Godfather: Part II, The (1974)",27398.0,4.276
Rear Window (1954),17449.0,4.271
Casablanca (1942),24349.0,4.258
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950),6525.0,4.257
One Flew Over the Cuckoo's Nest (1975),29932.0,4.248
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1964),23220.0,4.247
North by Northwest (1959),15627.0,4.234


In [24]:
data = movieStats[popularMovies].join(pd.DataFrame(similarMovies, columns=['similarity']))
data.head(10)



Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1984 (Nineteen Eighty-Four) (1984),3407.0,3.637,0.059
2001: A Space Odyssey (1968),25253.0,3.956,0.132
2010: The Year We Make Contact (1984),4143.0,3.388,0.196
28 Days (2000),4670.0,3.092,0.047
28 Days Later (2002),12170.0,3.752,0.124
"7th Voyage of Sinbad, The (1958)",1564.0,3.602,0.195
8MM (1999),4087.0,2.963,0.013
Abbott and Costello Meet Frankenstein (1948),959.0,3.419,0.224
"Abyss, The (1989)",17508.0,3.658,0.207
Ace Ventura: When Nature Calls (1995),20938.0,2.607,0.129


In [25]:
data.sort_values(['similarity'], ascending=False)[:10]

Unnamed: 0_level_0,"(rating, size)","(rating, mean)",similarity
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Star Wars: Episode IV - A New Hope (1977),54502.0,4.191,1.0
Star Wars: Episode V - The Empire Strikes Back (1980),45313.0,4.188,0.752
Star Wars: Episode VI - Return of the Jedi (1983),46839.0,4.005,0.687
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981),43295.0,4.219,0.473
Star Wars: Episode I - The Phantom Menace (1999),29574.0,3.081,0.399
"Lord of the Rings: The Fellowship of the Ring, The (2001)",37553.0,4.138,0.364
"Lord of the Rings: The Two Towers, The (2002)",33947.0,4.108,0.363
"Lord of the Rings: The Return of the King, The (2003)",31577.0,4.142,0.358
Indiana Jones and the Last Crusade (1989),31280.0,4.008,0.355
Superman (1978),15089.0,3.398,0.331
