In [10]:
# Import the necessary libraries.
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Import the movies.csv dataset into a Pandas DataFrame.
moviesML = pd.read_csv('ml-latest-small/movies.csv')

# Remove the year of release placed next to each movie title, e.g. "Toy Story (1995)" -> "Toy Story".
# The pattern is a raw string so that '\(' is not an invalid string escape, and regex=True is passed
# explicitly: pandas 2.0 changed the default of Series.str.replace to regex=False, under which the
# pattern would be searched for literally and every title would be left unchanged.
moviesML['title'] = moviesML['title'].str.replace(r' \([0-9]+\)', '', regex=True)

# We can see that film titles are accompanied by an ID and a list of pipe-delimited genre labels.
moviesML.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [12]:
# Import the tags.csv dataset into a separate Pandas DataFrame.
# (read_csv already returns a new DataFrame, so no empty placeholder needs to be created first.)
tagsML = pd.read_csv('ml-latest-small/tags.csv')

# Each row is one free-text tag that one user attached to one movie, so a single movie
# can carry many tags, including the same tag repeated by different users.
tagsML.head(10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [13]:
# Group all tags, respective to each movie that they are describing.
groupBy = tagsML.groupby('movieId')

# Build the movieId -> genres lookup once, instead of scanning all of moviesML with a boolean
# mask on every loop iteration. drop_duplicates keeps the first row per ID, which is the row
# that a positional [0] lookup on the filtered frame would have selected.
genresById = moviesML.drop_duplicates('movieId').set_index('movieId')['genres']

data = []

# For each movie ID and its group of tag rows:
for movieId, tags in groupBy:
    # Join every tag given to this movie into one space-separated string.
    tagText = tags['tag'].str.cat(sep=" ")

    # Turn the pipe-delimited genre labels into space-separated words. A tagged movie that is
    # missing from moviesML contributes no genres rather than aborting the whole loop.
    genres = genresById.get(movieId, "").replace("|", " ")

    # The movie's "document" is its tags followed by its genres.
    document = tagText + " " + genres if genres else tagText
    data.append({'movieId': movieId, 'tags': document})

# Convert 'data' into a Pandas DataFrame (one row per tagged movie, in ascending movieId order).
# The columns are named explicitly so the frame has the expected shape even when 'data' is empty.
df_tags_genres = pd.DataFrame(data, columns=['movieId', 'tags'])

# We can now see a longer group of genres and tags, per each movie.
df_tags_genres.head(5)

Unnamed: 0,movieId,tags
0,1,pixar pixar fun Adventure Animation Children C...
1,2,fantasy magic board game Robin Williams game A...
2,3,moldy old Comedy Romance
3,5,pregnancy remake Comedy
4,7,remake Comedy Romance


In [14]:
# Import a set of stopwords ('the', 'this', 'and', etc.)
# NOTE: this needs the NLTK 'stopwords' corpus to be available locally; NLTK raises a LookupError otherwise.
stopWords = set(stopwords.words('english'))

# Initialize a Count Vectorizer that ignores all stop words, as they hold no semantic value.
# The words are passed as a list: scikit-learn validates constructor parameters (since 1.2) and
# accepts only 'english', a list, or None for stop_words, so a set is rejected when fitting.
# Sorting just makes the list order deterministic.
vectorizer = CountVectorizer(stop_words=sorted(stopWords))

# Using the Vectorizer, create a sparse matrix that computes the frequency of all tags per movie.
# Rows follow the row order of df_tags_genres; columns are the words of the fitted vocabulary.
tagCounter = vectorizer.fit_transform(df_tags_genres['tags'].values)

# Show the non-zero counts of the first movie (Toy Story) as (row, vocabulary column) -> count.
# In the run recorded below, column 1162 ('pixar') is counted twice; the exact column numbers
# depend on the fitted vocabulary. Words that never occur for the movie are simply not listed.
print(tagCounter[0])

  (0, 1162)	2
  (0, 597)	1
  (0, 36)	1
  (0, 76)	1
  (0, 277)	1
  (0, 320)	1
  (0, 538)	1


In [15]:
# Tag frequency shouldn't matter (Toy Story isn't twice as 'pixar' as Up). Flag a tag if it occurs for a movie at least once.
# The comparison builds a NEW sparse matrix, so tagCounter keeps its original counts. Binding
# tagFlagger to tagCounter and assigning in place would overwrite the counts too, because both
# names would refer to the same object, and re-running earlier cells' displays would then show flags.
# Casting back to tagCounter's dtype turns the boolean flags into 0/1 integers.
tagFlagger = (tagCounter > 0).astype(tagCounter.dtype)

# For Toy Story, the tag that was counted twice ('pixar') is now flagged only once.
print(tagFlagger[0])

  (0, 1162)	1
  (0, 597)	1
  (0, 36)	1
  (0, 76)	1
  (0, 277)	1
  (0, 320)	1
  (0, 538)	1


In [16]:
# Score every pair of movies by the cosine of the angle between their tag-flag vectors.
# Which tags are flagged sets a vector's 'direction', and how many are flagged sets its length,
# so two movies score high when most of their tags are shared. With only one matrix supplied,
# each row is compared against every row of that same matrix, giving a square result.
tag_similarity = cosine_similarity(X=tagFlagger)

# Every entry on the diagonal is 1 because there a movie's vector is compared with itself.
print(tag_similarity)

[[1.         0.40089186 0.18898224 ... 0.12598816 0.25197632 0.26726124]
 [0.40089186 1.         0.         ... 0.         0.23570226 0.        ]
 [0.18898224 0.         1.         ... 0.16666667 0.         0.1767767 ]
 ...
 [0.12598816 0.         0.16666667 ... 1.         0.33333333 0.47140452]
 [0.25197632 0.23570226 0.         ... 0.33333333 1.         0.35355339]
 [0.26726124 0.         0.1767767  ... 0.47140452 0.35355339 1.        ]]


In [17]:
# Create a set of all unique movie titles.
movieSet = set(moviesML['title'])

# Only movies that received at least one tag have a row in df_tags_genres, and therefore a row in
# tag_similarity. Accepting any title from moviesML would crash with an IndexError further down
# for untagged movies, so the input is validated against the tagged subset.
taggedMovies = moviesML[moviesML['movieId'].isin(df_tags_genres['movieId'])]
taggedTitles = set(taggedMovies['title'])

# Let the user enter a movie title; it must match the title of a tagged movie listed in moviesML.
# Surrounding whitespace is ignored; the match itself is exact and case-sensitive.
movieTitle = input('What is a favorite movie of yours? ').strip()

while movieTitle not in taggedTitles:
    if movieTitle in movieSet:
        # The title is known but has no tag vector, so no similarity scores exist for it.
        print("\nThis title has no tags in our database, so we cannot find similar movies for it.")
    else:
        print("\nThis title does not exist in our database.")
    movieTitle = input("Please enter a different title: ").strip()

# Several movies can share a title once the year is stripped (e.g. remakes); take the first tagged one.
movieId = taggedMovies[taggedMovies['title']==movieTitle]['movieId'].values[0]

# Retrieve the row position of the movie ID in df_tags_genres, which is also its row in tag_similarity.
# argmax on the boolean mask returns the position of the first match.
movieIndex = (df_tags_genres['movieId'] == movieId).values.argmax()

# Gather the cosine similarities assigned to that movie index.
movieScores = tag_similarity[movieIndex, :]

# Retrieve the movie IDs with the five-highest scores, excluding the movie itself.
# The sort is done once, and the movie's own row is removed explicitly: another movie with an
# identical tag set also scores 1.0, so the movie itself is not guaranteed to sort into the last slot.
ranking = movieScores.argsort()[::-1]
ranking = ranking[ranking != movieIndex][:5]
bestMovies = df_tags_genres['movieId'].iloc[ranking].tolist()

# Offer the user the most similar movies (fewer than five if the catalogue is that small).
print ('\nWe think that you should watch...')
titleById = moviesML.drop_duplicates('movieId').set_index('movieId')['title']
for bestId in bestMovies:
    print(titleById[bestId])

What is a favorite movie of yours? This Movie Does Not Exist

This title does not exist in our database.
Please enter a different title: Shrek

We think that you should watch...
Toy Story
Porco Rosso (Crimson Pig) (Kurenai no buta)
Shrek 2
Lady and the Tramp
Sinbad: Legend of the Seven Seas
