In [17]:
# Import the necessary libraries
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [18]:
# Import the movies.csv dataset into a DataFrame
ML_movies = pd.read_csv('ml-latest-small/movies.csv')

# Remove the year of release placed next to each movie title, e.g. "Toy Story (1995)" -> "Toy Story".
# regex=True must be passed explicitly: since pandas 2.0 Series.str.replace defaults to a literal
# (non-regex) match, which would silently leave every title unchanged. The pattern is a raw string
# so that the backslash-escaped parentheses reach the regex engine untouched.
ML_movies['title'] = ML_movies['title'].str.replace(r' \([0-9]+\)', '', regex=True)

# Print the first five titles and their pipe-delimited genres
ML_movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji,Adventure|Children|Fantasy
2,3,Grumpier Old Men,Comedy|Romance
3,4,Waiting to Exhale,Comedy|Drama|Romance
4,5,Father of the Bride Part II,Comedy


In [19]:
# Load tags.csv into its own DataFrame (read_csv already returns a DataFrame)
ML_tags = pd.read_csv('ml-latest-small/tags.csv')

# Preview the first ten rows: each row is one free-form tag that a user attached to a movie,
# so the same movie can carry several different tags.
ML_tags.head(n=10)

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
5,2,89774,Tom Hardy,1445715205
6,2,106782,drugs,1445715054
7,2,106782,Leonardo DiCaprio,1445715051
8,2,106782,Martin Scorsese,1445715056
9,7,48516,way too long,1169687325


In [20]:
# Group all tags by the movie that they are describing
groupedTags = ML_tags.groupby('movieId')

# Build a movieId -> "Genre1 Genre2 ..." lookup once, instead of scanning ML_movies for every group.
# drop_duplicates keeps the first row per movieId, and missing genres become an empty string.
# regex=False is required because '|' would otherwise be read as regex alternation.
genreLookup = (
    ML_movies.drop_duplicates('movieId')
    .set_index('movieId')['genres']
    .fillna('')
    .str.replace('|', ' ', regex=False)
    .to_dict()
)

data = []

# For each pair of movie IDs and grouped tags
for ID, tagGroup in groupedTags:
    tagText = tagGroup['tag'].str.cat(sep=" ")       # Join the movie's tags into one space-separated string
    genres = genreLookup.get(ID, '')                 # Empty when the tagged movie is absent from movies.csv
    if genres:
        tagText = tagText + " " + genres             # Append genres to the tag string
    data.append({'movieId': ID, 'tags': tagText})    # Add the movie:tags pair to a list

# Convert 'data' into a DataFrame (columns are given explicitly so an empty result keeps its schema)
df_tags_genres = pd.DataFrame(data, columns=['movieId', 'tags'])

# We can see a longer listing of genres and tags per each movie
df_tags_genres.head(5)

Unnamed: 0,movieId,tags
0,1,pixar pixar fun Adventure Animation Children C...
1,2,fantasy magic board game Robin Williams game A...
2,3,moldy old Comedy Romance
3,5,pregnancy remake Comedy
4,7,remake Comedy Romance


In [21]:
# Import a set of stopwords ('the', 'a', 'and', etc.)
# Requires the NLTK 'stopwords' corpus to be available locally.
stopWords = set(stopwords.words('english'))

# Initialize the Count Vectorizer, making sure that it ignores any stop words (semantically inconsequential).
# scikit-learn >= 1.2 validates stop_words and accepts only 'english', a list, or None, so the set is
# converted to a list; sorting it keeps the argument deterministic from run to run.
vectorizer = CountVectorizer(stop_words=sorted(stopWords))

# Using the Vectorizer, create a sparse matrix that lists the frequency of a tag per movie
# (one row per movie in df_tags_genres, one column per vocabulary term)
tag_counter = vectorizer.fit_transform(df_tags_genres['tags'])

# Show the non-zero counts of the first movie: a term that occurs twice in its tag string
# (e.g. 'pixar' in the recorded output) has a count of 2, all other listed terms a count of 1
print(tag_counter[0])

  (0, 1162)	2
  (0, 597)	1
  (0, 36)	1
  (0, 76)	1
  (0, 277)	1
  (0, 320)	1
  (0, 538)	1


In [22]:
# Tag frequency should not matter. Create a sparse matrix that 'flags' a tag if it appears for a movie at least once.
# The comparison builds a NEW sparse matrix, so tag_counter keeps its original counts and this cell
# can be re-run safely (plain assignment would only alias tag_counter and overwrite it in place).
tag_occurrence = (tag_counter > 0).astype(tag_counter.dtype)

# A term counted more than once for the first movie is now only accounted for once
print(tag_occurrence[0])

  (0, 1162)	1
  (0, 597)	1
  (0, 36)	1
  (0, 76)	1
  (0, 277)	1
  (0, 320)	1
  (0, 538)	1


In [24]:
# Pairwise cosine similarity between the tag-occurrence vectors of all movies:
# entry [i, j] compares row i of tag_occurrence with row j.
tag_similarity = cosine_similarity(X=tag_occurrence)

# The diagonal is all ones because every movie is compared with itself there
print(tag_similarity)

[[1.         0.40089186 0.18898224 ... 0.12598816 0.25197632 0.26726124]
 [0.40089186 1.         0.         ... 0.         0.23570226 0.        ]
 [0.18898224 0.         1.         ... 0.16666667 0.         0.1767767 ]
 ...
 [0.12598816 0.         0.16666667 ... 1.         0.33333333 0.47140452]
 [0.25197632 0.23570226 0.         ... 0.33333333 1.         0.35355339]
 [0.26726124 0.         0.1767767  ... 0.47140452 0.35355339 1.        ]]


In [25]:
# User enters a movie title (needs to match a title in the movies.csv dataset exactly)
movieTitle = input('Select a favorite movie: ')

# Retrieves the movie ID (first match if several movies share the title)
titleMatches = ML_movies.loc[ML_movies['title'] == movieTitle, 'movieId']
if titleMatches.empty:
    raise ValueError(f'No movie titled {movieTitle!r} was found in movies.csv')
movieId = titleMatches.values[0]

# Retrieves the row POSITION of the movie ID in df_tags_genres, which is the row/column of that movie
# in tag_similarity. Movies that nobody tagged have no row and therefore cannot be compared.
matchPositions = (df_tags_genres['movieId'] == movieId).to_numpy().nonzero()[0]
if len(matchPositions) == 0:
    raise ValueError(f'{movieTitle!r} has no tags, so no recommendations can be computed')
movieIndex = int(matchPositions[0])

# Gathers the cosine similarity scores of that movie ID
movieScores = tag_similarity[movieIndex, :]

# Ranks all movies from most to least similar (computed once; the stable sort makes ties deterministic),
# then removes the selected movie by position. Dropping "the last element" instead would be wrong
# whenever another movie ties with a score of 1.0.
ranking = (-movieScores).argsort(kind='stable')
ranking = ranking[ranking != movieIndex][:5]

# Retrieves the corresponding movie IDs with the five-highest scores, excluding itself
bestMovies = df_tags_genres.iloc[ranking]['movieId'].tolist()

print('\nRecommended movies:')
for bestId in bestMovies:
    print(ML_movies.loc[ML_movies['movieId'] == bestId, 'title'].values[0])

Select a favorite movie: Shrek

Recommended movies:
Toy Story
Porco Rosso (Crimson Pig) (Kurenai no buta)
Shrek 2
Lady and the Tramp
Sinbad: Legend of the Seven Seas
