## EECS 731 Project 3: Clustering
### by Matthew Taylor

### Import required modules

In [1]:
import random
import pandas as pd
from statistics import mean
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

### Load Datasets
This dataset contains three different CSV files. The first contains a mapping between movie titles and unique identifiers, as well as which genres apply to each movie. The second contains user ratings for each movie. The third contains keywords from reviews about the movie.

In [2]:
# Movie metadata: one row per movie with its id, title and pipe-separated genres.
movies_csv_path = 'data/movies.csv'
movies_df = pd.read_csv(movies_csv_path)
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# User ratings: one row per (userId, movieId) rating event.
ratings_csv_path = 'data/ratings.csv'
ratings_df = pd.read_csv(ratings_csv_path)
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
# Free-text tags: one row per tag a user attached to a movie.
tags_csv_path = 'data/tags.csv'
tags_df = pd.read_csv(tags_csv_path)
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


### Split and encode genres
First, we must transform the provided information into a format that our models can accept. Here, I split the values in the genres column from the first dataframe into a binary format.

In [5]:
# Genre labels that receive an indicator column, in column order. A label that
# is not listed here gets no column. The dict keys are reused by later cells
# to select the feature columns.
genre_names = ["Action", "Adventure", "Animation", "Children", "Comedy",
               "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir",
               "Horror", "Musical", "Mystery", "Romance", "Sci-Fi",
               "Thriller", "War", "Western"]
genres = {name: [] for name in genre_names}

# Split every pipe-separated genre string exactly once, then derive each
# indicator column from the pre-split lists instead of walking the frame row
# by row with iterrows().
genres_per_movie = [genre_string.split('|') for genre_string in movies_df['genres']]

for key in genres:
    genres[key] = [1 if key in movie_genres else 0 for movie_genres in genres_per_movie]
    movies_df[key] = genres[key]

# The raw string column is redundant once the indicator columns exist.
# NOTE: this consumes `genres`, so the cell can only run once per load of movies_df.
movies_df = movies_df.drop(columns="genres")

movies_df.head()

Unnamed: 0,movieId,title,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0
1,2,Jumanji (1995),0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale (1995),0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II (1995),0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0


### Calculate average rating for each movie
Next, I aggregate user ratings for each film and calculate the average rating. This information will be used to recommend films with similar ratings.

In [6]:
# Kept because the tag-grouping cell further down iterates over these ids.
movie_ids = movies_df.movieId.values

# Mean rating per movie, computed in a single grouped pass over the ratings
# instead of re-filtering the whole ratings frame once for every movie.
mean_rating_by_movie = ratings_df.groupby('movieId')['rating'].mean()

# Movies without any rating are absent from the grouped result; they keep the
# 0 placeholder. Note that 0 is a sentinel here, not a real rating.
average_ratings = movies_df['movieId'].map(mean_rating_by_movie).fillna(0).tolist()

# DataFrame.insert raises if the column already exists, so drop a stale copy
# first to keep this cell re-runnable.
if 'avg_rating' in movies_df.columns:
    movies_df = movies_df.drop(columns='avg_rating')

movies_df.insert(2, 'avg_rating', average_ratings)

### Group tags for each movie
The last step of the feature engineering process involves parsing and combining the information in the tags column of the third dataframe. Since tags for each movie can be spread out across multiple rows, appending the concatenation of all of these tags to their corresponding movie will simplify things later on.

In [7]:
# Concatenate all tags of a movie into one space-separated string using a
# single grouped pass; within each movie the tags keep their row order.
# Missing tags are dropped and the rest coerced to str so that ' '.join cannot
# fail on a non-string value.
tags_by_movie = (
    tags_df.dropna(subset=['tag'])
           .assign(tag=lambda frame: frame['tag'].astype(str))
           .groupby('movieId')['tag']
           .agg(' '.join)
)

# Movies that nobody tagged get an empty string.
tags = movies_df['movieId'].map(tags_by_movie).fillna('').tolist()

movies_df['tags'] = tags

### Inspect resulting dataframe
Now we have all of the relevant information in one place.

In [8]:
# Preview the combined frame: average rating, genre indicator columns and tags.
movies_df.head()

Unnamed: 0,movieId,title,avg_rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western,tags
0,1,Toy Story (1995),3.92093,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,pixar pixar fun
1,2,Jumanji (1995),3.431818,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,fantasy magic board game Robin Williams game
2,3,Grumpier Old Men (1995),3.259615,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,moldy old
3,4,Waiting to Exhale (1995),2.357143,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,
4,5,Father of the Bride Part II (1995),3.071429,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,pregnancy remake


### Attempt clustering based on tags
My first attempt at clustering movies was based on using tags. It would stand to reason that two closely related funny movies would contain words like 'hysterical' or 'hilarious'. Recommending related movies like this is fairly logical. Unfortunately, only a small subset of the movies in this dataset have any tags at all.

In [9]:
# Tags are sparse, so clustering on them may not yield the best results.
# Keep only the non-empty tag strings and report how many movies have one.

has_tags = movies_df.tags != ''
tags = movies_df.loc[has_tags, 'tags'].values

tagged_count = len(tags)
total_count = len(movies_df)
print('{} movies out of {} have any tags at all'.format(tagged_count, total_count))

1572 movies out of 9742 have any tags at all


### Create word vectorizer and encode tags
The TF-IDF vectorizer transforms the tags into a form that can be clustered by the K-Means algorithm.

In [10]:
# Fit a TF-IDF vocabulary on the non-empty tag strings (English stop words
# removed) and encode them; `x` has one row per entry of `tags`.
v = TfidfVectorizer(stop_words='english')
x = v.fit_transform(tags)

### Cluster movies based on tags using KMeans
Clustering movies based on tags gives less than ideal performance. The model tends to group a large majority of the movies in a single cluster. This behavior persists through parameter manipulation.

In [11]:
# Number of clusters; reused by every K-Means model and cluster count below.
n_clusters = 15

# n_init is pinned to 10 (the long-standing default) because newer
# scikit-learn releases changed the default to 'auto'; pinning keeps the
# clustering stable across library versions and avoids the FutureWarning.
model = KMeans(n_clusters=n_clusters, n_init=10, random_state=1).fit(x)

In [12]:
# A large majority of the movies are placed in one cluster, perhaps clustering based on tags isn't the best option

movies_in_each_cluster = [0] * n_clusters

# `x` already holds the TF-IDF encoding of every entry in `tags`, so assign
# all of them in one batched predict instead of re-encoding and predicting
# one document at a time.
for predicted_cluster in model.predict(x):
    movies_in_each_cluster[predicted_cluster] += 1

movies_in_each_cluster

[53, 1145, 19, 20, 119, 22, 30, 43, 24, 11, 26, 7, 8, 33, 12]

### Attempt to cluster based on numerical features
Since clustering based on tags was unsuccessful, the next thing I try is clustering based on everything else: the binary genre indicators and the average rating. The binary nature of the genres should lead to better-separated clusters. The ratings feature may add some additional quality (e.g. a critically-acclaimed children's movie will not be grouped together with a low-rated children's movie).

Here, I create the K-Means model and create the various clusters.

In [13]:
# Average ratings and genres may cluster movies more appropriately

# Feature matrix: one row per movie, avg_rating followed by the genre flags.
# NOTE: avg_rating is not rescaled here, so it is on a different scale from the 0/1 flags.
clustering_columns = movies_df[['avg_rating'] + list(genres.keys())].to_numpy()

# This replaces the tag-based model held in `model`. n_init is pinned to 10
# (the long-standing default) so results stay stable across scikit-learn
# versions, whose default later changed to 'auto'.
model = KMeans(n_clusters=n_clusters, n_init=10, random_state=1).fit(clustering_columns)

In [14]:
# A K-Means model with the same settings yields a much more uniform distribution, which bodes well

movies_in_each_cluster = [0] * n_clusters

# One batched predict over the whole feature matrix instead of one call per row.
for predicted_cluster in model.predict(clustering_columns):
    movies_in_each_cluster[predicted_cluster] += 1

movies_in_each_cluster

[811, 1257, 449, 970, 587, 665, 465, 439, 506, 583, 505, 433, 757, 556, 759]

### Cluster similar movies
Here, I record which cluster each movie belongs to.

In [15]:
# movie_clusters[c] lists the titles assigned to cluster c, in dataframe order.
movie_clusters = [[] for _ in range(n_clusters)]

# Predict every movie's cluster in one batched call on a numeric matrix,
# rather than iterating rows and predicting one object-dtype row at a time.
feature_columns = ['avg_rating'] + list(genres.keys())
predicted_clusters = model.predict(movies_df[feature_columns].to_numpy())

for movie_title, cluster in zip(movies_df['title'], predicted_clusters):
    movie_clusters[cluster].append(movie_title)

### Define a function that uses our clustering model to make recommendations
This function takes the title of a movie in the dataset and prints three randomly chosen movies from the input movie's cluster as recommendations.

In [16]:
def recommend_movies(title, n_recommendations=3):
    """Print random recommendations drawn from the cluster of ``title``.

    The movie is looked up in ``movies_df``, assigned to a cluster by the
    fitted ``model``, and up to ``n_recommendations`` other titles from that
    cluster are printed as a list. The sample is random, so the output varies
    between calls unless the ``random`` module has been seeded.

    Args:
        title: Exact title of a movie present in ``movies_df``.
        n_recommendations: Maximum number of titles to print (default 3).
            Fewer are printed when the cluster holds fewer other movies.

    Raises:
        ValueError: If no movie in ``movies_df`` has this title.
    """
    matches = movies_df.loc[movies_df.title == title]
    if matches.empty:
        raise ValueError('No movie titled {!r} in the dataset'.format(title))

    # Titles are not guaranteed to be unique; take the first match so the
    # model always receives exactly one feature row.
    feature_columns = ['avg_rating'] + list(genres.keys())
    model_input = matches.iloc[[0]][feature_columns].to_numpy().reshape(1, -1)
    cluster = model.predict(model_input)[0]

    # Never recommend the movie the user asked about, and never request more
    # samples than the cluster can supply (random.sample would raise).
    candidates = [other for other in movie_clusters[cluster] if other != title]
    print(random.sample(candidates, min(n_recommendations, len(candidates))))

### Testing the model
This is the expected behavior of a movie recommendation system.

In [17]:
# Expect children's movies: Toy Story carries the Children genre flag.
# The printed titles are a random sample, so they differ between runs.
recommend_movies('Toy Story (1995)')

['New Adventures of Pippi Longstocking, The (1988)', 'Who Framed Roger Rabbit? (1988)', 'Frozen (2013)']


In [18]:
# Expect action movies
# The printed titles are a random sample, so they differ between runs.
recommend_movies('John Wick (2014)')

['15 Minutes (2001)', 'Marnie (1964)', 'Death Sentence (2007)']
