A content-based recommender is a recommendation system that relies on the similarity between items when it recommends items to users. For example, when a user likes a specific movie, the system finds and recommends other items whose features are similar to those of the movie the user likes.

In [None]:
# Content-based filtering recommender (item similarity, not collaborative filtering)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the movies table from S3 into a pandas DataFrame.
# boto3 is the S3 client library; io wraps the downloaded bytes in a file-like object for pandas.
import boto3
import io

# bucket name and object key of the CSV file to read
bucket='sagemaker-sangam-2021'
movie_file = 'movies.csv'

# create S3 client
s3_client = boto3.client('s3')

# fetch the object; its content is exposed through obj['Body']
obj = s3_client.get_object(Bucket=bucket, Key=movie_file)

# read the whole body into memory and parse it as CSV
movies = pd.read_csv(io.BytesIO(obj['Body'].read()))

# preview the first rows
movies.head()

In [None]:
# (number of rows, number of columns) of the loaded movies table
movies.shape

In [None]:
# build a one-column table with the number of missing values per feature
def create_missing_df(dataframe):
  """Return a DataFrame indexed by the column names of `dataframe`.

  The single column 'Missing' holds, for each column of `dataframe`,
  the number of null entries found in it.
  """
  null_counts = dataframe.isnull().sum()
  column_names = list(dataframe.columns)
  return pd.DataFrame({'Missing': null_counts.tolist()}, index=column_names)

In [None]:
# missing-value counts per column of the movies table as loaded
create_missing_df(movies)

The `title` column embeds both the movie title and its release year, so we create two functions to split the title and the year apart.

In [None]:
# the function to extract titles
def extract_title(title):
  """Return `title` with a trailing parenthesized year removed.

  A suffix of the form ' (YYYY)' or ' (YYYY-YYYY)' (hyphen or en dash) at the
  very end of the string is stripped together with the whitespace before it.
  Some movies do not carry a year in the title column; those strings are
  returned unchanged.

  The suffix is matched as a whole, parentheses included, so short or
  digit-only titles such as '1984' are not mistaken for a year and truncated.
  """
  import re  # local import keeps this cell self-contained

  year_suffix = re.search(r'\s*\((?:[0-9]{4}\s*[-\u2013]\s*)?[0-9]{4}\)$', title)

  if year_suffix is None:
    return title

  # keep everything before the whitespace that precedes the opening parenthesis
  return title[:year_suffix.start()]

In [None]:
# the function to extract years
def extract_year(title):
  """Return the year found in a trailing '(YYYY)' suffix of `title`.

  For a range suffix such as '(2006-2007)' (hyphen or en dash) the last year
  is returned. Some movies do not carry a year in the title column; for those
  the function returns np.nan.

  The suffix is matched as a whole, parentheses included, so short or
  digit-only titles such as '1984' do not yield a bogus year, and only ASCII
  digits are accepted, so int() cannot fail on the captured text.
  """
  import re  # local import keeps this cell self-contained

  year_suffix = re.search(r'\((?:[0-9]{4}\s*[-\u2013]\s*)?([0-9]{4})\)$', title)

  if year_suffix is None:
    return np.nan

  return int(year_suffix.group(1))

In [None]:
# Keep the raw "Title (Year)" string under title_year, then derive separate title and year columns from it.
movies.rename(columns={'title':'title_year'}, inplace=True) # change the column name from title to title_year
movies['title_year'] = movies['title_year'].apply(lambda x: x.strip()) # remove leading and trailing whitespace so the year suffix sits at the very end
movies['title'] = movies['title_year'].apply(extract_title) # title with the year suffix removed
movies['year'] = movies['title_year'].apply(extract_year) # release year; NaN where extract_year finds no year

In [None]:
# re-check missing values now that the title and year columns exist (year is NaN where no year was found)
create_missing_df(movies)

The `genres` column is the only feature used by this recommender system.

In [None]:
# count the rows whose genres field holds the '(no genres listed)' placeholder
r, c = movies.loc[movies['genres'].eq('(no genres listed)')].shape
print('The number of movies which do not have info about genres:', r)

In [None]:
# genre is the main feature, so drop every movie that has no genre listed
# and renumber the rows 0..n-1
has_genre = movies['genres'] != '(no genres listed)'
movies = movies.loc[has_genre].reset_index(drop=True)

In [None]:
# a movie can have several genres; show title and genres for the first 5 movies
movies[['title','genres']].head(5)

Let's see how many times each genre appears in the data.

In [None]:
# replace the '|' separators in the genres column with spaces
# regex=False treats '|' as a literal character; as a regular expression it is
# the alternation operator, and the default of `regex` differs across pandas versions
movies['genres'] = movies['genres'].str.replace('|', ' ', regex=False)

In [None]:
# count the number of occurrences for each genre in the data set
from collections import Counter

# Iterate the genres column directly (no per-row .loc lookup) and let Counter do the tallying.
# Counter keeps first-seen order; converting to dict leaves `counts` a plain dict for later cells.
counts = dict(Counter(
  genre
  for genre_string in movies['genres']
  for genre in genre_string.split(' ')
))

In [None]:
# bar chart of how often each genre occurs
plt.figure(figsize=(12,6))
# pass plain lists for both axes: a dict view is not a sequence and
# some Matplotlib versions cannot convert it to bar heights
plt.bar(list(counts.keys()), list(counts.values()), color='g')
plt.xticks(rotation=45)
plt.xlabel('Genres')
plt.ylabel('Counts')

Term Frequency and Inverse Document Frequency (tf-idf)

The TfidfVectorizer() class from the sklearn.feature_extraction.text module helps us to calculate the tf-idf scores for each genre in each movie.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# rewrite the two hyphenated genres as single words before vectorizing
movies['genres'] = (
  movies['genres']
  .str.replace('Sci-Fi','SciFi')
  .str.replace('Film-Noir','Noir')
)

# build the vectorizer, then learn the vocabulary and compute the tf-idf matrix from the genres column
tfidf_vector = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_vector.fit_transform(movies['genres'])

In [None]:
# list every (column index, genre token) pair of the fitted vocabulary
# get_feature_names() was removed in scikit-learn 1.2 in favour of get_feature_names_out();
# fall back to the old name only on versions that do not have the new one
if hasattr(tfidf_vector, 'get_feature_names_out'):
  feature_names = tfidf_vector.get_feature_names_out()
else:
  feature_names = tfidf_vector.get_feature_names()
print(list(enumerate(feature_names)))

Let's see the examples of the vectorization for the first 5 movies.

In [None]:
# print the tf-idf entries of the first 5 rows (movies) of the matrix
print(tfidf_matrix[:5])

The first number in each index pair above is the row index in the dataframe movies (not movieId), and the second number is the index of the genre. The genre indices correspond to the index numbers in the previous code output: [(0, 'action'), (1, 'adventure'), (2, 'animation'), (3, 'children'),...]. For example, (0,8) = 0.4830 means that movie index = 0 (Toy Story), genre index = 8 ('fantasy'), and the tf-idf score = 0.4830. The tf-idf score of comedy for Toy Story is (0,4) = 0.2674. As mentioned earlier, comedy has a lower score than fantasy because it is a very common genre in the whole data set, and is therefore considered less important in defining the genre of the movie.

In [None]:
# shape of the tf-idf matrix: one row per movie in the genres column, one column per vocabulary token
tfidf_matrix.shape

In [None]:
# the first row vector of tfidf_matrix (Toy Story)
# slice the sparse row first so that only this one row is converted to dense form,
# instead of densifying the whole matrix just to read its first row
tfidf_matrix[0].todense()

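Next, let's calculate the similarity between two movies using the tf-idf matrix. Cosine similarity is used as the similarity measure, and the linear_kernel() function computes it: TfidfVectorizer normalizes every row vector to unit length by default, so the plain dot product returned by linear_kernel() equals the cosine similarity.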

In [None]:
from sklearn.metrics.pairwise import linear_kernel

In [None]:
# pairwise dot products between all tf-idf row vectors: sim_matrix[i][j] scores movie i against movie j
# NOTE(review): this equals cosine similarity only while the tf-idf rows are unit length (TfidfVectorizer's default norm='l2') - confirm if the vectorizer settings change
# NOTE(review): the result has one row and one column per movie, so its size grows quadratically with the number of movies
sim_matrix = linear_kernel(tfidf_matrix,tfidf_matrix) # create the cosine similarity matrix
print(sim_matrix)

sim_matrix$_{ij}$ is the similarity score between movie $i$ and movie $j$. The diagonal elements of the matrix are therefore the similarity scores of each movie with itself, so their value should be 1. By the definition of the matrix, it is symmetric: sim_matrix$_{ij}$ = sim_matrix$_{ji}$.

The Recommendation Engine

In [None]:
# look up the raw "title (year)" string of a movie by its dataframe index
def get_title_year_from_index(index):
  """Return the title_year value of the first row of `movies` whose index equals `index`.

  Raises IndexError when no row has that index.
  """
  row_mask = movies.index == index
  matching_title_years = movies.loc[row_mask, 'title_year']
  return matching_title_years.values[0]

# look up the dataframe index of a movie by its (year-free) title
def get_index_from_title(title):
  """Return the index of the first row of `movies` whose title equals `title`.

  The comparison is an exact string match. Raises IndexError when no row matches.
  """
  title_mask = movies.title == title
  matching_rows = movies.loc[title_mask]
  return matching_rows.index.values[0]

# look up the (year-free) title of a movie by its dataframe index
def get_title_from_index(index):
  """Return the title value of the first row of `movies` whose index equals `index`.

  Raises IndexError when no row has that index.
  """
  row_mask = movies.index == index
  matching_titles = movies.loc[row_mask, 'title']
  return matching_titles.values[0]

Apply Levenshtein Distance for 'Did you mean?' Algorithm

We often misspell movie titles when we use a movie recommender system. When we search for a movie on Google and misspell its title, Google asks us, 'Did you mean...?'. Let's create a similar function that corrects a misspelled title so that the system is easier to use. I apply the Levenshtein distance to implement the 'Did you mean...?' algorithm. The fuzzywuzzy library in Python provides the fuzz module for Levenshtein-distance-based matching.

In [None]:
!pip install fuzzywuzzy

In [None]:
!pip install python-Levenshtein

In [None]:
from fuzzywuzzy import fuzz

In [None]:
# score how closely two strings match
def matching_score(a,b):
  """Return the fuzz.ratio() score for the strings `a` and `b`."""
  similarity = fuzz.ratio(a, b)
  return similarity

fuzz.ratio(a,b) computes a similarity score between a and b based on the Levenshtein distance and returns it. If the two strings a and b are exactly the same, the score is 100. As the distance between the strings increases, the score falls.

In [None]:
# the function to return the most similar title to the words a user types
# Without this, the recommending system only works when the movie title a user types is exactly the same
# as the movie title the system has

def find_closest_title(title):
  """Return (closest_title, distance_score) for the text a user typed.

  Every title in `movies` is scored against `title` with matching_score().
  closest_title is the title with the highest score (the earliest row wins
  ties) and distance_score is that score.
  """
  leven_scores = movies['title'].apply(matching_score, b=title)

  # idxmax() returns the index *label* of the first maximum, which is what
  # get_title_from_index() expects, and it avoids sorting every score
  # just to read the top one.
  best_index = leven_scores.idxmax()
  closest_title = get_title_from_index(best_index)
  distance_score = int(leven_scores.max())

  return closest_title, distance_score

In [None]:
# the main recommender
def contents_based_recommender(movie_user_likes, how_many):
  """Print the movies most similar to the one the user typed.

  movie_user_likes: title typed by the user; it does not have to be spelled exactly.
  how_many: number of recommendations to print.

  The typed text is resolved to the closest known title with
  find_closest_title(). When the match is not exact (score below 100) a
  'Did you mean ...?' line is printed first. The remaining movies are then
  ranked by their similarity to the matched movie in sim_matrix, and the
  title_year of the top `how_many` is printed. Nothing is returned.

  Rows of sim_matrix are addressed by the movie's index in `movies`, so this
  relies on `movies` having the 0..n-1 index that sim_matrix was built from.
  """
  closest_title, distance_score = find_closest_title(movie_user_likes)

  # only an imperfect match needs the confirmation prompt; everything else is shared
  if distance_score != 100:
    print('Did you mean '+'\033[1m'+str(closest_title)+'\033[0m'+'?','\n')

  movie_index = int(get_index_from_title(closest_title))

  # rank all movies by similarity, highest first (the sort is stable, so ties keep index order)
  ranked_movies = sorted(enumerate(sim_matrix[movie_index]), key=lambda pair: pair[1], reverse=True)
  # remove the typed movie itself
  similar_movies = [pair for pair in ranked_movies if pair[0] != movie_index]

  print('Here\'s the list of movies similar to '+'\033[1m'+str(closest_title)+'\033[0m'+'.\n')

  for i, _ in similar_movies[:how_many]:
    print(get_title_year_from_index(i))

In [None]:
# testing the system: ask for 5 recommendations using a title typed as 'Monsters, Inc.'
# (an exact match prints no 'Did you mean' line - assumes the data set stores the title in this form)
contents_based_recommender('Monsters, Inc.', 5)

In [None]:
# testing with an incorrect name: the closest title is suggested via 'Did you mean ...?'
# before the 5 recommendations are printed

contents_based_recommender('Monster Incorporation.', 5)

In [None]:
# testing the system: ask for 5 recommendations using a title typed as 'Grumpier Old Men'
contents_based_recommender('Grumpier Old Men', 5)

In [None]:
# testing the system with a misspelled title: the closest title is suggested via 'Did you mean ...?'
contents_based_recommender('Grumpr Ol Men', 5)