In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import linear_kernel
from ast import literal_eval
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Directory that holds the CSV files of the movies dataset. It is defined once
# so the location can be changed in a single place.
# NOTE(review): this is still a machine-specific absolute path; point it at
# your own copy of the data before running.
DATA_DIR = "/home/kimberly/PycharmProjects/pythonProject2"

# Raw inputs: keywords, credits (cast/crew) and the main movie metadata.
movie_keywords = pd.read_csv(f"{DATA_DIR}/keywords.csv", low_memory=True)
movie_credit = pd.read_csv(f"{DATA_DIR}/credits.csv", low_memory=True)
movie_metadata = pd.read_csv(f"{DATA_DIR}/movies_metadata.csv", low_memory=True)

In [3]:
# Keep only the rows whose `id` parses as a number, so the integer cast that
# follows cannot fail. This replaces three hard-coded row labels
# (19730, 29503, 35587) that silently depend on the exact CSV export.
# NOTE(review): presumably those three labels were exactly the rows with a
# malformed `id` -- confirm this filter removes the same rows on your data.
has_numeric_id = pd.to_numeric(movie_metadata['id'], errors='coerce').notna()
movie_metadata1 = movie_metadata.loc[has_numeric_id].copy()

In [4]:
# Cast `id` to integers; it is the key used by the merges that follow.
movie_metadata1 = movie_metadata1.astype({'id': int})

In [5]:
# Attach the credits and then the keywords to the metadata, matching on `id`.
movie_ = (
    movie_metadata1
    .merge(movie_credit, on="id")
    .merge(movie_keywords, on="id")
)

In [6]:
# 'director' is only derived from 'crew' further down the notebook, so
# selecting it here raises KeyError on a fresh kernel (Restart & Run All).
# Peek at the raw 'crew' column it will be extracted from instead.
movie_['crew'].head()

In [None]:
# Summarise the merged frame: column names, non-null counts and dtypes.
movie_.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 46628 entries, 0 to 46627
Data columns (total 27 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  46628 non-null  object 
 1   belongs_to_collection  4574 non-null   object 
 2   budget                 46628 non-null  object 
 3   genres                 46628 non-null  object 
 4   homepage               8009 non-null   object 
 5   id                     46628 non-null  int64  
 6   imdb_id                46611 non-null  object 
 7   original_language      46617 non-null  object 
 8   original_title         46628 non-null  object 
 9   overview               45633 non-null  object 
 10  popularity             46624 non-null  object 
 11  poster_path            46229 non-null  object 
 12  production_companies   46624 non-null  object 
 13  production_countries   46624 non-null  object 
 14  release_date           46540 non-null  object 
 15  re

In [None]:
# Count the missing values in every column of the merged frame.
movie_.isna().sum()

In [None]:
def _parse_literal(value):
    """Parse a stringified Python literal; return any non-string value unchanged."""
    return literal_eval(value) if isinstance(value, str) else value


# These columns are parsed from their string form into Python objects
# (presumably lists of dicts, given how they are consumed later -- confirm).
# Only strings are parsed, so re-running the cell on already-parsed values,
# or hitting a missing value, no longer raises.
features = ['genres', 'cast', 'crew', 'keywords']
for feature in features:
    movie_[feature] = movie_[feature].apply(_parse_literal)

In [None]:
# Write the merged frame to a file named 'recommending_movies' in the current
# working directory.
# NOTE(review): the row index is written as an extra first column (no
# index=False), and the file name carries no .csv extension.
movie_.to_csv('recommending_movies')

In [None]:
import numpy as np


def get_director(x):
    """Return the name of the first crew entry whose job is 'Director'.

    Parameters
    ----------
    x : list of dict
        Crew entries; each is expected to carry 'job' and 'name' keys.

    Returns
    -------
    str or float
        The director's name, or ``np.nan`` when `x` is not a list, or when
        no entry has ``'job' == 'Director'``.
    """
    # Same policy as get_list(): missing or malformed data yields "no value"
    # instead of an exception in the middle of a DataFrame.apply().
    if not isinstance(x, list):
        return np.nan
    for member in x:
        # .get() tolerates entries that lack a 'job' or 'name' key.
        if isinstance(member, dict) and member.get('job') == 'Director':
            return member.get('name', np.nan)
    return np.nan

In [None]:
def get_list(x):
    """Return up to the first three 'name' values from a list of dicts.

    Any input that is not a list (missing or malformed data) yields an
    empty list.
    """
    if not isinstance(x, list):
        return []

    # Collect every name first, then keep at most the leading three.
    all_names = [entry['name'] for entry in x]
    return all_names[:3]

In [None]:
# Derive a `director` column from the crew entries, then reduce cast,
# keywords and genres to short lists of plain names.
movie_['director'] = movie_['crew'].apply(get_director)

features = ['cast', 'keywords', 'genres']
for feature in features:
    names_only = movie_[feature].apply(get_list)
    movie_[feature] = names_only

In [None]:
# Preview the extracted features for the first five movies.
movie_[['title', 'cast', 'director', 'keywords', 'genres']].head(5)

In [None]:
def clean_data(x):
    """Lower-case a value and strip its spaces so each name becomes one token.

    A list is cleaned element by element, a string is cleaned directly, and
    anything else (e.g. a missing director) becomes an empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    return ''

In [None]:
# Normalise every text feature (lower-case, spaces removed) in place.
features = ['cast', 'keywords', 'director', 'genres']

for feature in features:
    cleaned = movie_[feature].apply(clean_data)
    movie_[feature] = cleaned

In [None]:
def create_soup(x):
    """Build the space-separated "soup" of metadata tokens for one movie.

    Parameters
    ----------
    x : mapping (e.g. a DataFrame row)
        Must provide 'keywords' and 'genres' as lists of strings and
        'director' as a string.

    Returns
    -------
    str
        Keywords, director and genres joined by single spaces. Empty tokens
        (such as a missing director) are skipped.
    """
    # The earlier version had a misplaced parenthesis: it called " ".join()
    # on the *string* director + " " + genres, which put a space between
    # every character and destroyed the director and genre tokens.
    # NOTE(review): 'cast' is cleaned alongside these features but is not
    # part of the soup -- confirm that omission is intentional.
    tokens = [*x['keywords'], x['director'], *x['genres']]
    return " ".join(token for token in tokens if token)

In [None]:
# Create the 'soup' column by applying create_soup to every row (axis=1).
movie_['soup'] = movie_.apply(create_soup, axis = 1)

In [None]:
# Preview the generated soup strings for the first few movies.
movie_[['soup']].head()

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a TF-IDF vectoriser (English stop words removed) on the soup strings
# and transform them into a document-term matrix, one row per movie.
tfidf_count = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf_count.fit_transform(movie_['soup'])

In [None]:
# Dimensions of the TF-IDF matrix: (rows, columns).
tfidf_matrix.shape

In [None]:
from scipy.spatial.distance import hamming
# scipy's `hamming` measures the distance between two 1-D vectors and returns
# a single number, so it cannot produce a movie-by-movie matrix from the
# sparse TF-IDF matrix. The recommender needs pairwise *similarity* scores:
# TfidfVectorizer L2-normalises its rows by default, so the plain dot product
# computed by linear_kernel is the cosine similarity.
# NOTE: the result is a dense (n_movies x n_movies) array, which is very
# large for the full catalogue -- work on a subset if memory is tight.
cosine_sim_matrix = linear_kernel(tfidf_matrix, tfidf_matrix)

# Legacy name kept because the recommender cells below refer to it.
hamming_matric = cosine_sim_matrix

In [None]:
# Positional lookup table: movie title -> row position in `metadata`.
metadata = movie_.reset_index()
indices = pd.Series(metadata.index, index=metadata['title'])

# Several rows can share one title; without this, indices[title] would return
# a Series instead of a single position and break the recommender. Keep the
# first occurrence of every title.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(title, cosine_sim=hamming_matric):
    """Return the titles of the 10 movies most similar to `title`.

    Parameters
    ----------
    title : str
        Title to look up in the module-level `indices` table.
    cosine_sim : 2-D array-like, optional
        Pairwise similarity scores (higher means more similar), indexed by
        row position in the module-level `metadata` frame.

    Returns
    -------
    pandas.Series
        Up to 10 titles from `metadata['title']`, most similar first.

    Raises
    ------
    KeyError
        If `title` is not present in `indices`.
    """
    # Position of the movie that matches the title.
    idx = indices[title]
    # A title shared by several rows yields a Series; use its first match.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]
    idx = int(idx)

    # Similarity of every movie to the query, ranked from most to least
    # similar. The stable sort keeps ties in their original row order.
    scores = np.asarray(cosine_sim[idx]).ravel()
    ranked = np.argsort(-scores, kind='stable')

    # Drop the query movie explicitly. Assuming it is always ranked first is
    # wrong when another movie ties with it or when its own score is 0.
    movie_indices = ranked[ranked != idx][:10]

    return metadata['title'].iloc[movie_indices]

In [None]:
# Example: recommendations for 'The Godfather' using the matrix computed above.
get_recommendations('The Godfather', hamming_matric)

In [None]:
from collections import defaultdict
from surprise import SVD
from surprise.model_selection import KFold

def precision_recall_at_k(predictions, k=10, threshold=3.5):
    """Compute per-user precision@k and recall@k.

    `predictions` is an iterable of 5-tuples
    ``(uid, iid, true_rating, estimated_rating, details)``. A rating counts
    as relevant (true) or recommended (estimated) when it is >= `threshold`.
    Returns two dicts keyed by user id: ``(precisions, recalls)``.
    """
    # Group (estimated, true) rating pairs by user.
    ratings_by_user = defaultdict(list)
    for prediction in predictions:
        uid, _, true_r, est, _ = prediction
        ratings_by_user[uid].append((est, true_r))

    precisions = {}
    recalls = {}
    for uid, pairs in ratings_by_user.items():
        # Best estimated ratings first; only the top k count as recommended.
        ranked = sorted(pairs, key=lambda pair: pair[0], reverse=True)
        top_k = ranked[:k]

        # Relevant items over the user's whole list.
        n_rel = sum(1 for _, true_r in ranked if true_r >= threshold)
        # Recommended items within the top k.
        n_rec_k = sum(1 for est, _ in top_k if est >= threshold)
        # Items in the top k that are both relevant and recommended.
        n_rel_and_rec_k = sum(
            1 for est, true_r in top_k
            if true_r >= threshold and est >= threshold
        )

        # Precision@k is undefined when nothing was recommended; use 0.
        if n_rec_k != 0:
            precisions[uid] = n_rel_and_rec_k / n_rec_k
        else:
            precisions[uid] = 0

        # Recall@k is undefined when nothing is relevant; use 0.
        if n_rel != 0:
            recalls[uid] = n_rel_and_rec_k / n_rel
        else:
            recalls[uid] = 0

    return precisions, recalls

In [None]:
# Surprise's KFold.split() expects a surprise Dataset of (user, item, rating)
# rows, not a pandas DataFrame, and the movie metadata saved in
# 'recommending_movies' carries no user ratings, so a ratings table is loaded
# here instead.
from surprise import Dataset, Reader

# NOTE(review): assumes the dataset's ratings file sits next to the other
# CSVs and has 'userId', 'movieId' and 'rating' columns -- confirm the path
# and the column names before running.
RATINGS_CSV = "/home/kimberly/PycharmProjects/pythonProject2/ratings_small.csv"
ratings = pd.read_csv(RATINGS_CSV)

# Take the rating scale from the data rather than hard-coding it.
reader = Reader(rating_scale=(ratings['rating'].min(), ratings['rating'].max()))
recom_mod = Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

kf = KFold(n_splits=5)
algo = SVD()

# Mean precision@5 / recall@5 over users, recorded for each fold. Previously
# only the last fold's metrics were reported.
fold_precisions = []
fold_recalls = []
for trainset, testset in kf.split(recom_mod):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)
    fold_precisions.append(sum(precisions.values()) / len(precisions))
    fold_recalls.append(sum(recalls.values()) / len(recalls))

# Average of the per-fold means: precision first, then recall.
print(sum(fold_precisions) / len(fold_precisions))
print(sum(fold_recalls) / len(fold_recalls))