In [2]:
import pandas as pd
import os
import glob
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import adjusted_rand_score
import matplotlib.pyplot as plt
import numpy as np
import ast

In [3]:
def combine_document(df,movielist):
    DocumentDict = {}
    for movie in movielist:
        keywords = ''
        document = df[df['original_title'] == movie]['keywords']
        for ele in ast.literal_eval(document.values[0]):
            keywords += ele['name']+' '
        keywords += str(df[df['original_title'] == movie]['overview'])
        DocumentDict[movie] = keywords
    return DocumentDict

def train_model(Dict,nclusters,vectorizer):
    documents = DocumentDict.values()
    X = vectorizer.fit_transform(documents)
    return X

def score_movies(Dict,vectorizer):
    MovieDict = {}
    for movie in Dict:
        Y = vectorizer.transform([Dict[movie]])
        MovieDict[movie] = Y
    return MovieDict

def recommend_movies(Dict,movie1,n_recs):
    RecDict = {}
    for movie in Dict:
        dotprod = np.dot(Dict[movie1].toarray(),np.transpose(Dict[movie].toarray()))
        RecDict[movie] = dotprod
    recs = sorted(RecDict, key=RecDict.get, reverse=True)[1:n_recs+1]
    print('If you liked '+movie1+' you will also like:')
    print(recs)
    return RecDict

In [5]:
df1 = pd.read_csv('./sentiment_dataset/ratings_and_sentiment_v2.csv')
directory = './dataset_5000/tmdb_5000_movies.csv'
df = pd.read_csv(directory)
# Find the common movies between the two conflicting datasets
common = []    
for movie in df['original_title']:
    if movie in df1['original_title'].tolist():
        common.append(movie)

# Initialize the TFIDF vectorizer
vectorizer = TfidfVectorizer(stop_words='english')
# Compile all the movie documents, i.e. keywords and movie descriptions
DocumentDict = combine_document(df,common)
# Once we have the list of movies and dataframes we compute the TFIDF scores of all the documents
model = train_model(DocumentDict,10,vectorizer)
# Create a dictionary that organizes movies with their TFIDF matrix
MovieDict = score_movies(DocumentDict,vectorizer)
# Finally given any movie in the dataset, the 5 movies with the most in common will be recommended
recommend_movies(MovieDict,'Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan',5)


If you liked Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan you will also like:
['Good Night, and Good Luck.', 'Chicken Run', 'Zodiac', 'Love Actually', 'American Beauty']


{'Avatar': array([[0.02644983]]),
 "Pirates of the Caribbean: At World's End": array([[0.01790174]]),
 'Spectre': array([[0.00427673]]),
 'The Dark Knight Rises': array([[0.00283706]]),
 'Spider-Man 3': array([[0.00327108]]),
 'Tangled': array([[0.01415989]]),
 'Avengers: Age of Ultron': array([[0.01777868]]),
 'Batman v Superman: Dawn of Justice': array([[0.00341671]]),
 'Quantum of Solace': array([[0.00479516]]),
 "Pirates of the Caribbean: Dead Man's Chest": array([[0.01751397]]),
 'Man of Steel': array([[0.00382914]]),
 'The Avengers': array([[0.03375739]]),
 'Pirates of the Caribbean: On Stranger Tides': array([[0.01807983]]),
 'Men in Black 3': array([[0.00390799]]),
 'The Hobbit: The Battle of the Five Armies': array([[0.00372206]]),
 'The Amazing Spider-Man': array([[0.01348586]]),
 'The Hobbit: The Desolation of Smaug': array([[0.0041235]]),
 'Titanic': array([[0.00261242]]),
 'Captain America: Civil War': array([[0.03556539]]),
 'Jurassic World': array([[0.00295545]]),
 'Skyf