# In [8]:
import pandas
import numpy
import sklearn.feature_extraction.text
import sklearn.metrics.pairwise
import ast
import pickle

# Load the two TMDB 5000 CSV files: movie metadata and movie credits.
movies_data = pandas.read_csv("tmdb_5000_movies.csv")
credits_data = pandas.read_csv("tmdb_5000_credits.csv")

# Relabel the credits columns positionally so its key column is named "id",
# the same name used as the merge key below.
# NOTE(review): assumes the credits CSV has exactly these four columns in
# this order -- confirm against the file.
credits_data.columns = ["id", "title", "cast", "crew"]

# Attach cast and crew to every movie row. Only "id", "cast" and "crew" are
# taken from the credits table, so its "title" column is not merged in.
cast_and_crew = credits_data[["id", "cast", "crew"]]
data = movies_data.merge(cast_and_crew, on="id")

####################################################################################################

# Movie recommendation based on the overview (main plot points, storyline, etc.)

# Replace missing overviews with empty strings so every row can be vectorized.
data["overview"] = data["overview"].fillna("")

# TF-IDF representation of the overview text, ignoring English stop words.
tfidf = sklearn.feature_extraction.text.TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(data["overview"])

# Pairwise dot products between all overview vectors.
# NOTE(review): with TfidfVectorizer's default L2 row normalisation the dot
# product equals cosine similarity, which is presumably why linear_kernel is
# used here -- confirm if the vectorizer's `norm` setting is ever changed.
cosine_similarity1 = sklearn.metrics.pairwise.linear_kernel(
    tfidf_matrix,
    tfidf_matrix,
)

####################################################################################################



####################################################################################################

# Movie recommendation based on cast, director, genres and keywords.

# These columns are read from the CSV as text; parse each cell into the
# Python literal it spells out so later steps can iterate over the entries.
features1 = ["cast", "crew", "genres", "keywords"]
for column_name in features1:
    raw_cells = data[column_name]
    data[column_name] = raw_cells.apply(ast.literal_eval)

def get_director(x):
    """Return the name of the first crew entry whose job is "Director".

    Args:
        x: Iterable of crew entries. Each entry is indexed with "job", and
            the matching entry is also indexed with "name".

    Returns:
        The "name" value of the first entry whose "job" equals "Director",
        or ``numpy.nan`` when no entry matches (including an empty input).
    """
    # Lazy generator: scanning stops at the first director, exactly like an
    # early return from a loop would.
    director_names = (
        member["name"] for member in x if member["job"] == "Director"
    )
    return next(director_names, numpy.nan)

# Derive a "director" column from each movie's crew entries; movies with no
# director entry get NaN (see get_director).
crew_entries = data["crew"]
data["director"] = crew_entries.apply(get_director)

def get_list(x):
    """Return at most the first three "name" values from a list of entries.

    Args:
        x: Expected to be a list whose items are indexed with "name". Any
            value that is not a ``list`` instance is treated as "no data".

    Returns:
        A new list with the "name" of up to the first three entries, in
        order; an empty list when ``x`` is not a list.
    """
    if not isinstance(x, list):
        return []
    # Names are collected for every entry before truncating, so an entry
    # without a "name" key raises KeyError regardless of its position.
    return [entry["name"] for entry in x][:3]

# Reduce each of these columns to a short list of names (at most three per
# movie, see get_list).
features2 = ["cast", "genres", "keywords"]
for column_name in features2:
    parsed_cells = data[column_name]
    data[column_name] = parsed_cells.apply(get_list)

def adjust_data(x):
    """Strip spaces from and lower-case a string or a list of strings.

    Args:
        x: A list of strings, a single string, or any other value.

    Returns:
        For a list, a new list with each item stripped of space characters
        and lower-cased; for a string, the same transformation applied to
        it; for anything else (e.g. NaN or None), the empty string.
    """
    if isinstance(x, list):
        return [item.replace(" ", "").lower() for item in x]
    if isinstance(x, str):
        return x.replace(" ", "").lower()
    # Neither a list nor a string: treat as missing.
    return ""

# Normalise the text of every feature that feeds the combined string:
# spaces removed and lower-cased (see adjust_data).
features3 = ["cast", "director", "genres", "keywords"]
for column_name in features3:
    normalised_cells = data[column_name].apply(adjust_data)
    data[column_name] = normalised_cells

def concatenate_data(x):
    """Combine a row's cast, director, genres and keywords into one string.

    Args:
        x: Mapping-like row indexed with "cast", "director", "genres" and
            "keywords". "director" must be a string; the other three must
            be iterables of strings.

    Returns:
        The four parts separated by single spaces, in the order cast,
        director, genres, keywords. Items inside cast, genres and keywords
        are themselves space-separated. Empty parts still contribute their
        separator.
    """
    parts = (
        " ".join(x["cast"]),
        x["director"],
        " ".join(x["genres"]),
        " ".join(x["keywords"]),
    )
    return " ".join(parts)

# One combined text string per movie (cast, director, genres, keywords).
combined_text = data.apply(concatenate_data, axis=1)
data["concatenated_data"] = combined_text

# Term-count representation of the combined strings, ignoring English stop
# words.
count = sklearn.feature_extraction.text.CountVectorizer(stop_words="english")
count_matrix = count.fit_transform(data["concatenated_data"])

# Pairwise cosine similarity between all movies' count vectors.
cosine_similarity2 = sklearn.metrics.pairwise.cosine_similarity(
    count_matrix,
    count_matrix,
)

####################################################################################################

# Renumber rows 0..n-1; the previous index is kept as an "index" column.
# NOTE(review): presumably so row positions line up with the rows/columns of
# the similarity matrices -- confirm with the code that loads these pickles.
data = data.reset_index()

# Lookup table of movie ids and titles, copied so it is independent of `data`.
movies = data[["id", "title"]].copy()

# Persist the lookup table and both similarity matrices. Each file is opened
# in a `with` block so the handle is flushed and closed even if pickling
# fails part-way; the bare open(...) calls used before were never closed.
with open("movies.pickle", "wb") as movies_file:
    pickle.dump(movies, movies_file)

with open("cosine_similarity1.pickle", "wb") as overview_similarity_file:
    pickle.dump(cosine_similarity1, overview_similarity_file)

with open("cosine_similarity2.pickle", "wb") as metadata_similarity_file:
    pickle.dump(cosine_similarity2, metadata_similarity_file)