In [180]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import json
import ast
import pickle

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [151]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
import re

In [152]:
df_credit = pd.read_csv(
    "D:\\Projects\\Movie Recomendation System\\Data\\tmdb_5000_credits.csv"
)
df_movies = pd.read_csv(
    "D:\\Projects\\Movie Recomendation System\\Data\\tmdb_5000_movies.csv"
)

In [198]:
def merge_dataframes(df1, df2, on_column):
    """
    Merge two DataFrames based on a specific column.

    Parameters:
    - df1 (pd.DataFrame): The first DataFrame.
    - df2 (pd.DataFrame): The second DataFrame.
    - on_column (str): The name of the column to merge on.

    Returns:
    - pd.DataFrame: The merged DataFrame.
    """
    # Use merge() function to merge df1 and df2 on the specified column
    merged_df = pd.merge(df1, df2, on=on_column)
    return merged_df


def extract_names(genre_list):
    """
    Extracts the 'name' values from a list of dictionaries or a stringified list of dictionaries in the 'genre' column.

    Parameters:
    - genre_list: A list of dictionaries, or a string representation of such a list, with 'id' and 'name' keys.

    Returns:
    - list: A list of genre names.
    """
    # Check if genre_list is a string and convert it to a list if necessary
    if isinstance(genre_list, str):
        try:
            genre_list = json.loads(genre_list)
        except json.JSONDecodeError:
            # Return an empty list if there's an issue with converting the string
            return []

    # Extract the 'name' from each dictionary in the list and return them as a list
    return [genre["name"] for genre in genre_list]


def extract_star_names(cast_string):
    """
    Converts a string representation of a list of dictionaries to a list and extracts names of the first three stars.

    Parameters:
    - cast_string (str): A string representation of a list of dictionaries.

    Returns:
    - list: A list containing the names of the first three stars.
    """
    # Convert the string to a list of dictionaries using ast.literal_eval
    cast_list = ast.literal_eval(cast_string)
    # Extract the 'name' key from each dictionary for the first three entries
    return [cast["name"] for cast in cast_list[:3]]  # Limit to first three


def extract_director_name(crew_str):
    try:
        # Safely evaluate the string to a list
        crew_list = ast.literal_eval(crew_str)
        # Extract and return the director's name
        directors = [crew["name"] for crew in crew_list if crew["job"] == "Director"]
        return directors[0] if directors else None
    except (ValueError, SyntaxError):
        # Return None in case of an error
        return None


def process_text(text):
    if not isinstance(text, str):
        return []  # Return an empty list if the input is not a string

    # Convert text to lower case
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # Removing special characters
    tokens = [re.sub(r"[^a-zA-Z0-9]", "", token) for token in tokens]
    tokens = [token for token in tokens if token]  # Remove empty strings

    # Removing stop words and punctuation
    stop_words = set(stopwords.words("english"))
    tokens = [token for token in tokens if token not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(token) for token in tokens]

    return tokens


def lowercase_text(text):
    """
    Convert a text or a list of texts to lowercase.
    If text is a list, it applies lowercase to each element of the list.
    If text is None, returns an empty string.
    """
    if isinstance(text, list):
        return [str(t).lower() for t in text]
    elif isinstance(text, str):
        return text.lower()
    return text


def remove_spaces_from_names(names):
    """
    Remove spaces from each name in a list of names. Handles None values by returning an empty list.
    """
    if names is None:
        return []
    return [name.replace(" ", "") for name in names if name is not None]


def join_list_or_str(value):
    if isinstance(value, list):
        return " ".join(str(v) for v in value)  # Ensure all elements are strings
    return str(value)


def recommend_movies(movie_title, cosine_sim_matrix=cosine_sim_tfidf):
    """
    Recommends movies based on a given movie title.

    Parameters:
    - movie_title (str): The title of the movie to find recommendations for.
    - cosine_sim_matrix (ndarray): The cosine similarity matrix to use.

    Returns:
    - list: Titles of the recommended movies.
    """
    # Find the index of the movie that matches the title
    if movie_title not in df["original_title"].values:
        return "Movie not found."

    movie_idx = df.index[df["original_title"] == movie_title].tolist()[0]

    # Get the pairwise similarity scores of all movies with that movie
    sim_scores = list(enumerate(cosine_sim_matrix[movie_idx]))

    # Sort the movies based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Get the scores of the 6 most similar movies
    sim_scores = sim_scores[1:7]  # Exclude the first movie itself

    # Get the movie indices
    movie_indices = [i[0] for i in sim_scores]

    # Return the top 6 most similar movies
    return df["original_title"].iloc[movie_indices].tolist()

In [154]:
df_credit.rename(columns={"movie_id": "id"}, inplace=True)

In [155]:
df = merge_dataframes(df_movies, df_credit, "id")

In [156]:
df

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title_x,vote_average,vote_count,title_y,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Spectre,"[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,250000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",http://www.thedarkknightrises.com/,49026,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.312950,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,The Dark Knight Rises,"[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,260000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://movies.disney.com/john-carter,49529,"[{""id"": 818, ""name"": ""based on novel""}, {""id"":...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,John Carter,"[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4798,220000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",,9367,"[{""id"": 5616, ""name"": ""united states\u2013mexi...",es,El Mariachi,El Mariachi just wants to play his guitar and ...,14.269792,"[{""name"": ""Columbia Pictures"", ""id"": 5}]",...,81.0,"[{""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,"He didn't come looking for trouble, but troubl...",El Mariachi,6.6,238,El Mariachi,"[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de..."
4799,9000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",,72766,[],en,Newlyweds,A newlywed couple's honeymoon is upended by th...,0.642552,[],...,85.0,[],Released,A newlywed couple's honeymoon is upended by th...,Newlyweds,5.9,5,Newlyweds,"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de..."
4800,0,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...",http://www.hallmarkchannel.com/signedsealeddel...,231617,"[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...",en,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...",1.444476,"[{""name"": ""Front Street Pictures"", ""id"": 3958}...",...,120.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,,"Signed, Sealed, Delivered",7.0,6,"Signed, Sealed, Delivered","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de..."
4801,0,[],http://shanghaicalling.com/,126186,[],en,Shanghai Calling,When ambitious New York attorney Sam is sent t...,0.857008,[],...,98.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,A New Yorker in Shanghai,Shanghai Calling,5.7,7,Shanghai Calling,"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de..."


In [157]:
df.columns

Index(['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language',
       'original_title', 'overview', 'popularity', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title_x', 'vote_average',
       'vote_count', 'title_y', 'cast', 'crew'],
      dtype='object')

In [158]:
df = df[
    [
        "id",
        "original_title",
        "overview",
        "genres",
        "keywords",
        "cast",
        "crew",
        "tagline",
    ]
]

In [159]:
df

Unnamed: 0,id,original_title,overview,genres,keywords,cast,crew,tagline
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",Enter the World of Pandora.
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...","At the end of the world, the adventure begins."
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",A Plan No One Escapes
3,49026,The Dark Knight Rises,Following the death of District Attorney Harve...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853,...","[{""cast_id"": 2, ""character"": ""Bruce Wayne / Ba...","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de...",The Legend Ends
4,49529,John Carter,"John Carter is a war-weary, former military ca...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 818, ""name"": ""based on novel""}, {""id"":...","[{""cast_id"": 5, ""character"": ""John Carter"", ""c...","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de...","Lost in our world, found in another."
...,...,...,...,...,...,...,...,...
4798,9367,El Mariachi,El Mariachi just wants to play his guitar and ...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...","[{""id"": 5616, ""name"": ""united states\u2013mexi...","[{""cast_id"": 1, ""character"": ""El Mariachi"", ""c...","[{""credit_id"": ""52fe44eec3a36847f80b280b"", ""de...","He didn't come looking for trouble, but troubl..."
4799,72766,Newlyweds,A newlywed couple's honeymoon is upended by th...,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 10749, ""...",[],"[{""cast_id"": 1, ""character"": ""Buzzy"", ""credit_...","[{""credit_id"": ""52fe487dc3a368484e0fb013"", ""de...",A newlywed couple's honeymoon is upended by th...
4800,231617,"Signed, Sealed, Delivered","""Signed, Sealed, Delivered"" introduces a dedic...","[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""nam...","[{""id"": 248, ""name"": ""date""}, {""id"": 699, ""nam...","[{""cast_id"": 8, ""character"": ""Oliver O\u2019To...","[{""credit_id"": ""52fe4df3c3a36847f8275ecf"", ""de...",
4801,126186,Shanghai Calling,When ambitious New York attorney Sam is sent t...,[],[],"[{""cast_id"": 3, ""character"": ""Sam"", ""credit_id...","[{""credit_id"": ""52fe4ad9c3a368484e16a36b"", ""de...",A New Yorker in Shanghai


In [160]:
df["genres"] = df["genres"].apply(extract_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["genres"] = df["genres"].apply(extract_names)


In [161]:
df["keywords"] = df["keywords"].apply(extract_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["keywords"] = df["keywords"].apply(extract_names)


In [162]:
df["cast"] = df["cast"].apply(extract_star_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["cast"] = df["cast"].apply(extract_star_names)


In [163]:
df["crew"] = df["crew"].apply(extract_director_name)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["crew"] = df["crew"].apply(extract_director_name)


In [164]:
df["overview"] = df["overview"].apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["overview"] = df["overview"].apply(process_text)


In [165]:
df["tagline"] = df["tagline"].apply(process_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tagline"] = df["tagline"].apply(process_text)


In [166]:
text_columns = ["overview", "genres", "keywords", "tagline"]
for col in text_columns:
    df[col] = df[col].apply(lowercase_text)
name_columns = ["cast"]
for col in name_columns:
    df[col] = df[col].apply(remove_spaces_from_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(lowercase_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].apply(remove_spaces_from_names)


In [168]:
df["crew"] = df["crew"].apply(lambda x: [x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["crew"] = df["crew"].apply(lambda x: [x])


In [170]:
df["crew"] = df["crew"].apply(remove_spaces_from_names)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["crew"] = df["crew"].apply(remove_spaces_from_names)


In [176]:
df["tags"] = df.apply(
    lambda row: " ".join(
        join_list_or_str(row[col])
        for col in ["overview", "genres", "keywords", "cast", "crew", "tagline"]
    ),
    axis=1,
)

columns_to_drop = ["overview", "genres", "keywords", "cast", "crew", "tagline"]
df.drop(columns=columns_to_drop, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["tags"] = df.apply(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.drop(columns=columns_to_drop, inplace=True)


In [181]:
df["tags"][0]

'22nd centuri parapleg marin dispatch moon pandora uniqu mission becom torn follow order protect alien civil action adventure fantasy science fiction culture clash future space war space colony society space travel futuristic romance space alien tribe alien planet cgi marine soldier battle love affair anti war power relations mind and soul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron enter world pandora'

In [182]:
count_vectorizer = CountVectorizer(max_features=5000, stop_words="english")
bow_matrix = count_vectorizer.fit_transform(df["tags"]).toarray()

tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
tfidf_matrix = tfidf_vectorizer.fit_transform(df["tags"]).toarray()

cosine_sim_bow = cosine_similarity(bow_matrix, bow_matrix)
cosine_sim_tfidf = cosine_similarity(tfidf_matrix, tfidf_matrix)

In [208]:
movie_name = "Superman Returns"
recommendations = recommend_movies(movie_name)
print(recommendations)

['Superman II', 'Superman IV: The Quest for Peace', 'Superman', 'Superman III', 'Man of Steel', 'Batman v Superman: Dawn of Justice']


In [209]:
pickle.dump(df, open("movie_list.pkl", "wb"))
pickle.dump(cosine_sim_tfidf, open("similarity.pkl", "wb"))