# **Movie Recommender System**

## **Importing Libraries & Dataset**

In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import ast
import re
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from sklearn.metrics.pairwise import cosine_similarity
!pip install fuzzywuzzy


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collecting fuzzywuzzy
  Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl.metadata (4.9 kB)
Downloading fuzzywuzzy-0.18.0-py2.py3-none-any.whl (18 kB)
Installing collected packages: fuzzywuzzy
Successfully installed fuzzywuzzy-0.18.0


In [6]:
movies_df = pd.read_csv("tmdb_5000_movies.csv")
credits_df = pd.read_csv("tmdb_5000_credits.csv")

In [5]:
movies_df.shape

(4803, 20)

In [7]:
credits_df.shape

(4803, 4)

In [8]:
df = movies_df.merge(credits_df, on="title")

## **Data Preprocessing**

In [9]:
# Removing unnecessary columns and keeping only useful columns

df = df[["movie_id", "title", "overview", "genres", "keywords", "cast", "crew"]]
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [10]:
df.isna().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [11]:
df.dropna(inplace=True)

In [12]:
df.duplicated().sum()

0

In [13]:
def string_extractor(obj):
    string_list = []
    for i in ast.literal_eval(obj):
        string_list.append(i["name"])
    return string_list

In [14]:
df['genres']= df.genres.apply(string_extractor)

In [15]:
df['keywords']= df.keywords.apply(string_extractor)

In [19]:
def cast_extractor(obj):
    return [i["character"] for i in ast.literal_eval(obj)[:3]]

In [17]:
df['cast'] = df.cast.apply(cast_extractor)

In [20]:
def director_extractor(obj):
    string_list = []
    for i in ast.literal_eval(obj):
      if i["job"] == "Director":
          string_list.append(i["name"])
    return string_list

In [21]:
df['director'] = df.crew.apply(director_extractor)
df.drop("crew", axis=1, inplace=True)

In [22]:
df['overview'] = df.overview.apply(lambda x: x.split())

In [23]:
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Jake Sully, Neytiri, Dr. Grace Augustine]",[James Cameron]


In [24]:
df['genres'] = df.genres.apply(lambda x: [i.replace(" ","") for i in x])
df['keywords'] = df.keywords.apply(lambda x: [i.replace(" ","") for i in x])
df['cast'] = df.cast.apply(lambda x: [i.replace(" ","") for i in x])
df['director'] = df.director.apply(lambda x: [i.replace(" ","") for i in x])
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[JakeSully, Neytiri, Dr.GraceAugustine]",[JamesCameron]


In [25]:
df['tags'] = df.overview + df.genres + df.keywords + df.cast + df.director
df.head(1)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,director,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[JakeSully, Neytiri, Dr.GraceAugustine]",[JamesCameron],"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [27]:
movies = df[["movie_id", "title", "tags"]]
movies.head(1)

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."


In [28]:
movies.shape

(4806, 3)

In [29]:
movies.loc[:, 'tags'] = movies['tags'].apply(lambda x: " ".join(x))

In [30]:
movies.loc[:, 'tags'] = movies['tags'].apply(lambda x: x.lower())

In [31]:
movies.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."


## **Text Vectorization**

In [32]:
stemmer = PorterStemmer()

def preprocess_and_stem(text):
    # Tokenizer function to process and stem the text
    def tokenizer(text):
        # Tokenize the text
        tokens = word_tokenize(text)
        # Process tokens
        processed_tokens = []
        for token in tokens:
            # Check if token is a digit and greater than 1800
            if token.isdigit():
                if int(token) > 1800:
                    processed_tokens.append(stemmer.stem(token))
            # Process words
            elif re.match(r'[a-zA-Z]+', token):
                processed_tokens.append(stemmer.stem(token))
        return processed_tokens

    # Apply the tokenizer
    return tokenizer(text)


In [33]:
cv = CountVectorizer(
    max_features=5000,
    stop_words='english',
    tokenizer=preprocess_and_stem
)
vectors = cv.fit_transform(movies["tags"]).toarray()



In [34]:
cv.get_feature_names_out()

array(['1944', '1950', '1959', ..., 'zombieapocalyps', 'zone', 'zoo'],
      dtype=object)

In [44]:
similarity = cosine_similarity(vectors)

In [46]:
similarity.shape

(4806, 4806)

## **Recommendation**

In [58]:
from fuzzywuzzy import process

def recommend(movie_title):
    # Find the closest match in the movies dataset
    best_match, score, _ = process.extractOne(movie_title, movies['title'])

    if score < 70:  # You can adjust the threshold as needed
        print("No close match found.")
        return

    # Get the index of the closest match
    movie_index = movies[movies["title"] == best_match].index[0]

    # Compute distances from the movie index
    distances = similarity[movie_index]

    # Get the indices of the most similar movies
    similar_movies = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    # Print recommended movie titles
    print(f"Recommendations based on your input '{movie_title}' (closest match: '{best_match}'):")
    for i in similar_movies:
        print(movies.iloc[i[0]].title)


In [65]:
recommend("super")

Recommendations based on your input 'super' (closest match: 'Super'):
My Big Fat Greek Wedding 2
Lovely & Amazing
Punch-Drunk Love
Eulogy
Keeping Up with the Steins


In [66]:
import pickle
pickle.dump(movies, open("movies.pkl", "wb"))