In [231]:
import pandas as pd
import numpy as np

# Loading Data

In [232]:
df = pd.read_csv(r"data\movie_data.csv", lineterminator="\n")

In [233]:
df.columns

Index(['Unnamed: 0', 'adult', 'backdrop_path', 'movie_id', 'original_language',
       'original_title', 'overview', 'popularity', 'poster_path',
       'release_date', 'title', 'video', 'vote_average', 'vote_count',
       'genres', 'keywords', 'cast', 'crew'],
      dtype='object')

In [234]:
df.head(1)

Unnamed: 0.1,Unnamed: 0,adult,backdrop_path,movie_id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres,keywords,cast,crew
0,0,False,/kQM7o3NIkruIZLoQ9E2XzZQ8Ujl.jpg,783461,hi,लूप लपेटा,"When her boyfriend loses a mobster's cash, Sav...",56.311,/onGdT8sYi89drvSJyEJnft97rOq.jpg,2022-02-04,Looop Lapeta,False,6.2,54,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...","['remake', 'looop lapeta', 'saade saati']","[{'adult': False, 'gender': 1, 'id': 550167, '...","[{'adult': False, 'gender': 2, 'id': 1071, 'kn..."


# EDA

In [235]:
#dropping the unnamed column
df.drop(columns=['Unnamed: 0'], inplace=True)

In [236]:
df.head(1)

Unnamed: 0,adult,backdrop_path,movie_id,original_language,original_title,overview,popularity,poster_path,release_date,title,video,vote_average,vote_count,genres,keywords,cast,crew
0,False,/kQM7o3NIkruIZLoQ9E2XzZQ8Ujl.jpg,783461,hi,लूप लपेटा,"When her boyfriend loses a mobster's cash, Sav...",56.311,/onGdT8sYi89drvSJyEJnft97rOq.jpg,2022-02-04,Looop Lapeta,False,6.2,54,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...","['remake', 'looop lapeta', 'saade saati']","[{'adult': False, 'gender': 1, 'id': 550167, '...","[{'adult': False, 'gender': 2, 'id': 1071, 'kn..."


In [237]:
#checking for missing values in the dataset
df.isnull().sum()

adult                   0
backdrop_path        2851
movie_id                0
original_language       0
original_title          0
overview              216
popularity              0
poster_path           652
release_date            0
title                   0
video                   0
vote_average            0
vote_count              0
genres                  0
keywords                0
cast                    0
crew                    0
dtype: int64

- Overview is our most important feature, therefore we will remove the missing values from the dataset.


In [238]:
df = df.dropna(subset=['overview'])


In [239]:
#dropping columns which are nt needed for the problem statement

df = df.drop(columns=['popularity', 'backdrop_path', 'vote_count', 'vote_average', 'video', 'popularity'])

df.head(1)

Unnamed: 0,adult,movie_id,original_language,original_title,overview,poster_path,release_date,title,genres,keywords,cast,crew
0,False,783461,hi,लूप लपेटा,"When her boyfriend loses a mobster's cash, Sav...",/onGdT8sYi89drvSJyEJnft97rOq.jpg,2022-02-04,Looop Lapeta,"[{'id': 28, 'name': 'Action'}, {'id': 35, 'nam...","['remake', 'looop lapeta', 'saade saati']","[{'adult': False, 'gender': 1, 'id': 550167, '...","[{'adult': False, 'gender': 2, 'id': 1071, 'kn..."


In [240]:
#checking for duplicates in the dataset
df.duplicated().sum()

0

# Data Cleaning

Extracting useful information from the columns - 
- genres, cast and crew

1. Genres

In [241]:
df['genres'][0]

"[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}]"

In [242]:
import ast

In [243]:
#creating a helper function for genre
def process_genres(obj):
    genres = ["Genre"]
    obj = ast.literal_eval(obj)
    for dict in obj:
        genres.append(dict['name'].replace(" ", ""))
    return genres


In [244]:
#example of function to process genres
process_genres("[{'id': 28, 'name': 'Action'}, {'id': 35, 'name': 'Comedy'}, {'id': 80, 'name': 'Crime'}]")

['Genre', 'Action', 'Comedy', 'Crime']

In [245]:
#applying the function on entire dataset
df["genres"] = df["genres"].map(process_genres)

In [246]:
df["genres"].head()

0          [Genre, Action, Comedy, Crime]
1        [Genre, Action, Crime, Thriller]
2    [Genre, Action, Adventure, Thriller]
3                [Genre, Crime, Thriller]
4                  [Genre, Drama, Comedy]
Name: genres, dtype: object

2. Cast

In [247]:
df.cast[0]

"[{'adult': False, 'gender': 1, 'id': 550167, 'known_for_department': 'Acting', 'name': 'Taapsee Pannu', 'original_name': 'Taapsee Pannu', 'popularity': 6.156, 'profile_path': '/u9Gwg3J0bdczTPunP8qoWK2Vhu4.jpg', 'cast_id': 1, 'character': 'Savi', 'credit_id': '5ff5b30128723c0040af9605', 'order': 0}, {'adult': False, 'gender': 2, 'id': 1714229, 'known_for_department': 'Acting', 'name': 'Tahir Raj Bhasin', 'original_name': 'Tahir Raj Bhasin', 'popularity': 2.203, 'profile_path': '/A4ylMYWj8IqnmX74Two8c4Lp8jY.jpg', 'cast_id': 3, 'character': 'Satya', 'credit_id': '5ff5b31dd38b580043cb4047', 'order': 1}, {'adult': False, 'gender': 1, 'id': 2092738, 'known_for_department': 'Acting', 'name': 'Shreya Dhanwanthary', 'original_name': 'Shreya Dhanwanthary', 'popularity': 8.802, 'profile_path': '/dSWHYuEWjPzczAzdLFgYHt3Y5Dd.jpg', 'cast_id': 15, 'character': 'Julia', 'credit_id': '6136133e2cde980089dc0082', 'order': 2}, {'adult': False, 'gender': 2, 'id': 101823, 'known_for_department': 'Acting', 

In [248]:
a = ast.literal_eval(df['cast'][0])
a[0]['name']

'Taapsee Pannu'

In [249]:
#we will extract only the top 3 actors from the cast for each movie

def process_cast(obj):
    cast = ["Actors"]
    obj = ast.literal_eval(obj)
    for dict in obj:
        cast.append(dict['name'].replace(" ", ""))
        if len(cast) >= 3:
            break
    return cast

In [250]:
process_cast(df.cast[0])

['Actors', 'TaapseePannu', 'TahirRajBhasin']

In [251]:
df['cast'] = df['cast'].map(process_cast)

In [252]:
df['cast'].head()

0     [Actors, TaapseePannu, TahirRajBhasin]
1         [Actors, AkshayKumar, KatrinaKaif]
2    [Actors, ShahRukhKhan, DeepikaPadukone]
3         [Actors, YamiGautam, SunnyKaushal]
4            [Actors, AamirKhan, R.Madhavan]
Name: cast, dtype: object

3. Crew

In [253]:
df.crew[0]

"[{'adult': False, 'gender': 2, 'id': 1071, 'known_for_department': 'Directing', 'name': 'Tom Tykwer', 'original_name': 'Tom Tykwer', 'popularity': 4.312, 'profile_path': '/cu7F4AulzRbj0dZH9s5pRM61w64.jpg', 'credit_id': '617796df9ee0ef008c45483e', 'department': 'Writing', 'job': 'Original Story'}, {'adult': False, 'gender': 2, 'id': 1108802, 'known_for_department': 'Production', 'name': 'Mukesh Chhabra', 'original_name': 'Mukesh Chhabra', 'popularity': 2.972, 'profile_path': '/f5XI2wJs2JPcd5FSECsdcnkyOMW.jpg', 'credit_id': '61e09db05bce9e0041751368', 'department': 'Production', 'job': 'Casting Director'}, {'adult': False, 'gender': 0, 'id': 1208683, 'known_for_department': 'Production', 'name': 'Tanuj Garg', 'original_name': 'Tanuj Garg', 'popularity': 0.6, 'profile_path': None, 'credit_id': '5ff5b40d245dbe003d301373', 'department': 'Production', 'job': 'Producer'}, {'adult': False, 'gender': 0, 'id': 1376941, 'known_for_department': 'Production', 'name': 'Siddharth Atha', 'original_na

In [254]:
#extracting only director name from the crew feature

def process_crew(obj):
    crew = ['Director']
    obj = ast.literal_eval(obj)
    for dict in obj:
        if dict['job'] == 'Director':
            crew.append(dict['name'].replace(' ', ''))
            break
    return crew

In [255]:
process_crew(df.crew[0])

['Director', 'AakashBhatia']

In [256]:
df['crew'] = df['crew'].map(process_crew)

In [257]:
df.crew.head()

0      [Director, AakashBhatia]
1       [Director, RohitShetty]
2    [Director, SiddharthAnand]
3         [Director, AjaySingh]
4    [Director, RajkumarHirani]
Name: crew, dtype: object

In [258]:
df['keywords'] = df['keywords'].map(ast.literal_eval)

In [259]:
df.head(2)

Unnamed: 0,adult,movie_id,original_language,original_title,overview,poster_path,release_date,title,genres,keywords,cast,crew
0,False,783461,hi,लूप लपेटा,"When her boyfriend loses a mobster's cash, Sav...",/onGdT8sYi89drvSJyEJnft97rOq.jpg,2022-02-04,Looop Lapeta,"[Genre, Action, Comedy, Crime]","[remake, looop lapeta, saade saati]","[Actors, TaapseePannu, TahirRajBhasin]","[Director, AakashBhatia]"
1,False,592508,hi,Sooryavanshi,"A fearless, faithful albeit slightly forgetful...",/8p3mhjyLjHKtaAv8tFKfvEBtir0.jpg,2021-11-05,Sooryavanshi,"[Genre, Action, Crime, Thriller]","[police, sequel, police officer, cop universe]","[Actors, AkshayKumar, KatrinaKaif]","[Director, RohitShetty]"


In [260]:
#    df['genres'] = df['genres'].apply(lambda x: [i.replace(' ', '') for i in x])
 #   df['cast'] = df['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
  #  df['crew'] = df['crew'].apply(lambda x: [i.replace(' ', '') for i in x])
   # df['keywords'] = df['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])

In [261]:
#Tokenizing the overview column

df['overview'] = df['overview'].apply(lambda x: x.split())

In [262]:
df['overview']

0        [When, her, boyfriend, loses, a, mobster's, ca...
1        [A, fearless,, faithful, albeit, slightly, for...
2        [A, soldier, caught, by, enemies, and, presume...
3        [A, flight, attendant, and, her, boyfriend, mu...
4        [Rascal., Joker., Dreamer., Genius..., You've,...
                               ...                        
15866    [Two, veterans, of, the, Bosnian, War,, one, A...
15867    [Ten, years, ago,, a, tragedy, changed, the, t...
15868    [Beautiful, Mandy, Lane, isn't, a, party, girl...
15869    [As, the, countdown, to, graduation, begins,, ...
15870    [Charlie, and, Dan, have, been, best, friends,...
Name: overview, Length: 15655, dtype: object

In [263]:
df.head(1)

Unnamed: 0,adult,movie_id,original_language,original_title,overview,poster_path,release_date,title,genres,keywords,cast,crew
0,False,783461,hi,लूप लपेटा,"[When, her, boyfriend, loses, a, mobster's, ca...",/onGdT8sYi89drvSJyEJnft97rOq.jpg,2022-02-04,Looop Lapeta,"[Genre, Action, Comedy, Crime]","[remake, looop lapeta, saade saati]","[Actors, TaapseePannu, TahirRajBhasin]","[Director, AakashBhatia]"


In [264]:
#creating a new column which will have all the information of the movie

df['info'] = df['title'].apply(lambda x: "Movie Name " + x).apply(lambda x: x.split()) + df['original_language'].apply(lambda x: "Language" + x).apply(lambda x: x.split()) + df['release_date'].apply(lambda x: "Released" + x[:4] + ".").apply(lambda x: x.split()) + df['cast'] + df['crew'] + df['overview'] + df['keywords']

In [265]:
df['info'][0]

['Movie',
 'Name',
 'Looop',
 'Lapeta',
 'Languagehi',
 'Released2022.',
 'Actors',
 'TaapseePannu',
 'TahirRajBhasin',
 'Director',
 'AakashBhatia',
 'When',
 'her',
 'boyfriend',
 'loses',
 'a',
 "mobster's",
 'cash,',
 'Savi',
 'races',
 'against',
 'the',
 'clock',
 'to',
 'save',
 'the',
 'day',
 '—',
 'if',
 'only',
 'she',
 'can',
 'break',
 'out',
 'of',
 'a',
 'curious',
 'cycle',
 'of',
 'dead',
 'ends.',
 'remake',
 'looop lapeta',
 'saade saati']

Creating the full path for the movie poster, as per the TMDB site the full path for psoter is - 
- https://image.tmdb.org/t/p/w500/onGdT8sYi89drvSJyEJnft97rOq.jpg


In [266]:
df['poster'] = df['poster_path'].apply(lambda x: 'https://image.tmdb.org/t/p/w500' + str(x))

In [267]:
df['poster'][0]

'https://image.tmdb.org/t/p/w500/onGdT8sYi89drvSJyEJnft97rOq.jpg'

# Text Preprocessing

In [277]:
#creatign a new dataframe with the needed columns

movies = df[['movie_id','title', 'info', 'poster']]

In [278]:
movies.head()

Unnamed: 0,movie_id,title,info,poster
0,783461,Looop Lapeta,"[Movie, Name, Looop, Lapeta, Languagehi, Relea...",https://image.tmdb.org/t/p/w500/onGdT8sYi89drv...
1,592508,Sooryavanshi,"[Movie, Name, Sooryavanshi, Languagehi, Releas...",https://image.tmdb.org/t/p/w500/8p3mhjyLjHKtaA...
2,864692,Pathaan,"[Movie, Name, Pathaan, Languagehi, Released202...",https://image.tmdb.org/t/p/w500/m1b9toKYyCujHu...
3,1018228,Chor Nikal Ke Bhaga,"[Movie, Name, Chor, Nikal, Ke, Bhaga, Language...",https://image.tmdb.org/t/p/w500/1MIDERaEUMw1rY...
4,20453,3 Idiots,"[Movie, Name, 3, Idiots, Languagehi, Released2...",https://image.tmdb.org/t/p/w500/66A9MqXOyVFCss...


In [279]:
from nltk import PorterStemmer
from string import punctuation

stemmer = PorterStemmer()


In [280]:
# we will now stem the info column, lower case it  and remove punctuations

def preprocess(obj):
    info = []
    for word in obj:
        word = word.lower()
        if word not in punctuation:
            word = stemmer.stem(word)
            info.append(word)
    return info
            


In [272]:
preprocess(movies['info'][0])

['movi',
 'name',
 'looop',
 'lapeta',
 'languagehi',
 'released2022.',
 'actor',
 'taapseepannu',
 'tahirrajbhasin',
 'director',
 'aakashbhatia',
 'when',
 'her',
 'boyfriend',
 'lose',
 'a',
 "mobster'",
 'cash,',
 'savi',
 'race',
 'against',
 'the',
 'clock',
 'to',
 'save',
 'the',
 'day',
 '—',
 'if',
 'onli',
 'she',
 'can',
 'break',
 'out',
 'of',
 'a',
 'curiou',
 'cycl',
 'of',
 'dead',
 'ends.',
 'remak',
 'looop lapeta',
 'saade saati']

In [281]:
movies['info'].head()

0    [Movie, Name, Looop, Lapeta, Languagehi, Relea...
1    [Movie, Name, Sooryavanshi, Languagehi, Releas...
2    [Movie, Name, Pathaan, Languagehi, Released202...
3    [Movie, Name, Chor, Nikal, Ke, Bhaga, Language...
4    [Movie, Name, 3, Idiots, Languagehi, Released2...
Name: info, dtype: object

In [282]:
movies['info'] = movies['info'].map(preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['info'] = movies['info'].map(preprocess)


In [283]:
movies['info'].head()

0    [movi, name, looop, lapeta, languagehi, releas...
1    [movi, name, sooryavanshi, languagehi, release...
2    [movi, name, pathaan, languagehi, released2023...
3    [movi, name, chor, nikal, ke, bhaga, languageh...
4    [movi, name, 3, idiot, languagehi, released200...
Name: info, dtype: object

In [284]:
#converting the info column back into a string from the list

movies['info'] = movies['info'].apply(lambda x: " ".join(x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  movies['info'] = movies['info'].apply(lambda x: " ".join(x))


In [285]:
movies['info'][0]

"movi name looop lapeta languagehi released2022. actor taapseepannu tahirrajbhasin director aakashbhatia when her boyfriend lose a mobster' cash, savi race against the clock to save the day — if onli she can break out of a curiou cycl of dead ends. remak looop lapeta saade saati"

# TF-IDF Vectorization

In [286]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [287]:
# Create a TfidfVectorizer object
tfidf = TfidfVectorizer(stop_words='english')

# Apply the vectorizer on the 'info' column
tfidf_matrix = tfidf.fit_transform(movies['info'])

In [288]:
tfidf_matrix[0]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 28 stored elements and shape (1, 55746)>

# Recommender System with Cosine Similarity

In [289]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

This matrix contains the distance of each 15655 movies with all the othe movies

In [290]:
cosine_sim.shape

(15655, 15655)

In [291]:
#function to recommend movies based on user's input

def recommend_movies(movie_name):
    # Get the index of the movie that matches the title
    movie_index = movies[movies['title'].apply(lambda x: x.lower()) == movie_name.lower()].index[0]

    # Get the similarity scores of all movies with the movie_index
    similarity_scores = list(enumerate(cosine_sim[movie_index]))
    
    # Sort the movies based on the similarity scores in descending order
    sorted_movies = sorted(similarity_scores, key=lambda x: x[1], reverse=True)

    # Get the top 10 recommended movies
    recommended_movies = [movies.iloc[i[0]]['title'] for i in sorted_movies[1:11]]

    return recommended_movies

In [293]:
recommend_movies('3 idiots')

['3 Idiots',
 'College Campus',
 'Animal House',
 'Yaariyan',
 'Bheja Fry',
 'Smart Ass',
 'Decision',
 'The Dinner Game',
 'Kristy',
 'Doctor G']

# Saving artifacts

In [None]:
movies.head()

Unnamed: 0,movie_id,title,info,poster
0,783461,Looop Lapeta,Movie Name: Looop Lapeta Released in 2022. Act...,https://image.tmdb.org/t/p/w500/onGdT8sYi89drv...
1,592508,Sooryavanshi,Movie Name: Sooryavanshi Released in 2021. Act...,https://image.tmdb.org/t/p/w500/8p3mhjyLjHKtaA...
2,864692,Pathaan,Movie Name: Pathaan Released in 2023. Actors a...,https://image.tmdb.org/t/p/w500/m1b9toKYyCujHu...
3,1018228,Chor Nikal Ke Bhaga,Movie Name: Chor Nikal Ke Bhaga Released in 20...,https://image.tmdb.org/t/p/w500/1MIDERaEUMw1rY...
4,20453,3 Idiots,Movie Name: 3 Idiots Released in 2009. Actors ...,https://image.tmdb.org/t/p/w500/66A9MqXOyVFCss...


In [None]:
movies.to_csv("movies.csv")

In [297]:
# Save the cosine similarity matrix as a .npy file
np.save('cosine_sim.npy', cosine_sim)

In [294]:
#saving cosine similarity matrix 

import pickle 

with open('cosine_sim.pkl', 'wb') as f:
    pickle.dump(cosine_sim, f)

In [295]:
#checkign the loaded object
with open('cosine_sim.pkl', 'rb') as f:
    cosine_sim = pickle.load(f)

In [296]:
cosine_sim

array([[1.        , 0.00415146, 0.01586104, ..., 0.00116486, 0.00151555,
        0.01200664],
       [0.00415146, 1.        , 0.04630842, ..., 0.00127902, 0.02732387,
        0.00130957],
       [0.01586104, 0.04630842, 1.        , ..., 0.00698716, 0.00165243,
        0.00730305],
       ...,
       [0.00116486, 0.00127902, 0.00698716, ..., 1.        , 0.07555539,
        0.00170947],
       [0.00151555, 0.02732387, 0.00165243, ..., 0.07555539, 1.        ,
        0.01724861],
       [0.01200664, 0.00130957, 0.00730305, ..., 0.00170947, 0.01724861,
        1.        ]])