## importing libraries

In [1]:
import pandas as pd
import numpy as np

## loading datasets

In [2]:
movies = pd.read_csv('./data/movies.csv')
credits = pd.read_csv('./data/credits.csv')
print(movies.shape, credits.shape)
movies.head(1)

(4803, 20) (4803, 4)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [3]:
credits.head(2)

Unnamed: 0,movie_id,title,cast,crew
0,19995,Avatar,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [4]:
#merge credits with movies dataframe
movies_df = movies.merge(credits, on="title")
print(movies.shape)
movies.head(1)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


## Data preprocessing

In [5]:
#extracting only required columns
movies_df = movies_df.loc[:, ['movie_id','title','genres','overview','keywords','cast','crew']]
movies_df.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [6]:
#check if any missing value present in the dataset
movies_df.isnull().sum()

movie_id    0
title       0
genres      0
overview    3
keywords    0
cast        0
crew        0
dtype: int64

In [7]:
# drop rows having missing values
movies_df.dropna(inplace = True)

In [8]:
# check if any duplicate rows present
movies_df.duplicated().sum()

0

In [9]:
movies_df['overview']

0       In the 22nd century, a paraplegic Marine is di...
1       Captain Barbossa, long believed to be dead, ha...
2       A cryptic message from Bond’s past sends him o...
3       Following the death of District Attorney Harve...
4       John Carter is a war-weary, former military ca...
                              ...                        
4804    El Mariachi just wants to play his guitar and ...
4805    A newlywed couple's honeymoon is upended by th...
4806    "Signed, Sealed, Delivered" introduces a dedic...
4807    When ambitious New York attorney Sam is sent t...
4808    Ever since the second grade when he first saw ...
Name: overview, Length: 4806, dtype: object

In [10]:
#converting string into list
movies_df['overview'] = movies_df['overview'].apply(lambda x:x.split())

In [11]:
movies_df['genres'].values

array(['[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]',
       '[{"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 28, "name": "Action"}]',
       '[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 80, "name": "Crime"}]',
       ...,
       '[{"id": 35, "name": "Comedy"}, {"id": 18, "name": "Drama"}, {"id": 10749, "name": "Romance"}, {"id": 10770, "name": "TV Movie"}]',
       '[]', '[{"id": 99, "name": "Documentary"}]'], dtype=object)

In [12]:
import ast

def convert(obj):
    '''
    this function returns the name from the dict 
    '''
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [13]:
movies_df['genres'] = movies_df['genres'].apply(convert)
movies_df.head(1)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."


In [14]:
movies_df['keywords'] = movies_df['keywords'].apply(convert)

In [15]:
def convert2(obj):
    '''
    this function returns the first three name from the dict
    '''
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3:
            L.append(i['name'])
            counter+=1
        else:
            break
    return L

In [16]:
movies_df['cast'] = movies_df['cast'].apply(convert2)

In [17]:
movies_df.head(5)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller]","[Following, the, death, of, District, Attorney...","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman]","[{""credit_id"": ""52fe4781c3a36847f81398c3"", ""de..."
4,49529,John Carter,"[Action, Adventure, Science Fiction]","[John, Carter, is, a, war-weary,, former, mili...","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton]","[{""credit_id"": ""52fe479ac3a36847f813eaa3"", ""de..."


In [18]:
movies_df.shape

(4806, 7)

In [19]:
movies_df.describe()

Unnamed: 0,movie_id
count,4806.0
mean,56922.559509
std,88309.447559
min,5.0
25%,9009.75
50%,14615.5
75%,58476.75
max,447027.0


In [20]:
def convert_crew(obj):
    '''
    this function returns the name from the dict where the job is Director
    '''
    L = []
    for i in ast.literal_eval(obj):
        if i['job']=='Director':
            L.append(i['name'])
            break
    return L

In [21]:
movies_df['crew'] = movies_df['crew'].apply(convert_crew)

In [22]:
movies_df.head(3)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux]",[Sam Mendes]


In [23]:
#transforming the string having space to nonspace 
movies_df['genres'] = movies_df['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x: [i.replace(" ","") for i in x])
movies_df['cast'] = movies_df['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies_df['crew'] = movies_df['crew'].apply(lambda x: [i.replace(" ","") for i in x])

In [24]:
movies_df.head(2)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski]


In [25]:
movies_df['tags'] = movies_df['genres'] + movies_df['overview'] + movies_df['keywords'] + movies_df['cast'] + movies_df['crew']
movies_df.head(3)

Unnamed: 0,movie_id,title,genres,overview,keywords,cast,crew,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction]","[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction, I..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Captain, Barbossa,, long, believed, to, be, d...","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley]",[GoreVerbinski],"[Adventure, Fantasy, Action, Captain, Barbossa..."
2,206647,Spectre,"[Action, Adventure, Crime]","[A, cryptic, message, from, Bond’s, past, send...","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux]",[SamMendes],"[Action, Adventure, Crime, A, cryptic, message..."


In [26]:
#taking only necessary columns
new_movies_df = movies_df.loc[:, ['movie_id', 'title', 'tags']]
new_movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[Action, Adventure, Fantasy, ScienceFiction, I..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, Captain, Barbossa..."
2,206647,Spectre,"[Action, Adventure, Crime, A, cryptic, message..."
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, Following, th..."
4,49529,John Carter,"[Action, Adventure, ScienceFiction, John, Cart..."


In [27]:
#converting list to string 
new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x: " ".join(x))
new_movies_df['tags'][0]

'Action Adventure Fantasy ScienceFiction In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d SamWorthington ZoeSaldana SigourneyWeaver JamesCameron'

In [28]:
#convert all letters to lowercase
new_movies_df['tags'] = new_movies_df['tags'].apply(lambda x: x.lower())
new_movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventure fantasy sciencefiction in the...
1,285,Pirates of the Caribbean: At World's End,"adventure fantasy action captain barbossa, lon..."
2,206647,Spectre,action adventure crime a cryptic message from ...
3,49026,The Dark Knight Rises,action crime drama thriller following the deat...
4,49529,John Carter,action adventure sciencefiction john carter is...


In [29]:
#steming the words
import nltk

from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [30]:
ps.stem("loving")

'love'

In [31]:
def stem(text):
    """
    this stem function is used to stemming the words
    text - words given by the users which needs to be stemmed
    """
    y = []
    for i in text.split():
        y.append(ps.stem(i))
        
    string = " ".join(y)
    return string

In [32]:
new_movies_df['tags'] = new_movies_df['tags'].apply(stem)
new_movies_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,action adventur fantasi sciencefict in the 22n...
1,285,Pirates of the Caribbean: At World's End,"adventur fantasi action captain barbossa, long..."
2,206647,Spectre,action adventur crime a cryptic messag from bo...
3,49026,The Dark Knight Rises,action crime drama thriller follow the death o...
4,49529,John Carter,action adventur sciencefict john carter is a w...


### text vectorization

In [33]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000, stop_words="english")

In [34]:
vectors = cv.fit_transform(new_movies_df['tags']).toarray()
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [35]:
#calculating the cosine similarity between movies
from sklearn.metrics.pairwise import cosine_similarity

similarity = cosine_similarity(vectors)

## Recommendation

In [41]:
def recommend(movie_title, movies_df, similarity_matrix):
    try:
        # Find the index of the movie in the DataFrame
        movie_index = movies_df[movies_df['title'].str.lower() == movie_title.lower()].index[0]

        # Calculate the cosine similarity distances for the specified movie
        distances = similarity_matrix[movie_index]

        # Sort the distances and retrieve the top 5 similar movies
        similar_movies = sorted(list(enumerate(distances)), key=lambda x: x[1], reverse=True)[1:6]

        # Extract and return the titles of the recommended movies
        recommended_movies = [movies_df.iloc[i[0]].title for i in similar_movies]

        return recommended_movies

    except IndexError:
        print(f"Movie '{movie_title}' not found in the database. Please try a different movie.")
        return []

In [43]:
recommended_movies = recommend('Avatar', new_movies_df, similarity)
if recommended_movies:
    print(f"Top 5 recommended movies for 'Avengers' are:")
    for idx, movie in enumerate(recommended_movies, 1):
        print(f"{idx}. {movie}")

Top 5 recommended movies for 'Avengers' are:
1. Aliens vs Predator: Requiem
2. Falcon Rising
3. Independence Day
4. Jupiter Ascending
5. Titan A.E.


In [None]:
import joblib

joblib.dump(cv, 'count_vectorizer.joblib')
joblib.dump(similarity, 'cosine_similarity_matrix.joblib')

print("Model components saved successfully!")


Model components saved successfully!
