# Goal: To make a recommender system that takes input and recommends 5 similair movies.

In [1]:
import numpy as np
import pandas as pd

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [3]:
movies = movies.merge(credits,on='title')

# Columns to keep
- genres
- id
- keywords
- title
- overview
- tagline
- cast 
- crew

In [4]:
movies = movies[['id','genres','keywords','overview','title','tagline','cast','crew']]

In [6]:
movies.isnull().sum()

id            0
genres        0
keywords      0
overview      3
title         0
tagline     844
cast          0
crew          0
dtype: int64

In [149]:
movies.dropna(inplace=True) # removing the movies with missing overview.

In [8]:
movies.duplicated().sum() # no duplicates.

np.int64(0)

# Now we make the genres,keywords,cast,crew into list as right now they are in different format.

In [91]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [151]:
import ast

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L


In [29]:
convert('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]')
# the genres has list inside string so we first need to convert them back to list

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [152]:
# Now for every movie

movies['genres'] = movies['genres'].apply(convert)

In [153]:
movies['genres']
# this is giving result as expected.
# now the same thing with keywords.
movies['keywords'] = movies['keywords'].apply(convert)
# now the cast - taking top 5 actors.

In [156]:
import ast

def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
           if(i['job']=='Director'):
            L.append(i['name'])
    return L


In [155]:
movies['cast'] = movies['cast'].apply(convert)

In [157]:
movies['crew'] = movies['crew'].apply(convert)

In [158]:
movies['tagline']=movies['tagline'].apply(lambda x: x.lower().split())
movies['overview']=movies['overview'].apply(lambda x: x.lower().split())

In [25]:
def collapse(para):
    return [i.replace(' ','').lower() for i in para]

In [160]:
movies['genres'] = movies['genres'].apply(collapse)
movies['keywords'] = movies['keywords'].apply(collapse)
movies['cast'] = movies['cast'].apply(collapse)
movies['crew'] = movies['crew'].apply(collapse)

In [162]:
movies['tags'] = movies['overview']+movies['genres']+movies['keywords']+movies['cast']+movies['crew']+movies['tagline']

In [163]:
movies.head(1)['tags']

0    [in, the, 22nd, century,, a, paraplegic, marin...
Name: tags, dtype: object

In [166]:
import re

MOVIES['tags'] = MOVIES['tags'].apply(lambda x: ' '.join(re.sub(r'[^\w\s]', '', word) for word in x))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MOVIES['tags'] = MOVIES['tags'].apply(lambda x: ' '.join(re.sub(r'[^\w\s]', '', word) for word in x))


In [167]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [169]:
MOVIES['tags'] = MOVIES['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  MOVIES['tags'] = MOVIES['tags'].apply(stem)


# Preprocessing has been completed.
- Now going to convert the movies into vectors and use cosine similairity to get the how close the movies are to each other and recommend movies closest to them.
- How vectorization is going to happen first find the most common words in the total movies and for me here i am choosing total 5000 words for common words.
- then out of those 5000 common words every movie will be checked kind of like if the contain those common words are in that movie and this will be done for all the 5000 words with the movie tags and this will vectors.
- next what will happen is after the movies are converted into vectors, now I will use cosine similarity (A.B/|A|x|B|)

In [None]:
pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl.metadata (11 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.7.2-cp313-cp313-win_amd64.whl (8.7 MB)
   ---------------------------------------- 0.0/8.7 MB ? eta -:--:--
   ---------------------------------------- 8.7/8.7 MB 89.1 MB/s eta 0:00:00
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scikit-learn

   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------------------- ------------------- 1/2 [scikit-learn]
   -------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [179]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfid.fit_transform(MOVIES['tags']).toarray()
similarity = cosine_similarity(vectors)


In [201]:
MOVIES[MOVIES['tags']=="Avatar"].index[0]

IndexError: index 0 is out of bounds for axis 0 with size 0

In [227]:
def recommend(movie):
    movie_index = MOVIES[MOVIES['title']==movie].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(MOVIES.iloc[i[0]].title)


In [232]:
recommend("Hero")

Malone
Slow Burn
Patriot Games
Nixon
Jason Bourne


In [233]:
MOVIES.to_csv('cleaned_movies.csv', index=False)


- The MOVIES dataset is good but it only has hollywood movies so right now takin one bollywood movies dataset and after preprocessing bollywood dataset will concatinate with MOVIES

In [8]:
bollywood = pd.read_csv('BollywoodMovieDetail.csv')
bollywood.shape

(1284, 10)

- Going to keep all the same columns as the previous movie Dataset. One flaw that this movie has is it doesn't have overview so i have to get the overview from api if i can find or else have to deal with it without api.

In [34]:
bollywood.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1284 entries, 0 to 1283
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   imdbId       1284 non-null   object 
 1   title        1284 non-null   object 
 2   releaseYear  1284 non-null   int64  
 3   releaseDate  1231 non-null   object 
 4   genre        1282 non-null   object 
 5   writers      1165 non-null   object 
 6   actors       1281 non-null   object 
 7   directors    1280 non-null   object 
 8   sequel       1281 non-null   float64
 9   hitFlop      1284 non-null   int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 100.4+ KB


In [9]:
bollywood = bollywood[['imdbId','title','genre','actors','directors']]

In [10]:
bollywood.isnull().sum()
bollywood.dropna(inplace=True)

- Now cleaning the columns for commas or pipe [|,]

In [11]:
bollywood['title'] = bollywood['title'].str.strip().str.lower()


In [38]:
bollywood['genre'] = bollywood['genre'].str.replace(r'[|,]', ' ', regex=True)
bollywood['actors'] = bollywood['actors'].str.replace(r'[|,]', ' ', regex=True)
bollywood['directors'] = bollywood['directors'].str.replace(r'[|,]', ' ', regex=True)


In [26]:
bollywood['genre'] = bollywood['genre'].apply(collapse)
bollywood['actors'] = bollywood['actors'].apply(collapse)
bollywood['directors'] = bollywood['directors'].apply(collapse)

In [15]:
bollywood.head()

Unnamed: 0,imdbId,title,genre,actors,directors
0,tt0118578,albela,romance,govinda aishwaryaraibachchan jackieshroff namr...,deepaksareen
1,tt0169102,lagaan: once upon a time in india,adventure drama musical,aamirkhan gracysingh rachelshelley paulblackth...,ashutoshgowariker
2,tt0187279,meri biwi ka jawab nahin,action comedy,akshaykumar sridevi gulshangrover laxmikantberde,pankajparashar s.m.iqbal
3,tt0222024,hum tumhare hain sanam,drama romance,shahrukhkhan madhuridixit salmankhan atulagnih...,k.s.adiyaman
4,tt0227194,one 2 ka 4,action comedy drama,shahrukhkhan juhichawla jackieshroff nirmalpandey,shashilalk.nair


In [13]:
def convertNames(name):
        name=name.split('|')
        clean_name=  []
        for i in name:
            word = i.strip().replace(' ','').lower()
            if word:
                  clean_name.append(word)
        return " ".join(clean_name)

In [14]:
bollywood['actors'] = bollywood['actors'].apply(convertNames)
bollywood['genre'] = bollywood['genre'].apply(convertNames)
bollywood['directors'] = bollywood['directors'].apply(convertNames)

In [59]:
pip install requests

Collecting requests
  Downloading requests-2.32.5-py3-none-any.whl.metadata (4.9 kB)
Collecting charset_normalizer<4,>=2 (from requests)
  Downloading charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl.metadata (38 kB)
Collecting idna<4,>=2.5 (from requests)
  Downloading idna-3.11-py3-none-any.whl.metadata (8.4 kB)
Collecting urllib3<3,>=1.21.1 (from requests)
  Downloading urllib3-2.5.0-py3-none-any.whl.metadata (6.5 kB)
Collecting certifi>=2017.4.17 (from requests)
  Downloading certifi-2025.10.5-py3-none-any.whl.metadata (2.5 kB)
Downloading requests-2.32.5-py3-none-any.whl (64 kB)
Downloading charset_normalizer-3.4.4-cp313-cp313-win_amd64.whl (107 kB)
Downloading idna-3.11-py3-none-any.whl (71 kB)
Downloading urllib3-2.5.0-py3-none-any.whl (129 kB)
Downloading certifi-2025.10.5-py3-none-any.whl (163 kB)
Installing collected packages: urllib3, idna, charset_normalizer, certifi, requests

   ---------------------------------------- 0/5 [urllib3]
   ---------------- -----------------


[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import requests
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from nltk.stem.porter import PorterStemmer
API_KEY = "118d023eb1bdf2ef152b4b5a220eee01"

def get_movie_overview(movie_name):
    try:
        url = f"https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&query={movie_name}"
        response = requests.get(url)
        data = response.json()
        
        # if results found
        if data['results']:
            overview = data['results'][0]['overview']
            return overview
        else:
            return ""
    except Exception as e:
        print(f"Error fetching overview for {movie_name}: {e}")
        return ""


In [18]:
bollywood['overview'] = bollywood['title'].apply(get_movie_overview)

In [23]:
bollywood['overview'] = bollywood['overview'].apply(lambda x: " ".join(x))


In [30]:
bollywood['tags'] = bollywood['genre']+' ' +bollywood['actors']+' ' + bollywood['directors']+' ' + bollywood['overview']

In [33]:
bollywood = bollywood[['imdbId', 'title', 'tags']]

In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=5000, stop_words='english')
vectorsB = tfid.fit_transform(bollywood['tags']).toarray()
similarityB = cosine_similarity(vectorsB)

In [41]:
def recommend(movie):
    movie_index = bollywood[bollywood['title']==movie.lower()].index[0]
    distanceB = similarityB[movie_index]
    movie_list = sorted(list(enumerate(distanceB)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(bollywood.iloc[i[0]].title)

In [43]:
recommend('Devdas')

zindagi tere naam
99
grahan
shudra the rising
lajja


In [44]:
MOVIES = pd.read_csv('cleaned_movies.csv')

In [47]:
MOVIES['title'] = MOVIES['title'].str.strip().str.lower()


In [45]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfid.fit_transform(MOVIES['tags']).toarray()
similarity = cosine_similarity(vectors)


In [48]:
def recommend(movie):
    movie_index = MOVIES[MOVIES['title']==movie.lower()].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(MOVIES.iloc[i[0]].title)

In [49]:
recommend('avatar')

aliens
star trek into darkness
meet dave
apollo 18
titan a.e.


In [50]:
final_Movies = pd.concat([MOVIES,bollywood],ignore_index=True)
final_Movies.drop_duplicates(subset='title', inplace=True)

In [54]:
final_Movies.head()

Unnamed: 0,id,title,tags,imdbId
0,2223.0,avatar,in the 22nd centuri a parapleg marin is dispat...,
1,2223.0,pirates of the caribbean: at world's end,captain barbossa long believ to be dead ha com...,
2,2223.0,spectre,a cryptic messag from bond past send him on a ...,
3,2223.0,the dark knight rises,follow the death of district attorney harvey d...,
4,2223.0,john carter,john carter is a warweari former militari capt...,


In [78]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=8000, stop_words='english')
vectors = tfid.fit_transform(final_Movies['tags']).toarray()
similarity = cosine_similarity(vectors)


In [79]:
def recommend(movie):
    if movie not in final_Movies['title'].str.lower().values:
        print(f"No search results for '{movie}'.")
        return
    
    movie_index = final_Movies[final_Movies['title']==movie.lower()].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(final_Movies.iloc[i[0]].title)

In [None]:
recommend('')

No search results for 'koi mil gaya'.


In [77]:
final_Movies.reset_index(drop=True, inplace=True)


Day 5

In [2]:
Movies = pd.read_csv('Main_Movies_Dataset.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=8000, stop_words='english')
vectors = tfid.fit_transform(Movies['tags']).toarray()
similarity = cosine_similarity(vectors)


In [None]:
def recommend(movie):
    if movie not in Movies['title'].str.lower().values:
        print(f"No search results for '{movie}'.")
        
        return
    
    movie_index = Movies[Movies['title']==movie.lower()].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(Movies.iloc[i[0]].title)

In [6]:
recommend('home')

meet dave
planet 51
battlefield earth
escape from planet earth
titan a.e.


import requests

API_KEY = "118d023eb1bdf2ef152b4b5a220eee01"

def get_movie_overview(movie_name):
    try:
        url = f"https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&query={movie_name}"
        response = requests.get(url)
        data = response.json()
        
        # if results found
        if data['results']:
            overview = data['results'][0]['overview']
            return overview
        else:
            return ""
    except Exception as e:
        print(f"Error fetching overview for {movie_name}: {e}")
        return ""

In [1]:
import requests
import pandas as pd
import numpy as np
import pickle
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import requests
from nltk.stem.porter import PorterStemmer
API_KEY = "118d023eb1bdf2ef152b4b5a220eee01"

In [1]:
pip install python-dotenv

Collecting python-dotenv
  Downloading python_dotenv-1.1.1-py3-none-any.whl.metadata (24 kB)
Downloading python_dotenv-1.1.1-py3-none-any.whl (20 kB)
Installing collected packages: python-dotenv
Successfully installed python-dotenv-1.1.1
Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 25.1.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [32]:
def getMovie(movie):
    try:
        url = f"https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&query={movie}"
        response = requests.get(url)
        data = response.json()
        if data['results']:
            print(f"{data['results'][0]['title']}")
            movie_id = data['results'][0]['id']
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}&append_to_response=credits,keywords"
            details = requests.get(details_url).json()
            return details
        else:
            return f"No search results for {movie}"
    except Exception as e:
        return (f"Error finding the {movie}: {e}")

In [33]:
details = getMovie("lagaan")

Lagaan: Once Upon a Time in India


[{'adult': False,
  'gender': 2,
  'id': 52763,
  'known_for_department': 'Acting',
  'name': 'Aamir Khan',
  'original_name': 'Aamir Khan',
  'popularity': 1.2512,
  'profile_path': '/iCBtJHaCmdashFEaFOyO0gSteJk.jpg',
  'cast_id': 1,
  'character': 'Bhuvan',
  'credit_id': '52fe47e99251416c750a99a9',
  'order': 0},
 {'adult': False,
  'gender': 1,
  'id': 85240,
  'known_for_department': 'Acting',
  'name': 'Gracy Singh',
  'original_name': 'Gracy Singh',
  'popularity': 1.0324,
  'profile_path': '/nJRYXqpctHNuswpesaSm6GFsyX0.jpg',
  'cast_id': 2,
  'character': 'Gauri',
  'credit_id': '52fe47e99251416c750a99ad',
  'order': 1},
 {'adult': False,
  'gender': 1,
  'id': 80385,
  'known_for_department': 'Acting',
  'name': 'Rachel Shelley',
  'original_name': 'Rachel Shelley',
  'popularity': 1.025,
  'profile_path': '/AoZAGf20aNUCqh2ujPxfyC1Sk5O.jpg',
  'cast_id': 3,
  'character': 'Elizabeth Russell',
  'credit_id': '52fe47e99251416c750a99b1',
  'order': 2},
 {'adult': False,
  'gender

In [73]:
def clean_Details(details):
    movie_id = details.get('id',None)
    title = details.get('title','').lower()
    
    overview = details.get('overview', '').lower()
    genres = [g['name'].lower().replace(" ", "") for g in details.get('genres', [])]
    keywords = [k['name'].lower().replace(" ", "") for k in details.get('keywords', {}).get('keywords', [])]
    cast = [c['name'].lower().replace(" ", "") for c in details.get('credits', {}).get('cast', [])[:5]]
    crew = [c['name'].lower().replace(" ", "") for c in details.get('credits', {}).get('crew', []) if c.get('job') == 'Director']
    tagline = details.get('tagline','').lower()

    tags = " ".join(genres + keywords + cast + crew + [overview]+[tagline])
    cleaned = pd.DataFrame([{
        'id': movie_id,
        'title': title,
        'tags': tags
    }])


    def stem(text):
        y=[]
        for i in text.split():
         y.append(ps.stem(i))

        return " ".join(y)

    cleaned['tags'] = cleaned['tags'].apply(stem)
    return cleaned

In [72]:
def recommend(movie):
    if movie not in Movies['title'].str.lower().values:
        details = getMovie(movie)
        if not details:
             return
        if details:
            new_Movie = clean_Details(details)
            Movies_dataSet = pd.concat([Movies_dataSet, new_Movie], ignore_index=True)


    movie_index = Movies[Movies['title']==movie.lower()].index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)),reverse=True,key=lambda x: x[1])

    for i in movie_list[1:6]:
        print(Movies.iloc[i[0]].title)

In [31]:
recommend('big hero')

Big Hero 6


KeyError: 'results'

In [17]:
bollywood = pd.read_csv('cleaned_bollywood.csv')

In [18]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem(text):
    y=[]
    for i in text.split():
        y.append(ps.stem(i))

    return " ".join(y)

In [19]:
bollywood['tags'] = bollywood['tags'].apply(stem)

In [20]:
hollywood = pd.read_csv('cleaned_movies.csv')
hollywood['title'] = hollywood['title'].str.strip().str.lower()

In [21]:
Moives_dataSet = pd.concat([hollywood,bollywood],ignore_index=True)
Moives_dataSet.drop_duplicates(subset='title', inplace=True)

In [22]:
Moives_dataSet.reset_index(drop=True, inplace=True)

In [23]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=8000, stop_words='english')
vectors = tfid.fit_transform(Moives_dataSet['tags']).toarray()
similarity = cosine_similarity(vectors)


In [24]:
def recommend(movie):
    global Moives_dataSet, similarity  # make sure we can update the dataset
    
    movie = movie.lower()  
    
    # Check if movie exists in current dataset
    movie_row = Moives_dataSet[Moives_dataSet['title'] == movie]

    if movie not in Moives_dataSet['title'].values:
        details = getMovie(movie)
        if not details:
            print(f"No search results for '{movie}'.")
            return
        
        new_Movie = clean_Details(details)
        # Add and reset index
        Moives_dataSet = pd.concat([Moives_dataSet, new_Movie], ignore_index=True)
        Moives_dataSet.drop_duplicates(subset='title', inplace=True)
        Moives_dataSet.reset_index(drop=True, inplace=True)
        
        # Recompute similarity for updated dataset
        vectors = tfid.fit_transform(Moives_dataSet['tags']).toarray()
        similarity = cosine_similarity(vectors)
    
    # Find index safely
    movie_row = Moives_dataSet[Moives_dataSet['title'] == movie]
    if movie_row.empty:
        print(f"Movie '{movie}' not found even after adding.")
        return
    
    movie_index = movie_row.index[0]
    distance = similarity[movie_index]
    
    # Get top 5 recommendations
    movie_list = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])[1:6]
    
    for i in movie_list:
        print(Moives_dataSet.iloc[i[0]].title)


In [51]:
recommend("the avengers")

Movies: (5209, 4)
Vectors: (5211, 8000)
Similarity: (5211, 5211)
avengers: age of ultron
avengers: endgame
captain america: the winter soldier
captain america: civil war
thor


In [127]:
Moives_dataSet.to_csv('Movies.csv',index=False)

Day 6

In [2]:
Movies = pd.read_csv('Movies.csv')

In [3]:
def getMovie(movie):
    try:
        url = f"https://api.themoviedb.org/3/search/movie?api_key={API_KEY}&query={movie}"
        response = requests.get(url)
        data = response.json()
        if data['results']:
            movie_id = data['results'][0]['id']
            details_url = f"https://api.themoviedb.org/3/movie/{movie_id}?api_key={API_KEY}&append_to_response=credits,keywords"
            details = requests.get(details_url).json()
            return details
        else:
            return f"No search results for {movie}"
    except Exception as e:
        return (f"Error finding the {movie}: {e}")

In [4]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def clean_Details(details):
    movie_id = details.get('id',None)
    title = details.get('title','').lower()
    
    overview = details.get('overview', '').lower()
    genres = [g['name'].lower().replace(" ", "") for g in details.get('genres', [])]
    keywords = [k['name'].lower().replace(" ", "") for k in details.get('keywords', {}).get('keywords', [])]
    cast = [c['name'].lower().replace(" ", "") for c in details.get('credits', {}).get('cast', [])[:5]]
    crew = [c['name'].lower().replace(" ", "") for c in details.get('credits', {}).get('crew', []) if c.get('job') == 'Director']
    tagline = details.get('tagline','').lower()

    tags = " ".join(genres + keywords + cast + crew + [overview]+[tagline])
    cleaned = pd.DataFrame([{
        'id': movie_id,
        'title': title,
        'tags': tags
    }])


    def stem(text):
        y=[]
        for i in text.split():
         y.append(ps.stem(i))

        return " ".join(y)

    cleaned['tags'] = cleaned['tags'].apply(stem)
    return cleaned

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

tfid = TfidfVectorizer(max_features=8000, stop_words='english')
vectors = tfid.fit_transform(Movies['tags']).toarray()
similarity = cosine_similarity(vectors)


In [6]:
def recommend(movie):
    global Movies, similarity,vectors
    
    movie = movie.lower()

    # If not in dataset
    if movie not in Movies['title'].values:
        details = getMovie(movie)
        if not details:
            print(f"No search results for '{movie}'.")
            return
        
        #details is a dict (valid API response)
        if isinstance(details, dict):
            new_Movie = clean_Details(details)
            new_tags = new_Movie['tags'].iloc[0]

            #tranform only the new movie tags and caluclate similarity with existing matrix
            new_vector = tfid.transform([new_tags]).toarray()
            new_sim = cosine_similarity(new_vector, vectors)[0]
            
            #append the new movie and vector
            Movies = pd.concat([Movies, new_Movie], ignore_index=True)
            Movies.drop_duplicates(subset='title', inplace=True)
            Movies.reset_index(drop=True, inplace=True)
            vectors = np.vstack([vectors, new_vector])

            #adding the new sim to the similarity row and column
            similarity = np.vstack([similarity, new_sim])
            new_col = np.append(new_sim, 1.0).reshape(-1, 1)
            similarity = np.hstack([similarity, new_col])

            Movies.to_csv("movies_data.csv", index=False)
            with open("vectors.pkl", "wb") as f:
                pickle.dump(vectors, f)
            with open("similarity.pkl", "wb") as f:
                pickle.dump(similarity, f)
        else:
            print(details)
            return
        


    movie_row = Movies[Movies['title'] == movie]
    if movie_row.empty:
        print("Do you mean","'",new_Movie['title'].iloc[0],"'"" ?")
        return

    movie_index = movie_row.index[0]
    distance = similarity[movie_index]
    movie_list = sorted(list(enumerate(distance)), reverse=True, key=lambda x: x[1])[1:6]
    print("Movies:", Movies.shape)
    print("Vectors:", vectors.shape)
    print("Similarity:", similarity.shape)
    for i in movie_list:
        print(Movies.iloc[i[0]].title)


In [7]:
recommend("A working man")

Movies: (5207, 4)
Vectors: (5207, 8000)
Similarity: (5207, 5207)
taken
edge of darkness
trade of innocents
homefront
heli
