In [1]:
import pandas as pd
import numpy as np

# !pip uninstall
# !pip install numpy==1.26.4
np.version.version

'1.26.4'

In [25]:
pwd

'c:\\Users\\mayan\\Desktop\\Portfolio Projects\\Recommender_Systems\\Zee_Movies_Recommender'

In [2]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv') 

In [3]:
movies = movies.merge(credits,on='title')
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]
movies.dropna(inplace=True)

In [4]:
# text.split() → quick & dirty
# word_tokenize(text) → clean, robust, NLP-ready
# If you’re doing POS tagging, lemmatization, or semantic vectorization, always use word_tokenize()

import nltk
# nltk.download('punkt')

from nltk.tokenize import word_tokenize
word_tokenize("Hello, world! I don't know.")

['Hello', ',', 'world', '!', 'I', 'do', "n't", 'know', '.']

In [5]:
# If you use pretrained embeddings (e.g., Word2Vec, BERT, SentenceTransformers):
# Then skip stemming/lemmatization
# Because those models already understand word forms and context — modifying words can hurt semantic meaning.
# However, removing stopwords and lowercasing can still help reduce noise. (but countvectors or tfidf automatically handle lowercasing and stopwords)
# You should not remove alphnumeric or numeric characters as some movie names or keywords may contain them

# By default, WordNet Lemmatizer assumes words to be nouns. For more accurate lemmatization, especially for verbs and adjectives, Part of Speech (POS) tagging is required.
# https://www.geeksforgeeks.org/machine-learning/python-lemmatization-approaches-with-examples/
# https://www.geeksforgeeks.org/nlp/nlp-part-of-speech-default-tagging/


import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet, stopwords
from nltk.stem import WordNetLemmatizer
import re
import pandas as pd

# Download required NLTK data (run once)

# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('stopwords')

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def clean_and_lemmatize(text):

    # Lowercase + remove non-alphabetic characters
    # text = re.sub(r'[^a-zA-Z\s]', ' ', text.lower())
    
    # Tokenize
    tokens = word_tokenize(text)
    
    # Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]
    
    # POS tagging
    tagged = pos_tag(tokens)
    
    # Lemmatize with POS
    lemmatized = [
        lemmatizer.lemmatize(word, pos='v' if tag.startswith('V') else 'n')
        for word, tag in tagged
    ]
    
    return lemmatized


movies['overview'] = movies['overview'].apply(clean_and_lemmatize)

In [6]:
# Below function converts string to list

import ast

def convert(text):
    L = []
    for i in ast.literal_eval(text):
        L.append(i['name']) 
    return L 

print(ast.literal_eval('[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'))

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(convert)
movies['cast'] = movies['cast'].apply(lambda x:x[0:5])

[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {'id': 878, 'name': 'Science Fiction'}]


In [7]:
def fetch_director(text):
    L = []
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            L.append(i['name'])
    return L 

movies['crew'] = movies['crew'].apply(fetch_director)

In [8]:
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ","") for i in x])
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ","") for i in x])

# def collapse(L):
#     L1 = []
#     for i in L:
#         L1.append(i.replace(" ",""))
#     return L1
# movies['cast'] = movies['cast'].apply(collapse)

movies.head()

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, 22nd, century, ,, paraplegic, Marine, dis...","[Action, Adventure, Fantasy, ScienceFiction]","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa, ,, long, believe, dead, ,,...","[Adventure, Fantasy, Action]","[ocean, drugabuse, exoticisland, eastindiatrad...","[JohnnyDepp, OrlandoBloom, KeiraKnightley, Ste...",[GoreVerbinski]
2,206647,Spectre,"[A, cryptic, message, Bond, ’, past, sends, tr...","[Action, Adventure, Crime]","[spy, basedonnovel, secretagent, sequel, mi6, ...","[DanielCraig, ChristophWaltz, LéaSeydoux, Ralp...",[SamMendes]
3,49026,The Dark Knight Rises,"[Following, death, District, Attorney, Harvey,...","[Action, Crime, Drama, Thriller]","[dccomics, crimefighter, terrorist, secretiden...","[ChristianBale, MichaelCaine, GaryOldman, Anne...",[ChristopherNolan]
4,49529,John Carter,"[John, Carter, war-weary, ,, former, military,...","[Action, Adventure, ScienceFiction]","[basedonnovel, mars, medallion, spacetravel, p...","[TaylorKitsch, LynnCollins, SamanthaMorton, Wi...",[AndrewStanton]


In [9]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
new = movies.drop(columns=['overview','genres','keywords','cast','crew'])
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

new.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"In 22nd century , paraplegic Marine dispatch m..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa , long believe dead , come ba..."
2,206647,Spectre,A cryptic message Bond ’ past sends trail unco...
3,49026,The Dark Knight Rises,Following death District Attorney Harvey Dent ...
4,49529,John Carter,"John Carter war-weary , former military captai..."


In [10]:
# CountVectorizer is giving better results than TfidfVectorizer in this case
# You should not remove alphnumeric or numeric characters as some movie names or keywords may contain them

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000,stop_words='english')
vectors = vectorizer.fit_transform(new['tags']).toarray()

# from sklearn.feature_extraction.text import TfidfVectorizer
# vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
# vectors = vectorizer.fit_transform(new['tags']).toarray()

In [12]:
list(vectorizer.get_feature_names_out())[:20]

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930s',
 '1940s',
 '1950s',
 '1960s',
 '1970s',
 '1980']

In [13]:
vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [14]:
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)

In [15]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04588315, 0.0270369 ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.04850713, 0.        ,
        0.        ],
       [0.0860309 , 0.06063391, 1.        , ..., 0.05      , 0.        ,
        0.        ],
       ...,
       [0.04588315, 0.04850713, 0.05      , ..., 1.        , 0.07071068,
        0.04417261],
       [0.0270369 , 0.        , 0.        , ..., 0.07071068, 1.        ,
        0.05205792],
       [0.        , 0.        , 0.        , ..., 0.04417261, 0.05205792,
        1.        ]])

In [16]:
new[new['title'] == 'The Lego Movie'].index[0]

744

In [17]:
def recommend(movie):
    index = new[new['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])),reverse=True,key = lambda x: x[1])
    for i in distances[1:6]:
        print(new.iloc[i[0]].title)

In [18]:
recommend('Titanic')

The Notebook
Ghost Ship
Captain Phillips
Poseidon
Supernova


In [19]:
recommend('Yeh Jawaani Hai Deewani')

The Fall of the Roman Empire
Coriolanus
The Heart of Me
Pompeii
The House of Mirth


In [20]:
recommend('Gandhi')

Gandhi, My Father
The Wind That Shakes the Barley
A Passage to India
Guiana 1838
Ramanujan


In [None]:
import pickle, gzip
pickle.dump(new,open('movie_list.pkl','wb'))
# pickle.dump(similarity,open('similarity.pkl','wb'))

with gzip.open("similarity.pkl.gz", "wb") as f:
    pickle.dump(similarity, f)

In [None]:
# for loading compressed file

# import gzip, pickle
# with gzip.open("similarity_compressed.pkl.gz", "rb") as f:
#     similarity = pickle.load(f)

In [None]:
# below command to generate requirements.txt

# pip install pipreqs
!pipreqs . --force


INFO: Not scanning for jupyter notebooks.
Please, verify manually the final list of requirements.txt to avoid possible dependency confusions.
INFO: Successfully saved requirements file in .\requirements.txt


In [None]:
# Advanced Approach for text embeddings (pretrained models, no stemming/lemmatization needed):
# 
# ➡️ Sentence-BERT (SBERT) — model: all-MiniLM-L6-v2
# Captures semantic meaning of entire sentences/plots
# Lightweight (only ~80 MB) faster than larger BERT models
# Fast to encode thousands of movies
# Works great with cosine_similarity
# Used in many professional recommender systems

# But we are not using it here because of resource constraints in this environment.
# Have to install additional libraries which may not be supported here.




# from sentence_transformers import SentenceTransformer
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# model = SentenceTransformer('all-MiniLM-L6-v2')

# # Example: encode movie descriptions/tags
# embeddings = model.encode(df['tags'].tolist(), normalize_embeddings=True)

# # Compute similarity
# similarity = cosine_similarity(embeddings)

# def recommend(movie_title):
#     idx = df[df['title'] == movie_title].index[0]
#     distances = sorted(list(enumerate(similarity[idx])), key=lambda x: x[1], reverse=True)
#     for i in distances[1:6]:
#         print(df.iloc[i[0]].title)
