In [1]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
df = pd.read_csv("IMDB-Movie-Dataset(2023-1951).csv")
df.dropna(subset=['movie_name','genre', 'cast', 'director'], inplace=True)
df.reset_index(drop=True, inplace=True)
df

Unnamed: 0,movie_id,movie_name,year,genre,overview,director,cast
0,tt15354916,Jawan,2023,"Action, Thriller",A high-octane action thriller which outlines t...,Atlee,"Shah Rukh Khan, Nayanthara, Vijay Sethupathi, ..."
1,tt15748830,Jaane Jaan,2023,"Crime, Drama, Mystery",A single mother and her daughter who commit a ...,Sujoy Ghosh,"Kareena Kapoor, Jaideep Ahlawat, Vijay Varma, ..."
2,tt11663228,Jailer,2023,"Action, Comedy, Crime",A retired jailer goes on a manhunt to find his...,Nelson Dilipkumar,"Rajinikanth, Mohanlal, Shivarajkumar, Jackie S..."
3,tt14993250,Rocky Aur Rani Kii Prem Kahaani,2023,"Comedy, Drama, Family",Flamboyant Punjabi Rocky and intellectual Beng...,Karan Johar,"Ranveer Singh, Alia Bhatt, Dharmendra, Shabana..."
4,tt15732324,OMG 2,2023,"Comedy, Drama",An unhappy civilian asks the court to mandate ...,Amit Rai,"Pankaj Tripathi, Akshay Kumar, Yami Gautam, Pa..."
...,...,...,...,...,...,...,...
2194,tt11112474,Heeriye,,Thriller,Add a Plot,Subhash Ghai,"Shatrughan Sinha, Reena Roy, Ajit Khan, Premna..."
2195,tt0332766,Sur: The Melody of Life,2002,"Drama, Musical, Romance",A renowned music teacher mentors a promising y...,Tanuja Chandra,"Lucky Ali, Simone Singh, Achint Kaur, Ehsan Khan"
2196,tt8622232,Time to Dance,2021,"Musical, Romance",When a ballroom dancer's shot at a crucial tou...,Stanley D'Costa,"Sooraj Pancholi, Isabelle Kaif, Waluscha D'Sou..."
2197,tt0187351,Nigahen: Nagina Part II,1989,"Drama, Family, Fantasy",After the tragic deaths of his son Ajit and da...,Harmesh Malhotra,"Sunny Deol, Sridevi, Anupam Kher, Gulshan Grover"


In [5]:
df['tags'] = (
    df['genre'].astype(str) +" " +
    df['cast'].astype(str) + " " +
    df['director'].astype(str)
).str.replace(" ", "").str.lower()


In [9]:
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df['tags']).toarray()

In [10]:
similarity = cosine_similarity(vectors)

In [11]:
def recommend(movie_name):
    if movie_name not in df['movie_name'].values:
        return ['Movie not found in database']
    index = df[df['movie_name']==movie_name].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key= lambda x: x[1])
    recommendations = [df.iloc[i[0]]['movie_name'] for i in distances[1:11]]
    return recommendations

In [13]:
from sklearn.model_selection import train_test_split
import numpy as np

# Split dataset into training and test sets (80/20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create vectorizer and fit only on training data
cv = CountVectorizer(max_features=5000, stop_words='english')
train_vectors = cv.fit_transform(train_df['tags']).toarray()
test_vectors = cv.transform(test_df['tags']).toarray()

# Compute cosine similarity for both sets
train_similarity = cosine_similarity(train_vectors)
test_similarity = cosine_similarity(test_vectors)

# Check if similar movies remain consistent
def check_consistency(movie_name):
    if movie_name not in test_df['movie_name'].values:
        return "Movie not found in test data."
    test_index = test_df[test_df['movie_name'] == movie_name].index[0]
    distances = sorted(list(enumerate(test_similarity[test_index])), reverse=True, key=lambda x: x[1])
    top_5 = [test_df.iloc[i[0]]['movie_name'] for i in distances[1:6]]
    return top_5

# Example
check_consistency(test_df.iloc[0]['movie_name'])

["Rakshak India's Braves",
 'August 16 1947',
 'Brahmastra 2',
 'Maine Dil Tujhko Diya',
 'Badass Ravikumar']

In [12]:
pickle.dump(df, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))
pickle.dump(cv, open('model.pkl', 'wb'))