In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from wordcloud import WordCloud
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.metrics.pairwise import linear_kernel

In [2]:
# Load the two CSV inputs used by this notebook.
# `data` feeds the vectorizers and the recommender below; `df` is only used
# further down to populate the title dropdown.
data = pd.read_csv('filtered_data.csv')
df = pd.read_csv('temp.csv')

# Show the available columns of the main table as the cell output.
data.columns

Index(['title', 'combined', 'popularity', 'release_date', 'director',
       'cleaned_text'],
      dtype='object')

In [3]:
# Bag-of-words features (unigrams + bigrams) over the cleaned text.
# min_df=3 drops terms that occur in fewer than three documents.
vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 2), min_df=3)

# An empty cleaned text is read back from CSV as NaN, which CountVectorizer
# rejects as an invalid document. Replace missing values with '' first, the
# same way the title vectorizer cell treats `data['title']`.
count_matrix = vectorizer.fit_transform(data['cleaned_text'].fillna(''))

In [4]:

# Pairwise cosine similarity between every pair of rows in `count_matrix`.
cosine_sim = cosine_similarity(count_matrix)

# Persist the similarity matrix to disk (no index column is written).
pd.DataFrame(data=cosine_sim).to_csv(
    path_or_buf="cosine_similarity_matrix.csv",
    index=False,
)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Separate TF-IDF model fitted on the movie titles only. recommend_movies()
# uses it as a fallback when the requested title has no exact match.
# Missing titles become '' and everything is lower-cased before fitting.
title_vectorizer = TfidfVectorizer(stop_words='english')
title_tfidf_matrix = title_vectorizer.fit_transform(
    data['title'].fillna('').str.lower()
)

In [6]:
def recommend_movies(movie_name, cosine__sim=cosine_sim, df=data, top_n=10, sort_by='popularity'):
    """Return the searched movie followed by its most similar movies.

    Parameters
    ----------
    movie_name : str
        Title to look up; matched case-insensitively after stripping
        surrounding whitespace.
    cosine__sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` must correspond to the ``i``-th
        row (by position) of ``df``.
    df : pandas.DataFrame
        Movie table with at least the columns ``title``, ``director`` and
        ``release_date``.
    top_n : int
        Number of similar movies (or closest title matches) to return.
    sort_by : str
        Column used to order the similar movies (descending). Ignored when
        the column is not present.

    Returns
    -------
    pandas.DataFrame
        Columns ``title``, ``director``, ``release_date``. On an exact title
        match the first row is the searched movie, followed by up to
        ``top_n`` similar movies. Otherwise a notice is printed and the
        ``top_n`` rows whose titles are closest to ``movie_name`` (TF-IDF
        cosine similarity, using the notebook-level ``title_vectorizer`` /
        ``title_tfidf_matrix``) are returned.
    """
    movie_name = movie_name.strip().lower()

    # Locate the movie by row *position*, not by index label: both the
    # similarity matrix and `.iloc` below are positional, so a label taken
    # from `.index` would be wrong for any frame without a default RangeIndex.
    is_match = (df['title'].str.lower() == movie_name).to_numpy(dtype=bool, na_value=False)
    positions = np.flatnonzero(is_match)

    if len(positions) == 0:
        print(f"'{movie_name}' not found in the dataset — showing closest matches by title similarity.")
        input_title_vec = title_vectorizer.transform([movie_name])
        title_scores = cosine_similarity(input_title_vec, title_tfidf_matrix)[0]
        # Stable sort on the negated scores: highest first, ties keep row order.
        ranked = np.argsort(-title_scores, kind='stable')[:top_n]
        recommended = df.iloc[ranked]
    else:
        pos = int(positions[0])  # first match wins when titles are duplicated
        scores = np.asarray(cosine__sim[pos], dtype=float)
        ranked = np.argsort(-scores, kind='stable')
        # Drop the searched movie explicitly instead of assuming it sorts
        # first: with tied scores (duplicate texts, or an all-zero feature
        # vector) slicing off element 0 would discard a different movie and
        # list the searched one twice.
        ranked = ranked[ranked != pos][:top_n]
        similar_movies = df.iloc[ranked]
        if sort_by in similar_movies.columns:
            similar_movies = similar_movies.sort_values(by=sort_by, ascending=False)
        searched_movie = df.iloc[[pos]]
        recommended = pd.concat([searched_movie, similar_movies])

    return recommended[['title', 'director','release_date']]


In [7]:
import ipywidgets as widgets
from IPython.display import display, clear_output

# Titles offered in the dropdown, de-duplicated and sorted.
# NOTE(review): these come from `df` (temp.csv) while recommend_movies()
# searches `data` by default — confirm both files list the same titles.
movie_titles = sorted(df['title'].dropna().unique())

# Dropdown raises if `value` is not one of `options`, so only preselect
# 'Jumanji' when it is actually available; otherwise fall back to the first
# title (or to no selection when there are no titles at all).
preferred_title = 'Jumanji'
if preferred_title in movie_titles:
    default_title = preferred_title
elif movie_titles:
    default_title = movie_titles[0]
else:
    default_title = None

manual_input = widgets.Text(
    value='',
    description='Manual input:',
    placeholder='Type a movie title'
)
dropdown_input = widgets.Dropdown(
    options=movie_titles,
    description='Pick a title:',
    value=default_title
)
recommendation_list = widgets.Output()


def update_recommendations(change):
    """Refresh the recommendation table when either input widget changes.

    Free-text input takes precedence; when it is blank the dropdown
    selection is used instead. `change` is the notification payload passed
    by `observe` and is not used.
    """
    with recommendation_list:
        clear_output()
        title = manual_input.value.strip() or dropdown_input.value
        # Nothing typed and nothing selectable: leave the output empty.
        if title:
            display(recommend_movies(title))


manual_input.observe(update_recommendations, names='value')
dropdown_input.observe(update_recommendations, names='value')
display(manual_input, dropdown_input, recommendation_list)


Text(value='', description='Manual input:', placeholder='Type a movie title')

Dropdown(description='Pick a title:', index=1945, options=("'night, Mother", '...And God Created Woman', '...A…

Output()