# Content-Based Movie Recommendation System

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer
import ipywidgets as widgets
from IPython.display import display, HTML




### Dataset
For the dataset, I have used a Wikipedia movie-plots dataset, which contains each movie's name, origin, director, plot, and other fields. I have kept only the movie name and the plot; the plot is the text that is matched against the user input.

In [2]:
# Load dataset
DATA_PATH = 'dataset/wiki_movie_plots_deduped.csv'
MAX_MOVIES = 500  # number of movies kept from the top of the file

df = pd.read_csv(DATA_PATH)
# Keep only relevant columns, drop null values, and cap the corpus size.
# .copy() detaches the result from the intermediate frame, so adding columns
# to `df` later does not raise SettingWithCopyWarning.
df = df[['Title', 'Plot']].dropna().head(MAX_MOVIES).copy()

### Methodology
The text is first preprocessed: it is converted to lowercase, and special characters and stopwords are removed.
For the content-based recommendation, the two main methods that I have used are the TF-IDF vectorizer and SBERT:
- TF-IDF is a method that finds the most important words in a text and assigns weights to them, so distinctive words can help identify certain genres of movies and align them with the user input

- Sentence-BERT (SBERT) is a version of BERT (a deep learning model trained on massive data) designed to find similarities between entire sentences, instead of just words, and it finds similarities between the sentences using cosine similarity.

In [3]:
# Preprocess the text: conversion to lowercase, removing special characters and stopwords
_SPECIAL_CHARS_RE = re.compile(r'[^a-zA-Z0-9\s]')
_stop_words_cache = None


def _english_stop_words():
    """Return the NLTK English stop-word set, reading the corpus only on first use."""
    global _stop_words_cache
    if _stop_words_cache is None:
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def preprocess_text(text):
    """Normalise a piece of text for similarity matching.

    The text is lowercased, every character that is not an ASCII letter,
    digit or whitespace is deleted (so "hero's" becomes "heros"), and
    English stop words are dropped.

    Args:
        text: The raw string to clean.

    Returns:
        The remaining words joined by single spaces.
    """
    text = _SPECIAL_CHARS_RE.sub('', text.lower())
    # The stop-word set is cached: this function runs once per row, and
    # rebuilding the set from the corpus on each call dominates the runtime.
    stop_words = _english_stop_words()
    return ' '.join(word for word in text.split() if word not in stop_words)
# Store the cleaned version of every plot next to the raw text.
df['Processed_Plot'] = df['Plot'].map(preprocess_text)

# Compute TF-IDF vectors and cosine similarity.
def compute_tfidf_similarity(user_query, df):
    """Score every plot against the query using TF-IDF vectors.

    A TfidfVectorizer (English stop words removed) is fitted on
    df['Processed_Plot'], the query is projected into the same vocabulary,
    and the cosine similarity between the query and each plot is computed.

    Args:
        user_query: The free-text description to match.
        df: DataFrame with a 'Processed_Plot' column.

    Returns:
        A flat array of similarity scores, in the row order of df.
    """
    tfidf = TfidfVectorizer(stop_words='english')
    plot_vectors = tfidf.fit_transform(df['Processed_Plot'])
    query_vector = tfidf.transform([user_query])
    return cosine_similarity(query_vector, plot_vectors).flatten()

# Compute SBERT embeddings and similarity
_SBERT_MODEL_NAME = 'all-MiniLM-L6-v2'
# Caches the loaded model and the embeddings of the most recently seen corpus,
# so repeated queries do not reload the model or re-encode every plot.
_sbert_cache = {'model': None, 'plots': None, 'embeddings': None}


def _get_sbert_model():
    """Return the SentenceTransformer model, constructing it only on first use."""
    if _sbert_cache['model'] is None:
        _sbert_cache['model'] = SentenceTransformer(_SBERT_MODEL_NAME)
    return _sbert_cache['model']


def _get_plot_embeddings(plots):
    """Return embeddings for `plots`, re-encoding only when the texts changed."""
    corpus_key = tuple(plots)
    if _sbert_cache['plots'] != corpus_key:
        _sbert_cache['embeddings'] = _get_sbert_model().encode(plots)
        _sbert_cache['plots'] = corpus_key
    return _sbert_cache['embeddings']


def compute_sbert_similarity(user_query, df):
    """Score every plot against the query using SBERT sentence embeddings.

    The plots in df['Processed_Plot'] and the query are encoded with the
    'all-MiniLM-L6-v2' model and compared by cosine similarity. The model
    and the plot embeddings are cached between calls; the plots are encoded
    again only if their text differs from the previous call.

    Args:
        user_query: The free-text description to match.
        df: DataFrame with a 'Processed_Plot' column.

    Returns:
        A flat array of similarity scores, in the row order of df.
    """
    sbert_model = _get_sbert_model()
    embeddings = _get_plot_embeddings(df['Processed_Plot'].tolist())
    user_embedding = sbert_model.encode([user_query])
    return cosine_similarity(user_embedding, embeddings).flatten()

    
# Hybrid approach combining TF-IDF and SBERT similarity, using weighted combination of similarity scores
def _min_max_scale(scores):
    """Rescale scores linearly to [0, 1]; a constant vector is returned unchanged."""
    lowest, highest = np.min(scores), np.max(scores)
    if highest == lowest:
        # No spread to normalise (avoids dividing by zero).
        return scores
    return (scores - lowest) / (highest - lowest)


def recommend_movies(user_query, df, top_n=5, weight_tfidf=0.5, weight_sbert=0.5):
    """Rank movies by a weighted blend of TF-IDF and SBERT similarity.

    Both score vectors are min-max scaled before blending so that neither
    method dominates purely because of its score range.

    Args:
        user_query: Free-text description of the kind of movie wanted.
        df: DataFrame with 'Title' and 'Processed_Plot' columns.
        top_n: Number of recommendations to return.
        weight_tfidf: Weight applied to the scaled TF-IDF scores.
        weight_sbert: Weight applied to the scaled SBERT scores.

    Returns:
        A new DataFrame with columns 'Title' and 'Similarity Score',
        ordered from best to worst match. `df` itself is not modified.
    """
    # NOTE(review): the plots are run through preprocess_text but user_query
    # is used as typed - confirm whether the query should be cleaned too.
    tfidf_scores = _min_max_scale(compute_tfidf_similarity(user_query, df))
    sbert_scores = _min_max_scale(compute_sbert_similarity(user_query, df))

    # Weighted sum of similarities
    final_scores = (weight_tfidf * tfidf_scores) + (weight_sbert * sbert_scores)
    top_indices = np.argsort(final_scores)[::-1][:top_n]

    # .assign builds a fresh frame; adding the column with item assignment on
    # the chained selection would raise SettingWithCopyWarning.
    return df.iloc[top_indices][['Title']].assign(
        **{'Similarity Score': final_scores[top_indices]}
    )

### User Input and Output 

In [4]:
# Interactive widgets: a results area plus a free-text box for the query
output_area = widgets.Output()

query_input = widgets.Textarea(
    layout=widgets.Layout(height='100px', width='80%'),
    placeholder='Enter a description of the movie you like',
)

def on_button_click(b):
    """Button callback: recommend movies for the text currently in query_input.

    Args:
        b: The button instance passed by ipywidgets (unused).
    """
    user_query = query_input.value.strip()

    # All work happens inside the output context so that anything displayed,
    # including an error raised by the recommender, lands in output_area
    # rather than being lost.
    with output_area:
        output_area.clear_output()

        # A blank query carries no information to match against.
        if not user_query:
            display(HTML('<em>Please enter a movie description first.</em>'))
            return

        results = recommend_movies(user_query, df)
        # Format output in HTML table for better display
        display(HTML(results.to_html(index=False)))

# Create the trigger button and register the callback that runs on each click.
button = widgets.Button(description="Recommend!", button_style='success')
button.on_click(on_button_click)

# Render the query box, the button and the results area, in that order.
display(query_input, button, output_area)

Textarea(value='', layout=Layout(height='100px', width='80%'), placeholder='Enter a description of the movie y…

Button(button_style='success', description='Recommend!', style=ButtonStyle())

Output()