In [1]:
# === Mount Google Drive and Import Libraries ===
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings
import gc
import pickle

# NOTE(review): "ignore" with no category filter silences every warning,
# including any deprecation/future warnings raised by later cells.
warnings.filterwarnings("ignore")

#from google.colab import drive
#drive.mount('/content/drive')

# === Load Datasets ===
# Both CSV files are read with the 'latin' codec; the paths are absolute and
# machine-specific, so adjust them when running elsewhere.
MOVIES_CSV = r'C:\Users\Admin\Desktop\Integrated_Movie_Recommendation_System\rotten_tomatoes_movies.csv'
REVIEWS_CSV = r'C:\Users\Admin\Desktop\Integrated_Movie_Recommendation_System\rotten_tomatoes_movie_reviews.csv'

movies_df = pd.read_csv(MOVIES_CSV, encoding='latin')
reviews_df = pd.read_csv(REVIEWS_CSV, encoding='latin')

# Record the library versions this run used.
for lib_label, lib_module in (("numpy", np), ("pandas", pd), ("sklearn", sklearn)):
    print(lib_label, lib_module.__version__)

numpy 2.0.2
pandas 2.2.2
sklearn 1.6.1


In [None]:
# === Clean Reviews Dataset ===
# NOTE(review): fillna(0) writes 0 into every missing review field, text
# columns included, so a missing critic name or original score becomes 0.
reviews_df = reviews_df.fillna(0)
# Drop leftover index columns whose name starts with "Unnamed".
reviews_df = reviews_df.loc[:, ~reviews_df.columns.str.contains('^Unnamed')]

# === Optimize Data Types ===
movies_df['genre'] = movies_df['genre'].astype('category')
movies_df['title'] = movies_df['title'].astype('category')

# === Limit Dataset Size ===
# Only the first 80000 rows of each table are kept.
movies_df = movies_df.head(80000)
reviews_df = reviews_df.head(80000)

# === Merge Datasets ===
# Inner join on the shared movie 'id'.
merged_df = pd.merge(movies_df, reviews_df, on='id')
merged_df = merged_df.drop_duplicates()

# === Select Relevant Columns ===
merged_df = merged_df[['id', 'title', 'audienceScore', 'tomatoMeter', 'genre', 'rating',
                       'director', 'originalLanguage', 'releaseDateTheaters', 'runtimeMinutes',
                       'reviewId', 'criticName', 'isTopCritic', 'originalScore', 'scoreSentiment']]

# === Drop Duplicates ===
# Keeps the first row for each (title, genre, director) combination, i.e. one
# review row per movie survives this step.
merged_df = merged_df.drop_duplicates(subset=['title', 'genre', 'director'])

# === Handle Missing Values ===
# Assign the filled column back instead of calling fillna(inplace=True) on
# merged_df[col]: the chained in-place form operates on a temporary object and
# stops updating merged_df once pandas Copy-on-Write is active.
numeric_cols = ['audienceScore', 'tomatoMeter']
for col in numeric_cols:
    merged_df[col] = merged_df[col].fillna(merged_df[col].median())

categorical_cols = merged_df.select_dtypes(include='object').columns
for col in categorical_cols:
    mode_values = merged_df[col].mode()
    # mode() is empty when the whole column is missing; nothing to fill with then.
    if not mode_values.empty:
        merged_df[col] = merged_df[col].fillna(mode_values.iloc[0])

# === Normalize Scores ===
def min_max_normalize(column):
    """Rescale ``column`` linearly so its minimum maps to 0 and its maximum to 1.

    Parameters
    ----------
    column : pandas.Series (or any object exposing ``min``/``max`` and
        element-wise arithmetic)

    Returns
    -------
    Same kind of object as ``column`` with values in ``[0, 1]``. When every
    value is identical the range is zero; instead of dividing by zero (which
    would turn the whole column into NaN) all present values map to ``0.0``.
    """
    lowest = column.min()
    span = column.max() - lowest
    if span == 0:
        # Multiplying by 0.0 yields float zeros while keeping missing entries missing.
        return (column - lowest) * 0.0
    return (column - lowest) / span

# Rows still lacking either score are removed, then each score column is
# rescaled with min_max_normalize.
score_cols = ['audienceScore', 'tomatoMeter']
merged_df.dropna(subset=score_cols, inplace=True)
for score_col in score_cols:
    merged_df[score_col] = min_max_normalize(merged_df[score_col])

# === Convert originalScore to numeric ===
def parse_score(score):
    """Convert a raw review score into a number.

    Parameters
    ----------
    score : Any
        Raw value from the ``originalScore`` column.

    Returns
    -------
    * Non-string input is returned unchanged.
    * ``"a/b"`` fractions are rescaled to a 10-point scale (``a / b * 10``).
    * Plain non-negative decimal strings (``"7"``, ``"7.5"``) become floats.
    * Anything else (letter grades, malformed fractions, zero denominators)
      becomes ``np.nan``.
    """
    if not isinstance(score, str):
        return score

    score = score.strip()

    if '/' in score:
        try:
            num, denom = score.split('/')
            return float(num) / float(denom) * 10
        except (ValueError, ZeroDivisionError):
            # ValueError: wrong number of parts or non-numeric parts;
            # ZeroDivisionError: a denominator of 0.
            return np.nan

    if score.replace('.', '', 1).isdigit():
        try:
            return float(score)
        except ValueError:
            # isdigit() accepts characters (e.g. superscript digits) that float() rejects.
            return np.nan

    return np.nan

merged_df['numeric_score'] = merged_df['originalScore'].apply(parse_score)

# === Handle genre and director missing ===
# Both columns end up as categoricals whose missing entries read "Unknown".
# add_categories raises ValueError when the label already exists, so the
# category is only added when it is absent.
for col in ('genre', 'director'):
    as_category = merged_df[col].astype('category')
    if 'Unknown' not in as_category.cat.categories:
        as_category = as_category.cat.add_categories('Unknown')
    merged_df[col] = as_category.fillna("Unknown")




In [3]:
# === Step 1: Content-Based Filtering ===
def build_content_similarity(df):
    """Build a movie-by-movie cosine-similarity matrix from text metadata.

    The title, genre, director and rating of each row are joined into one
    string, vectorised with TF-IDF (English stop words removed, at most 5000
    features) and compared pairwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``title``, ``genre``, ``director`` and ``rating``.

    Returns
    -------
    (content_sim, df) : tuple
        ``content_sim`` is the dense square similarity matrix; ``df`` is a copy
        of the input with a fresh 0..n-1 index and an added
        ``combined_features`` column. Row ``i`` of the matrix corresponds to
        row ``i`` of the returned frame.
    """
    df = df.reset_index(drop=True).copy()

    # Blank out missing values *before* converting to text. Converting first
    # turns them into the literal string "nan", which would then act as a
    # shared token between otherwise unrelated movies.
    text_parts = [
        df[col].astype(object).where(df[col].notna(), "").astype(str)
        for col in ('title', 'genre', 'director', 'rating')
    ]
    combined = text_parts[0]
    for part in text_parts[1:]:
        combined = combined + " " + part
    df['combined_features'] = combined

    vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
    feature_matrix = vectorizer.fit_transform(df['combined_features'])

    content_sim = cosine_similarity(feature_matrix)

    # The sparse feature matrix is no longer needed once similarities exist.
    del feature_matrix
    gc.collect()

    return content_sim, df

# === Step 2: Collaborative Filtering ===
def build_collaborative_similarity(df):
    """Build a title-by-title cosine-similarity matrix from critic scores.

    A title x critic table of mean ``numeric_score`` is formed (missing cells
    filled with 0) and its rows are compared pairwise.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``title``, ``criticName`` and ``numeric_score``.

    Returns
    -------
    (collab_sim, title_to_index, titles) : tuple
        ``collab_sim`` is the square similarity matrix, ``title_to_index`` maps
        each title to its row/column in that matrix, and ``titles`` lists the
        titles in matrix order. When no title has a usable score, an empty
        ``(0, 0)`` matrix, an empty dict and an empty list are returned.
    """
    # observed=True restricts grouping to categories actually present; the
    # deprecated default expands categorical keys to every declared category
    # before the all-missing rows are dropped again.
    pivot = df.pivot_table(
        index='title',
        columns='criticName',
        values='numeric_score',
        aggfunc='mean',
        observed=True,
    ).fillna(0)

    if pivot.empty:
        # cosine_similarity rejects input with zero samples.
        return np.zeros((0, 0)), {}, []

    collab_sim = cosine_similarity(pivot)
    titles = list(pivot.index)
    title_to_index = {title: i for i, title in enumerate(titles)}
    return collab_sim, title_to_index, titles

# === Step 3: Hybrid Recommender ===
def hybrid_recommend(keyword, df, content_sim, collab_sim, title_to_collab_index, collab_titles, top_n=10):
    """Recommend movies related to every row matching ``keyword``.

    A row matches when ``keyword`` occurs (case-insensitively, as literal
    text) in its title, genre, director or rating. For each matching row the
    content-similarity row is blended 50/50 with the collaborative-similarity
    row when the title is known to the collaborative model, otherwise the
    content row is used alone. The blended rows are summed and the ``top_n``
    highest-scoring movies are returned.

    Parameters
    ----------
    keyword : str
        Text to search for. It is matched literally, not as a regular
        expression, so titles containing ``(``, ``?``, ``+`` etc. work.
    df : pandas.DataFrame
        Movie table whose row order matches ``content_sim``.
    content_sim : array-like, shape (len(df), len(df))
    collab_sim : array-like, square, indexed by ``title_to_collab_index`` values
    title_to_collab_index : dict
        Maps a title to its row in ``collab_sim``.
    collab_titles : list
        Unused; kept for interface compatibility.
    top_n : int
        Number of rows to return (coerced with ``int``).

    Returns
    -------
    pandas.DataFrame with the display columns of the recommended movies, or a
    ``str`` message when nothing matches.
    """
    no_match_message = f"No movies found containing the keyword '{keyword}'!"
    if keyword is None:
        return no_match_message
    keyword = str(keyword)
    top_n = int(top_n)  # UI sliders may deliver floats; slices need integers

    match_mask = (
        df['title'].str.contains(keyword, case=False, na=False, regex=False) |
        df['genre'].str.contains(keyword, case=False, na=False, regex=False) |
        df['director'].str.contains(keyword, case=False, na=False, regex=False) |
        df['rating'].astype(str).str.contains(keyword, case=False, na=False, regex=False)
    )
    # Work with row positions so the lookup into content_sim stays correct
    # regardless of the frame's index labels.
    matched_positions = np.flatnonzero(match_mask.to_numpy(dtype=bool))

    if matched_positions.size == 0:
        return no_match_message

    n_movies = len(df)

    # Position of every movie inside the collaborative matrix (-1 when absent).
    # This depends only on df, so it is computed once rather than per match.
    collab_pos = np.array(
        [title_to_collab_index.get(title, -1) for title in df['title'].tolist()],
        dtype=int,
    )
    has_collab = collab_pos >= 0

    hybrid_scores = np.zeros(n_movies)
    for pos in matched_positions:
        content_scores = content_sim[pos]

        if has_collab[pos]:
            collab_row = np.asarray(collab_sim[collab_pos[pos]])
            # Spread the collaborative scores onto df's row order; movies the
            # collaborative model does not know keep a score of 0.
            collab_scores_resized = np.zeros(n_movies)
            collab_scores_resized[has_collab] = collab_row[collab_pos[has_collab]]
            combined = 0.5 * content_scores + 0.5 * collab_scores_resized
        else:
            combined = content_scores

        hybrid_scores += combined

    recommended_indices = hybrid_scores.argsort()[::-1][:top_n]
    recommendations = df.iloc[recommended_indices]

    return recommendations[['title', 'genre', 'director', 'rating', 'audienceScore', 'tomatoMeter',
                            'originalLanguage', 'releaseDateTheaters', 'runtimeMinutes']]

# === Build Models ===
# content_df is the re-indexed copy returned by build_content_similarity; it is
# the frame whose row order lines up with content_sim.
content_sim, content_df = build_content_similarity(merged_df)
# title_to_index maps a title to its row in collab_sim; collab_titles lists the
# titles in that same order.
collab_sim, title_to_index, collab_titles = build_collaborative_similarity(merged_df)

# === Recommend Function ===
def print_hybrid_recommendations(keyword, top_n=10):
    """Print the hybrid recommendations for ``keyword``, one movie per line.

    Uses the module-level models (``content_df``, ``content_sim``,
    ``collab_sim``, ``title_to_index``, ``collab_titles``). When
    ``hybrid_recommend`` returns a message string instead of a table, that
    message is printed as-is.
    """
    print(f"\n=== Hybrid Recommendations for: '{keyword}' ===")
    outcome = hybrid_recommend(keyword, content_df, content_sim, collab_sim, title_to_index, collab_titles, top_n)

    # A string result is the "nothing matched" message: show it and stop.
    if isinstance(outcome, str):
        print(outcome)
        return

    for _, movie in outcome.iterrows():
        fields = [
            f"{movie['title']}",
            f"Genre: {movie['genre']}",
            f"Director: {movie['director']}",
            f"Rating: {movie['rating']}",
            f"Audience Score: {movie['audienceScore']:.2f}",
            f"Tomato Meter: {movie['tomatoMeter']:.2f}",
            f"Language: {movie['originalLanguage']}",
            f"Year: {movie['releaseDateTheaters']}",
            f"Runtime: {movie['runtimeMinutes']} mins",
        ]
        print(" | ".join(fields))

# Example run: print the ten highest-scoring movies for the keyword "Action".
print_hybrid_recommendations("Action", top_n=10)


=== Hybrid Recommendations for: 'Action' ===
Ala Vaikunthapurramuloo | Genre: Action, Drama | Director: Trivikram Srinivas | Rating: R | Audience Score: 0.76 | Tomato Meter: 1.00 | Language: Telugu | Year: 07-12-2018 | Runtime: nan mins
The Last Mercenary | Genre: Action, Comedy | Director: David Charhon | Rating: R | Audience Score: 0.38 | Tomato Meter: 0.58 | Language: French (France) | Year: 07-12-2018 | Runtime: 110.0 mins
Bartkowiak | Genre: Drama, Action, Adventure | Director: Daniel Markowicz | Rating: R | Audience Score: 0.21 | Tomato Meter: 0.74 | Language: Polish | Year: 07-12-2018 | Runtime: 91.0 mins
Rollerball | Genre: Action | Director: John McTiernan | Rating: PG-13 | Audience Score: 0.14 | Tomato Meter: 0.03 | Language: English | Year: 08-02-2002 | Runtime: 98.0 mins
Christopher | Genre: Action, Crime, Drama, Mystery & thriller | Director: B. Unnikrishnan | Rating: R | Audience Score: 0.60 | Tomato Meter: 0.74 | Language: Malayalam | Year: 07-12-2018 | Runtime: 150.0 m

In [4]:
%pip install gradio

Note: you may need to restart the kernel to use updated packages.


In [5]:
# === Gradio Frontend ===
import gradio as gr

In [6]:
# Dropdown choices: every distinct non-missing title, genre, director and
# rating value, rendered as text and sorted.
keyword_list = sorted({
    str(value)
    for column in ('title', 'genre', 'director', 'rating')
    for value in content_df[column].dropna().tolist()
})

def hybrid_recommend_gradio(keyword, top_n):
    """Gradio callback: return recommendations for ``keyword`` as a table.

    Parameters
    ----------
    keyword : str
        Value chosen or typed in the dropdown.
    top_n : int or float
        Value of the slider; coerced to ``int`` before use.

    Returns
    -------
    pandas.DataFrame
        The recommendations with display-friendly column names. When nothing
        matches, a one-row frame with a single ``Message`` column is returned:
        the output component is a Dataframe, and a bare string handed to it
        would not be displayed as text.
    """
    result = hybrid_recommend(keyword, content_df, content_sim, collab_sim, title_to_index, collab_titles, int(top_n))
    if isinstance(result, str):
        return pd.DataFrame({'Message': [result]})

    result = result.rename(columns={
        'title': 'Title',
        'genre': 'Genre',
        'director': 'Director',
        'rating': 'Rating',
        'audienceScore': 'Audience Score',
        'tomatoMeter': 'Tomato Meter',
        'originalLanguage': 'Language',
        'releaseDateTheaters': 'Year',
        'runtimeMinutes': 'Runtime'
    })
    return result.reset_index(drop=True)

# Input widgets: a searchable dropdown (free text allowed) and a result-count slider.
keyword_input = gr.Dropdown(
    choices=keyword_list,
    label="Select Movie Title, Genre, Director, or Rating",
    allow_custom_value=True,
)
count_input = gr.Slider(minimum=5, maximum=20, step=1, label="Number of Recommendations", value=10)

iface = gr.Interface(
    fn=hybrid_recommend_gradio,
    inputs=[keyword_input, count_input],
    outputs=gr.Dataframe(),
    title="🎬 Integrated Movie Recommendation System",
    description="Select or enter a movie title, genre, director, or rating to get recommendations powered by content-based and collaborative filtering.",
)

# share=True asks Gradio to also create a public share link.
iface.launch(share=True)


* Running on local URL:  http://127.0.0.1:7860

Could not create share link. Missing file: C:\Users\Admin\.cache\huggingface\gradio\frpc\frpc_windows_amd64_v0.3. 

Please check your internet connection. This can happen if your antivirus software blocks the download of this file. You can install manually by following these steps: 

1. Download this file: https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_windows_amd64.exe
2. Rename the downloaded file to: frpc_windows_amd64_v0.3
3. Move the file to this location: C:\Users\Admin\.cache\huggingface\gradio\frpc


