## Imports

In [None]:
import kagglehub
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from collections import Counter
from mpl_toolkits.mplot3d import Axes3D
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

: 

## Dataset 1: Movies on Netflix, Prime Video, Hulu and Disney+

* Source: Kaggle  
* Link: [https://www.kaggle.com/datasets/...](https://www.kaggle.com/datasets/ruchi798/movies-on-netflix-prime-video-hulu-and-disney)  
* Format: CSV file    
* Size: ~1 MB  

**Details:** This dataset includes the following fields: movie title, release year, age rating, Rotten Tomatoes score, availability on Netflix, Prime Video, Hulu, and Disney+, and Type (Movie: 0, TV Show: 1).  

In [None]:
# Download the streaming-platforms dataset and report where its files are.
dataset_1_handle = "ruchi798/movies-on-netflix-prime-video-hulu-and-disney"
path = kagglehub.dataset_download(dataset_1_handle)
print("Path to dataset files:", path)

In [None]:
# Load the streaming-platforms CSV and preview its first few rows.
csv_path_1 = f"{path}/MoviesOnStreamingPlatforms.csv"
df_1 = pd.read_csv(csv_path_1)
df_1.head()

: 

In [None]:
# Basic dataframe info
n_rows, n_cols = df_1.shape
print("Number of Rows: ", n_rows)
print("Number of Columns: ", n_cols)
print("\nColumn info:\n")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df_1.info()

### Exploratory Data Analysis (EDA):
*   Visualize feature distributions
*   Create correlation heatmaps for numerical features
*   Explore relationships between features

In [None]:
# Visualize Feature Distributions

# Keep only the part of the score before '/' and store it as a float.
# Going through astype(str) + to_numeric keeps this cell safe to re-run:
# the .str accessor is not available once the column is already numeric.
df_1['Rotten Tomatoes'] = pd.to_numeric(
    df_1['Rotten Tomatoes'].astype(str).str.split('/').str[0],
    errors='coerce',
)

# Three side-by-side panels: release year, age rating, critic score.
fig, axes = plt.subplots(1, 3, figsize=(15, 5))

sns.histplot(df_1['Year'], bins=20, kde=True, ax=axes[0])
axes[0].set_title('Distribution of Year')

# Bars are ordered from the most to the least frequent age rating.
sns.countplot(data=df_1, x='Age', order=df_1['Age'].value_counts().index, ax=axes[1])
axes[1].set_title('Distribution of Age Ratings')

sns.histplot(df_1['Rotten Tomatoes'], bins=20, kde=True, ax=axes[2])
axes[2].set_title('Distribution of Rotten Tomatoes Scores')

fig.tight_layout()
plt.show()

In [None]:
# Correlation heatmap over the year, score and per-platform columns.
heatmap_columns = ['Year', 'Rotten Tomatoes', 'Netflix', 'Hulu', 'Prime Video', 'Disney+']
correlation_matrix = df_1[heatmap_columns].corr()

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(
    correlation_matrix,
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    linewidths=0.5,
    ax=ax,
)
ax.set_title('Correlation Heatmap')
plt.show()

In [None]:
# Explore Relationships Between Features

# Rotten Tomatoes Score vs. Streaming Platforms
# Use: Recommend highly rated movies based on the user's subscribed platforms (e.g., Netflix, Prime Video).
# 'Platforms' is the row-wise sum of the four per-platform columns.
df_1['Platforms'] = df_1[['Netflix', 'Hulu', 'Prime Video', 'Disney+']].sum(axis=1)
plt.figure(figsize=(10, 6))
sns.scatterplot(data=df_1, x='Platforms', y='Rotten Tomatoes', hue='Age', palette='Set1')
plt.title('Rotten Tomatoes Scores vs Number of Streaming Platforms')
plt.xlabel('Number of Streaming Platforms')
plt.ylabel('Rotten Tomatoes Score')
plt.legend(title='Age Rating')
plt.show()

# Rotten Tomatoes Score vs. Release Year
# Use: Recommend highly rated movies within certain eras that the user prefers
df_1 = df_1.copy()
# Re-derive the numeric score with one plain column assignment. Writing the
# intermediate strings through .loc[:, col] would push text into a column that
# may already be float, and the column can then end up with object dtype.
df_1['Rotten Tomatoes'] = pd.to_numeric(
    df_1['Rotten Tomatoes'].astype(str).str.split('/').str[0],
    errors='coerce',
)
# NOTE: this rebinding removes rows without a score or year from df_1 itself,
# so every later cell that reads df_1 only sees the remaining rows.
df_1 = df_1.dropna(subset=['Rotten Tomatoes', 'Year'])
plt.figure(figsize=(10, 6))
sns.scatterplot(x='Year', y='Rotten Tomatoes', data=df_1, alpha=0.6, color='blue')
plt.title('Rotten Tomatoes Score vs. Release Year')
plt.xlabel('Release Year')
plt.ylabel('Rotten Tomatoes Score')
plt.grid(True)
plt.show()

## Dataset 2: Full TMDB Movies Dataset 2024  

* Source: Kaggle
* Link: [Here](https://www.kaggle.com/datasets/asaniczka/tmdb-movies-dataset-2023-930k-movies)
* Format: CSV file
* Size: ~500 MB, containing extensive information on nearly 1 million movies, including genres, cast, crew, keywords, and other metadata.  

**Details:** This dataset provides a rich set of attributes for each movie. Important features include genres, movie synopsis, director, and popularity score.  

In [None]:
# Download the TMDB dataset; note that `path` is rebound to this dataset.
dataset_2_handle = "asaniczka/tmdb-movies-dataset-2023-930k-movies"
path = kagglehub.dataset_download(dataset_2_handle)
print("Path to dataset files:", path)

In [None]:
# Load the TMDB CSV and preview its first few rows.
csv_path_2 = f"{path}/TMDB_movie_dataset_v11.csv"
df_2 = pd.read_csv(csv_path_2)
df_2.head()

In [None]:
# Basic dataframe info
n_rows, n_cols = df_2.shape
print("Number of Rows: ", n_rows)
print("Number of Columns: ", n_cols)
print("\nColumn info:\n")
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df_2.info()

### Exploratory Data Analysis (EDA): 
*   Visualize feature distributions
*   Create correlation heatmaps for numerical features
*   Explore relationships between features

In [None]:
# Visualize Feature Distributions
# Each chart is drawn on its own full-width figure and shown before the next
# one starts, so no figure is left with an empty subplot slot.

# Distribution of IMDb (vote_average) Scores
plt.figure(figsize=(15, 5))
sns.histplot(df_2['vote_average'], bins=20, kde=True, color='teal')
plt.title('Distribution of IMDb Scores')
plt.xlabel('IMDb Score (Vote Average)')

plt.tight_layout()
plt.show()

# Distribution of Movies by genre
# 'genres' holds comma-separated names; explode to one row per (movie, genre).
df_genres = df_2.assign(genres=df_2['genres'].str.split(',')).explode('genres')

df_genres['genres'] = df_genres['genres'].str.strip()
df_genres = df_genres[df_genres['genres'].notna() & (df_genres['genres'] != '')]

# value_counts() sorts descending, so bars run from most to least common.
unique_genres = df_genres['genres'].value_counts().index

plt.figure(figsize=(15, 5))
sns.countplot(data=df_genres, x='genres', order=unique_genres, legend=False)
plt.xticks(rotation=90)
plt.title('Number of Movies by Genre')
plt.xlabel('Genre')
plt.ylabel('Number of Movies')

plt.tight_layout()
plt.show()

# Distribution of Movies by release year
plt.figure(figsize=(15, 5))
# Unparseable dates become NaT and are excluded from the histogram below.
df_2['release_year'] = pd.to_datetime(df_2['release_date'], errors='coerce').dt.year
sns.histplot(df_2['release_year'].dropna(), bins=20, kde=True, color='coral')
plt.title('Number of Movies by Release Year')
plt.xlabel('Release Year')
plt.ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Create a Correlation Heatmap for Numerical Features
# Work on a copy so the encoding below does not alter df_2.
df_encoded = df_2.copy()
df_encoded['adult'] = df_encoded['adult'].astype(int)
# errors='coerce' maps unparseable dates to NaT instead of raising, matching
# how release_year is derived in the distribution cell.
df_encoded['release_year'] = pd.to_datetime(df_encoded['release_date'], errors='coerce').dt.year
# Drop text/path columns; errors='ignore' tolerates any that are absent.
df_encoded = df_encoded.drop(
    columns=[
        'status',
        'release_date',
        'backdrop_path',
        'poster_path',
        'original_title',
        'overview',
        'keywords',
        'production_companies',
        'production_countries',
        'spoken_languages',
    ],
    errors='ignore',
)
numerical_features = df_encoded.select_dtypes(include=['float64', 'int64'])
correlation_matrix = numerical_features.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, fmt='.2f', cmap='coolwarm', square=True, linewidths=0.5)
plt.title('Correlation Heatmap')
plt.show()

In [None]:
# Explore Relationships Between Features

# IMDb Scores vs. Genres
# NOTE: this ranks the raw 'genres' strings, so a multi-genre combination
# counts as its own category here.
top_genres = df_2['genres'].value_counts().nlargest(10).index
filtered_df = df_2[df_2['genres'].isin(top_genres)]

plt.figure(figsize=(15, 6))
sns.boxplot(data=filtered_df, x='genres', y='vote_average', palette='viridis', hue='genres')
plt.xticks(rotation=45)
plt.title('IMDb Scores Distribution Across Top 10 Genres')
plt.xlabel('Genres')
plt.ylabel('IMDb Score')
plt.show()

# IMDb Scores vs. Revenue
# A log axis cannot place zero or negative values, so only rows with a
# positive revenue are plotted.
positive_revenue = df_2[df_2['revenue'] > 0]
plt.figure(figsize=(10, 6))
sns.scatterplot(data=positive_revenue, x='revenue', y='vote_average', alpha=0.6)
plt.title('IMDb Scores vs. Revenue')
plt.xscale('log')
plt.xlabel('Revenue (log scale)')
plt.ylabel('IMDb Score')
plt.grid()
plt.show()

# Release Year vs. IMDb Scores
# errors='coerce' maps unparseable dates to NaT instead of aborting the cell.
df_2['release_year'] = pd.to_datetime(df_2['release_date'], errors='coerce').dt.year
plt.figure(figsize=(10, 5))
sns.lineplot(data=df_2, x='release_year', y='vote_average', estimator='mean')
plt.title('Average IMDb Score Over Years')
plt.xlabel('Release Year')
plt.ylabel('Average IMDb Score')
plt.xticks(rotation=45)
plt.show()

## Merge the 2 Datasets 

In [None]:
# Join key: lower-cased title plus integer release year.
df_1['title_lower'] = df_1['Title'].str.lower()
df_2['title_lower'] = df_2['title'].str.lower()

# errors='coerce' turns unparseable dates into NaT, so they are dropped along
# with missing dates instead of aborting the cell.
df_2['Year'] = pd.to_datetime(df_2['release_date'], errors='coerce').dt.year
df_2.dropna(subset=['Year'], inplace=True)
df_2['Year'] = df_2['Year'].astype(int)

tmdb_columns = ['title_lower', 'genres', 'keywords', 'overview', 'production_countries', 'Year', 'spoken_languages']
# If several TMDB rows share a title and year, keep the first one: otherwise
# the inner join would emit the matching df_1 row once per TMDB duplicate.
tmdb_unique = df_2[tmdb_columns].drop_duplicates(subset=['title_lower', 'Year'])

df_merged = df_1.merge(
    tmdb_unique,
    on=['title_lower', 'Year'],
    how='inner'
)

# The helper key is only needed for the join.
df_merged.drop(columns=['title_lower'], inplace=True)

# New Rows vs Columns
df_merged.shape

: 

In [None]:
# Shows the first few rows of the merged dataset
df_merged.head()

## Movie Recommendations using KNN

To accomplish this, let's make a tag column for our text data, and then drop all unnecessary columns.

In [None]:
# Text columns that are folded into a single 'tags' string per movie.
text_columns = ['genres', 'keywords', 'overview', 'production_countries', 'spoken_languages']

# Missing text becomes '' so the string concatenation below never yields NaN.
df_merged[text_columns] = df_merged[text_columns].fillna('')

df_merged['tags'] = (
    df_merged['genres'] + ' ' +
    df_merged['keywords'] + ' ' +
    df_merged['overview'] + ' ' +
    df_merged['production_countries'] + ' ' +
    df_merged['spoken_languages']
)

# Trim the ends and collapse every whitespace run to one space. This uses the
# pandas string methods, so the cell does not depend on the `re` module.
df_merged['tags'] = (
    df_merged['tags']
    .str.strip()
    .str.replace(r'\s+', ' ', regex=True)
)

# The source columns now live inside 'tags'.
df_merged.drop(columns=text_columns, inplace=True)

Now, we make our TF-IDF Vectorizer.

TF-IDF is a numerical representation of text that reflects how important a word is in a document relative to the entire corpus.

In [None]:
# TF-IDF over unigrams and bigrams of the tag text, with the built-in English
# stop-word list applied.
tfidf_settings = {
    'stop_words': 'english',
    'ngram_range': (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_settings)
tfidf_matrix = vectorizer.fit_transform(df_merged['tags'])

Now, we make our KNN Model.

In [None]:
# Brute-force nearest-neighbour index using cosine distance on the TF-IDF rows.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
knn = NearestNeighbors(**knn_settings)
knn.fit(tfidf_matrix)

Here is the function to get the nearest neighbors.

In [None]:
def get_recs(movie_title, n_recs=5):
    """Return up to ``n_recs`` titles whose tag vectors are closest to a movie.

    Args:
        movie_title: Title to look up in ``df_merged['Title']``; matched
            case-insensitively. The first matching row is used.
        n_recs: Number of recommendations wanted.

    Returns:
        A list of titles ordered from nearest to farthest, never containing
        the query row itself. Empty if the title is unknown (a message is
        printed) or ``n_recs`` is not positive.
    """
    is_match = (df_merged['Title'].str.lower() == movie_title.lower()).to_numpy()
    matches = np.flatnonzero(is_match)
    if matches.size == 0:
        print("Movie not found")
        return []
    if n_recs <= 0:
        return []

    # Row position (not index label): tfidf_matrix is addressed positionally.
    position = int(matches[0])

    # Ask for one extra neighbour because the query row is part of the fitted
    # matrix and comes back among its own nearest neighbours.
    n_neighbors = min(n_recs + 1, tfidf_matrix.shape[0])
    _, indices = knn.kneighbors(tfidf_matrix[position], n_neighbors=n_neighbors)

    # Remove the query by position instead of assuming it is ranked first:
    # another row at distance 0 may be listed ahead of it.
    neighbour_positions = [i for i in indices.flatten() if i != position][:n_recs]

    return [df_merged.iloc[i]['Title'] for i in neighbour_positions]

Now, let's test our function and see how well it does.

In [None]:
# One sample title per genre to spot-check the recommender.
genres_to_test = [
    "Limitless",      # Sci-Fi
    "Mad Max",        # Action
    "Love actually",  # Romance
    "The Conjuring",  # Horror
    "Superbad",       # Comedy
]

separator = "-" * 50
for movie in genres_to_test:
    header = f"Recommendations for '{movie}':"
    recommendations = get_recs(movie, n_recs=5)
    print(header)
    print(recommendations)
    print(separator)

Nice, now let's try to evaluate how this model is actually doing - and how similar these recommendations actually are...

In [None]:
def plot_heatmap(movie_title, n_recs=5):
    """Draw a one-row heatmap of cosine distances to a movie's neighbours.

    Args:
        movie_title: Title to look up in ``df_merged['Title']``; matched
            case-insensitively. The first matching row is used.
        n_recs: Number of neighbouring movies to show.

    Returns:
        None. Prints a message and draws nothing if the title is unknown;
        draws nothing if no neighbour is available.
    """
    is_match = (df_merged['Title'].str.lower() == movie_title.lower()).to_numpy()
    matches = np.flatnonzero(is_match)
    if matches.size == 0:
        print("Movie not found")
        return
    if n_recs <= 0:
        return

    # Row position (not index label): tfidf_matrix is addressed positionally.
    position = int(matches[0])

    # One extra neighbour is requested because the query row is itself part
    # of the fitted matrix; it is filtered out below so that the chart only
    # lists other movies.
    n_neighbors = min(n_recs + 1, tfidf_matrix.shape[0])
    distances, indices = knn.kneighbors(tfidf_matrix[position], n_neighbors=n_neighbors)

    neighbours = [
        (distance, i)
        for distance, i in zip(distances.flatten(), indices.flatten())
        if i != position
    ][:n_recs]
    if not neighbours:
        return

    recommend_titles = [df_merged.iloc[i]['Title'] for _, i in neighbours]
    neighbour_distances = [distance for distance, _ in neighbours]

    sns.heatmap(
        [neighbour_distances],
        annot=True,
        fmt=".2f",
        xticklabels=recommend_titles,
        yticklabels=["Cosine Distances"],
        cmap="coolwarm"
    )

    plt.title(f"Distances from '{movie_title}' to recommended movies")
    plt.show()

# Same sample titles as before, now with a distance heatmap for each.
genres_to_test = [
    "Limitless",      # Sci-Fi
    "Mad Max",        # Action
    "Love actually",  # Romance
    "The Conjuring",  # Horror
    "Superbad",       # Comedy
]

separator = "-" * 50
for movie in genres_to_test:
    header = f"Recommendations for '{movie}':"
    print(header)
    print(get_recs(movie, n_recs=5))
    plot_heatmap(movie, n_recs=5)
    print(separator)

Not bad. Let's see if we can get a 2D projection of the movies, using principal component analysis (PCA) or t-distributed stochastic neighbor embedding (t-SNE).

t-SNE is a way to visualize high-dimensional data by placing each data point in a two- or three-dimensional map.

In [None]:
def plot_tsne_space(n_movies=100):
    """Scatter the first ``n_movies`` TF-IDF rows in a 2D t-SNE embedding.

    Each point is labelled with the corresponding ``df_merged['Title']``.

    Args:
        n_movies: How many leading rows of ``tfidf_matrix`` to embed.

    Raises:
        ValueError: If fewer than two rows are available to embed.
    """
    dense_subset = tfidf_matrix[:n_movies].toarray()
    n_samples = dense_subset.shape[0]
    if n_samples < 2:
        raise ValueError("t-SNE needs at least 2 movies to embed")

    # scikit-learn requires perplexity < n_samples. 30.0 is its default, so
    # the result is unchanged whenever more than 30 rows are embedded.
    perplexity = min(30.0, float(n_samples - 1))
    reduced_matrix = TSNE(
        n_components=2,
        random_state=42,
        perplexity=perplexity,
    ).fit_transform(dense_subset)

    plt.figure(figsize=(12, 8))
    plt.scatter(reduced_matrix[:, 0], reduced_matrix[:, 1], alpha=0.7)

    for i, title in enumerate(df_merged['Title'][:n_movies]):
        plt.text(reduced_matrix[i, 0], reduced_matrix[i, 1], title, fontsize=8)

    plt.title("Movie t-SNE Space Visualization")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()

def plot_PCA_space(n_movies=100):
    """Scatter the first ``n_movies`` TF-IDF rows on two principal components.

    Each point is labelled with the corresponding ``df_merged['Title']``.
    """
    dense_subset = tfidf_matrix[:n_movies].toarray()
    projector = PCA(n_components=2, random_state=42)
    reduced_matrix = projector.fit_transform(dense_subset)
    xs = reduced_matrix[:, 0]
    ys = reduced_matrix[:, 1]

    plt.figure(figsize=(12, 8))
    plt.scatter(xs, ys, alpha=0.7)

    titles = df_merged['Title'][:n_movies]
    for row, title in enumerate(titles):
        plt.text(xs[row], ys[row], title, fontsize=8)

    plt.title("Movie PCA Space Visualization")
    plt.xlabel("Component 1")
    plt.ylabel("Component 2")
    plt.show()

In [None]:
# Draw the t-SNE map using the function's default of 100 movies.
plot_tsne_space()

In [None]:
# Draw the PCA map using the function's default of 100 movies.
plot_PCA_space()

## Movie Recommendations using Regular K-Means

In [None]:
# NOTE: `df` is built by the "Model Training" cells further down in this
# notebook; those cells must be run before this one.
# NOTE(review): scikit-learn's KMeans uses init='k-means++' unless told
# otherwise, so this may not be plain random-init k-means - confirm intent.
k = 20
kmeans = KMeans(n_clusters=k, random_state=42)
# Cluster on the feature columns only: a 'Cluster' column left behind by an
# earlier clustering run would otherwise be fed back in as a feature.
features = df.drop(columns=['Cluster'], errors='ignore')
df['Cluster'] = kmeans.fit_predict(features)

In [None]:
# Reduce the features (without the label column) to two components and
# colour each point by its cluster label.
pca = PCA(n_components=2)
feature_frame = df.drop(columns=['Cluster'])
df_pca = pca.fit_transform(feature_frame)

first_component = df_pca[:, 0]
second_component = df_pca[:, 1]
plt.scatter(first_component, second_component, c=df['Cluster'], cmap='viridis')
plt.title('K-Means Clustering')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

In [None]:
# Copy the labels onto the reference frame (aligned on index) and print the
# first rows of every cluster.
df_reference['Cluster'] = df['Cluster']

preview_columns = ['Title', 'Year', 'Age', 'Rotten Tomatoes', 'tags']
for cluster in range(k):
    in_cluster = df_reference['Cluster'] == cluster
    members = df_reference[in_cluster][preview_columns]
    print(f"\nCluster {cluster}")
    print(members.head())

## Movie Recommendations using K-Means(++) Clustering

### User Input 

In [None]:
# Allowed answers for each user prompt.
ages = [
    "7+",
    "13+",
    "16+",
    "18+",
]
platforms = [
    "Netflix",
    "Hulu",
    "Prime Video",
    "Disney+",
]
regions = [
    "United States of America",
    "India",
    "United Kingdom",
    "China",
    "Australia",
    "Canada",
    "France",
    "Germany",
    "Italy",
    "New Zealand",
]
languages = [
    "English",
    "Spanish",
    "French",
    "German",
    "Japanese",
    "Indian",
    "Korean",
    "Mandarin",
    "Hindi",
    "Arabic",
    "Italian",
]

# Function to get validated input
def get_input(prompt, options):
    """Prompt repeatedly until the user enters one of ``options``.

    Surrounding whitespace is ignored and, when there is no exact match, the
    comparison falls back to a case-insensitive one.

    Args:
        prompt: Text passed to ``input()``.
        options: Sequence of accepted answer strings.

    Returns:
        The accepted option, spelled exactly as it appears in ``options``.
    """
    # Case-folded lookup so "netflix" resolves to "Netflix".
    by_folded = {option.casefold(): option for option in options}

    while True:
        print(f"Options: {', '.join(options)}")
        choice = input(prompt).strip()
        # An exact match wins, so options differing only by case stay distinct.
        if choice in options:
            return choice
        matched = by_folded.get(choice.casefold())
        if matched is not None:
            return matched
        print("Invalid choice. Please select from the options above.")

# Ask for each preference in turn; get_input only returns a listed option.
age = get_input("Please enter your age: ", ages)
platform = get_input("Please enter your streaming platform: ", platforms)
region = get_input("Please enter your region: ", regions)
language = get_input("Please enter your language: ", languages)

# Echo the selections back to the user.
selections = (
    ("Age", age),
    ("Platform", platform),
    ("Region", region),
    ("language", language),
)
print("\nYou selected:")
for label, value in selections:
    print(f"{label}: {value}")

### Model Training 

In [None]:
# Untouched copy of the merged data, kept for the later cluster previews and
# title lookups.
df_reference = df_merged.copy()

# Columns left out of the clustering features.
non_feature_columns = ['Unnamed: 0', 'ID', 'Title', 'Rotten Tomatoes']
df = df_merged.drop(columns=non_feature_columns)
df.head()

In [None]:
# One-hot encode the categorical columns; drop_first omits the first level of
# each column from the output.
categorical_columns = ['Age', 'Type', 'Platforms']
df = pd.get_dummies(
    data=df,
    columns=categorical_columns,
    drop_first=True,
)

In [None]:
# Limit to the top 50 words. English stop words are excluded (as in the KNN
# vectorizer) so the 50 slots are not spent on words like "the" and "of".
tfidf = TfidfVectorizer(max_features=50, stop_words='english')
tags_tfidf = tfidf.fit_transform(df['tags']).toarray()
# Reuse df's index so the column-wise concat pairs each TF-IDF row with the
# movie it came from, whatever index df carries.
tags_df = pd.DataFrame(
    tags_tfidf,
    columns=tfidf.get_feature_names_out(),
    index=df.index,
)
df = pd.concat([df.drop(columns=['tags']), tags_df], axis=1)

In [None]:
# Standardise 'Year', the only column passed to the scaler here.
scaler = StandardScaler()
year_scaled = scaler.fit_transform(df[['Year']])
df[['Year']] = year_scaled

In [None]:
# Remove rows whose values in every column duplicate an earlier row.
df = df.drop_duplicates()

### K-Means++

In [None]:
k = 15
# Cluster on the feature columns only: a 'Cluster' column written by an
# earlier clustering cell must not be fed back in as a feature.
features = df.drop(columns=['Cluster'], errors='ignore')
# NOTE(review): the split is kept so train_df / validation_df stay defined,
# but nothing in this cell uses them - the model is fit on every row.
train_df, validation_df = train_test_split(features, train_size=0.8, random_state=42)
kmeans_plus = KMeans(n_clusters=k, init='k-means++', random_state=39)
df['Cluster'] = kmeans_plus.fit_predict(features)

In [None]:
# Optional 2D view: project the features (label column excluded) onto two
# principal components and colour each point by cluster.
pca = PCA(n_components=2)
feature_frame = df.drop(columns=['Cluster'])
df_pca = pca.fit_transform(feature_frame)

first_component = df_pca[:, 0]
second_component = df_pca[:, 1]
plt.scatter(first_component, second_component, c=df['Cluster'], cmap='viridis')
plt.title('K-Means++ Clustering')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.show()

In [None]:
# Three principal components of the features (label column excluded).
pca = PCA(n_components=3)
feature_frame = df.drop(columns=['Cluster'])
df_pca = pca.fit_transform(feature_frame)

# 3D scatter, one point per row, coloured by cluster label.
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(111, projection='3d')
scatter = ax.scatter(
    df_pca[:, 0],
    df_pca[:, 1],
    df_pca[:, 2],
    c=df['Cluster'],
    cmap='viridis',
    marker='o',
)

# Title and axis labels.
ax.set_title('K-Means++ Clustering in 3D')
ax.set_xlabel('PCA Component 1')
ax.set_ylabel('PCA Component 2')
ax.set_zlabel('PCA Component 3')

# The colour bar doubles as the legend for the cluster labels.
cbar = plt.colorbar(scatter, ax=ax, pad=0.1, orientation='vertical')
cbar.set_label('Cluster Labels')

plt.show()

In [None]:
# Copy the labels onto the reference frame (aligned on index) and print the
# first rows of every cluster.
df_reference['Cluster'] = df['Cluster']

preview_columns = ['Title', 'Year', 'Age', 'Rotten Tomatoes']
for cluster in range(k):
    in_cluster = df_reference['Cluster'] == cluster
    members = df_reference[in_cluster][preview_columns]
    print(f"\nCluster {cluster}")
    print(members.head())

In [None]:
def recommend_movies(movie_title, n_recommendations=5):
    """Recommend movies that share a cluster with ``movie_title``.

    Args:
        movie_title: Title to look up in ``df_reference['Title']``; matched
            case-insensitively.
        n_recommendations: Maximum number of rows to return.

    Returns:
        The string ``"Movie not found in database."`` when no row has that
        title. Otherwise a DataFrame with the title, platform, tags, year,
        age and score columns of up to ``n_recommendations`` other movies in
        the same cluster, one row per title. The frame is empty when the
        movie has no cluster label.
    """
    result_columns = ['Title', 'Netflix', 'Hulu', 'Prime Video', 'Disney+', 'tags', 'Year', 'Age', 'Rotten Tomatoes']

    # Case-insensitive lookup, consistent with the KNN helpers in this file.
    is_query = df_reference['Title'].str.lower() == movie_title.lower()
    if not is_query.any():
        return "Movie not found in database."
    # print("Reccs for: ", df_reference.loc[df_reference['Title'] == movie_title])
    print("Reccs for: ", movie_title)

    # Rows missing from the clustered frame have a NaN label after the
    # index-aligned assignment; NaN never compares equal, so use the first
    # real label and return an explicitly empty result if there is none.
    known_labels = df_reference.loc[is_query, 'Cluster'].dropna()
    if known_labels.empty:
        return df_reference.iloc[0:0][result_columns]
    cluster_label = known_labels.iloc[0]

    # Get other movies in the same cluster
    same_cluster = (df_reference['Cluster'] == cluster_label) & ~is_query
    recommendations = df_reference[same_cluster].drop_duplicates(subset='Title')

    # Limit to n recommendations
    return recommendations[result_columns].head(n_recommendations)

In [None]:
# Example: ask for up to 15 same-cluster recommendations for one title.
recommend_movies('Guardians of the Galaxy', 15)