Procure and Analyze Data

Import Data

In [31]:
pip install implicit

Collecting implicit
  Downloading implicit-0.7.2-cp311-cp311-macosx_10_9_x86_64.whl.metadata (6.1 kB)
Downloading implicit-0.7.2-cp311-cp311-macosx_10_9_x86_64.whl (811 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m812.0/812.0 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: implicit
Successfully installed implicit-0.7.2
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd 
import numpy as np
import ast
import warnings
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from implicit.als import AlternatingLeastSquares

# Suppress all warnings
warnings.filterwarnings("ignore")

In [2]:
ratings = pd.read_csv('/Users/madelinehuynh/Downloads/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [3]:
movies_metadata = pd.read_csv('/Users/madelinehuynh/Downloads/movies_metadata.csv')
movies_metadata.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0


In [4]:
keywords = pd.read_csv('/Users/madelinehuynh/Downloads/keywords.csv')
keywords.head()

Unnamed: 0,id,keywords
0,862,"[{'id': 931, 'name': 'jealousy'}, {'id': 4290,..."
1,8844,"[{'id': 10090, 'name': 'board game'}, {'id': 1..."
2,15602,"[{'id': 1495, 'name': 'fishing'}, {'id': 12392..."
3,31357,"[{'id': 818, 'name': 'based on novel'}, {'id':..."
4,11862,"[{'id': 1009, 'name': 'baby'}, {'id': 1599, 'n..."


In [5]:
credits = pd.read_csv('/Users/madelinehuynh/Downloads/credits.csv')
credits.head()

Unnamed: 0,cast,crew,id
0,"[{'cast_id': 14, 'character': 'Woody (voice)',...","[{'credit_id': '52fe4284c3a36847f8024f49', 'de...",862
1,"[{'cast_id': 1, 'character': 'Alan Parrish', '...","[{'credit_id': '52fe44bfc3a36847f80a7cd1', 'de...",8844
2,"[{'cast_id': 2, 'character': 'Max Goldman', 'c...","[{'credit_id': '52fe466a9251416c75077a89', 'de...",15602
3,"[{'cast_id': 1, 'character': ""Savannah 'Vannah...","[{'credit_id': '52fe44779251416c91011acb', 'de...",31357
4,"[{'cast_id': 1, 'character': 'George Banks', '...","[{'credit_id': '52fe44959251416c75039ed7', 'de...",11862


Data Cleaning and Feature Engineering

In [6]:
# Check for NaNs in ratings dataset
print('ratings', ratings.isnull().sum())

ratings userId       0
movieId      0
rating       0
timestamp    0
dtype: int64


In [7]:
# Check for NaNs in movies_metadata dataset
print('movies metadata', movies_metadata.isnull().sum())

movies metadata adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
id                           0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64


In [8]:
# Check for NaNs in keywords dataset
print('keywords', keywords.isnull().sum())

keywords id          0
keywords    0
dtype: int64


In [9]:
# Check for NaNs in credits dataset
print('credits', credits.isnull().sum())

credits cast    0
crew    0
id      0
dtype: int64


Only the movies metadata dataset has noticeable amount of NAs.

In [10]:
# check length of movies metadata set
print(len(movies_metadata))

45466


In [11]:
#Select relevant columns for recommender model
selected_columns = ['id', 'original_title', 'genres', 'overview', 'release_date', 'popularity', 'vote_average', 'vote_count', 'revenue', 'budget']
movies_metadata_subset = movies_metadata[selected_columns]
movies_metadata_subset.head()

Unnamed: 0,id,original_title,genres,overview,release_date,popularity,vote_average,vote_count,revenue,budget
0,862,Toy Story,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...","Led by Woody, Andy's toys live happily in his ...",1995-10-30,21.946943,7.7,5415.0,373554033.0,30000000
1,8844,Jumanji,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",When siblings Judy and Peter discover an encha...,1995-12-15,17.015539,6.9,2413.0,262797249.0,65000000
2,15602,Grumpier Old Men,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",A family wedding reignites the ancient feud be...,1995-12-22,11.7129,6.5,92.0,0.0,0
3,31357,Waiting to Exhale,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...","Cheated on, mistreated and stepped on, the wom...",1995-12-22,3.859495,6.1,34.0,81452156.0,16000000
4,11862,Father of the Bride Part II,"[{'id': 35, 'name': 'Comedy'}]",Just when George Banks has recovered from his ...,1995-02-10,8.387519,5.7,173.0,76578911.0,0


In [12]:
movies_metadata_subset.isna().sum()

id                  0
original_title      0
genres              0
overview          954
release_date       87
popularity          5
vote_average        6
vote_count          6
revenue             6
budget              0
dtype: int64

In [13]:
movies_metadata_subset.dtypes

id                 object
original_title     object
genres             object
overview           object
release_date       object
popularity         object
vote_average      float64
vote_count        float64
revenue           float64
budget             object
dtype: object

In [14]:
# Convert 'release_date' column to datetime objects
movies_metadata_subset['release_date'] = pd.to_datetime(movies_metadata_subset['release_date'], errors = 'coerce')

# Convert 'popularity' column to numeric datatype
movies_metadata_subset['popularity'] = pd.to_numeric(movies_metadata_subset['popularity'], errors = 'coerce')

# Perform mean imputation for numerical columns
numerical_columns = ["popularity", "vote_average", "vote_count", "revenue"]
movies_metadata_subset[numerical_columns] = movies_metadata_subset[numerical_columns].fillna(movies_metadata_subset[numerical_columns].mean())

# Mode imputation for categorical columns
categorical_columns = ["release_date"]
for col in categorical_columns:
    movies_metadata_subset[col] = movies_metadata_subset[col].fillna(movies_metadata_subset[col].mode()[0])

# Fill NAs for overview description with not available
movies_metadata_subset['overview'] = movies_metadata_subset['overview'].fillna('Not available')

# Print the result
print(movies_metadata_subset.isna().sum())

id                0
original_title    0
genres            0
overview          0
release_date      0
popularity        0
vote_average      0
vote_count        0
revenue           0
budget            0
dtype: int64


In [15]:
# Convert timestamp to datetime 
# Convert the 'timestamp' column to datetime format
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'])

# Print the updated DataFrame
print(ratings.head())

   userId  movieId  rating                     timestamp
0       1      110     1.0 1970-01-01 00:00:01.425941529
1       1      147     4.5 1970-01-01 00:00:01.425942435
2       1      858     5.0 1970-01-01 00:00:01.425941523
3       1     1221     5.0 1970-01-01 00:00:01.425941546
4       1     1246     5.0 1970-01-01 00:00:01.425941556


In [16]:
# Parse the 'keywords' column to extract keywords
keywords['keywords'] = keywords['keywords'].apply(lambda x: [item['name'] for item in ast.literal_eval(x)])

# Print the updated DataFrame
print(keywords.head())

      id                                           keywords
0    862  [jealousy, toy, boy, friendship, friends, riva...
1   8844  [board game, disappearance, based on children'...
2  15602  [fishing, best friend, duringcreditsstinger, o...
3  31357  [based on novel, interracial relationship, sin...
4  11862  [baby, midlife crisis, confidence, aging, daug...


In [17]:
# Parse the 'cast' column to extract cast information
credits['cast'] = credits['cast'].apply(lambda x: [item['name'] for item in ast.literal_eval(x)])

# Parse the 'crew' column to extract crew information
credits['crew'] = credits['crew'].apply(lambda x: [item['name'] for item in ast.literal_eval(x)])

# Print the updated DataFrame
print(credits.head())

                                                cast  \
0  [Tom Hanks, Tim Allen, Don Rickles, Jim Varney...   
1  [Robin Williams, Jonathan Hyde, Kirsten Dunst,...   
2  [Walter Matthau, Jack Lemmon, Ann-Margret, Sop...   
3  [Whitney Houston, Angela Bassett, Loretta Devi...   
4  [Steve Martin, Diane Keaton, Martin Short, Kim...   

                                                crew     id  
0  [John Lasseter, Joss Whedon, Andrew Stanton, J...    862  
1  [Larry J. Franco, Jonathan Hensleigh, James Ho...   8844  
2  [Howard Deutch, Mark Steven Johnson, Mark Stev...  15602  
3  [Forest Whitaker, Ronald Bass, Ronald Bass, Ez...  31357  
4  [Alan Silvestri, Elliot Davis, Nancy Meyers, N...  11862  


Additional Feature Engineering

In [18]:
movies_metadata_subset.dtypes

id                        object
original_title            object
genres                    object
overview                  object
release_date      datetime64[ns]
popularity               float64
vote_average             float64
vote_count               float64
revenue                  float64
budget                    object
dtype: object

In [19]:
movies_metadata_subset['budget'] = pd.to_numeric(movies_metadata_subset['budget'], errors = 'coerce')

In [20]:
# Add a new feature 'profit' by subtracting 'budget' from 'revenue'
movies_metadata_subset['profit'] = movies_metadata_subset['revenue'] - movies_metadata_subset['budget']

# Add a new feature 'profit_margin' by dividing 'profit' by 'revenue'
movies_metadata_subset['profit_margin'] = movies_metadata_subset['profit'] / movies_metadata_subset['revenue']

In [21]:
# Get the number of keywords for keywords data, get the number of cast and crew for credits data

# Feature engineering transformation for keywords
keywords['num_keywords'] = keywords['keywords'].apply(lambda x: len(x))

# Feature engineering transformation for credits
credits['num_cast'] = credits['cast'].apply(lambda x: len(x))
credits['num_crew'] = credits['crew'].apply(lambda x: len(x)) 

In [22]:
# Ensure 'id' columns are integers
def clean_id_column(df, column_name):
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
    df.dropna(subset=[column_name], inplace=True)
    df[column_name] = df[column_name].astype(int)

# Clean 'id' columns in all datasets
clean_id_column(movies_metadata_subset, 'id')
clean_id_column(keywords, 'id')
clean_id_column(credits, 'id')

# Merge datasets (movies_metadata, keywords, credits)
combined_data = pd.merge(movies_metadata_subset, keywords, on='id', how='left')
combined_data = pd.merge(combined_data, credits, on='id', how='left')

# Convert 'release_date' to datetime and extract year, month, and day
combined_data['release_date'] = pd.to_datetime(combined_data['release_date'], errors='coerce')
combined_data['release_year'] = combined_data['release_date'].dt.year.fillna(0).astype(int)
combined_data['release_month'] = combined_data['release_date'].dt.month.fillna(0).astype(int)
combined_data['release_day'] = combined_data['release_date'].dt.day.fillna(0).astype(int)

# Convert 'budget' and 'popularity' to numeric
combined_data['budget'] = pd.to_numeric(combined_data['budget'], errors='coerce').fillna(0)
combined_data['popularity'] = pd.to_numeric(combined_data['popularity'], errors='coerce').fillna(0)

# Encode genres
def parse_genres(x):
    try:
        return ', '.join([d['name'] for d in literal_eval(x)])
    except (ValueError, SyntaxError, TypeError):
        return ''

combined_data['genres'] = combined_data['genres'].apply(parse_genres)
combined_data['genres_encoded'] = combined_data['genres'].astype('category').cat.codes


In [23]:
combined_data.head()

Unnamed: 0,id,original_title,genres,overview,release_date,popularity,vote_average,vote_count,revenue,budget,...,keywords,num_keywords,cast,crew,num_cast,num_crew,release_year,release_month,release_day,genres_encoded
0,862,Toy Story,"Animation, Comedy, Family","Led by Woody, Andy's toys live happily in his ...",1995-10-30,21.946943,7.7,5415.0,373554033.0,30000000.0,...,"[jealousy, toy, boy, friendship, friends, riva...",9.0,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...","[John Lasseter, Joss Whedon, Andrew Stanton, J...",13.0,106.0,1995,10,30,1079
1,8844,Jumanji,"Adventure, Fantasy, Family",When siblings Judy and Peter discover an encha...,1995-12-15,17.015539,6.9,2413.0,262797249.0,65000000.0,...,"[board game, disappearance, based on children'...",6.0,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...","[Larry J. Franco, Jonathan Hensleigh, James Ho...",26.0,16.0,1995,12,15,943
2,15602,Grumpier Old Men,"Romance, Comedy",A family wedding reignites the ancient feud be...,1995-12-22,11.7129,6.5,92.0,0.0,0.0,...,"[fishing, best friend, duringcreditsstinger, o...",4.0,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...","[Howard Deutch, Mark Steven Johnson, Mark Stev...",7.0,4.0,1995,12,22,3294
3,31357,Waiting to Exhale,"Comedy, Drama, Romance","Cheated on, mistreated and stepped on, the wom...",1995-12-22,3.859495,6.1,34.0,81452156.0,16000000.0,...,"[based on novel, interracial relationship, sin...",5.0,"[Whitney Houston, Angela Bassett, Loretta Devi...","[Forest Whitaker, Ronald Bass, Ronald Bass, Ez...",10.0,10.0,1995,12,22,1388
4,11862,Father of the Bride Part II,Comedy,Just when George Banks has recovered from his ...,1995-02-10,8.387519,5.7,173.0,76578911.0,0.0,...,"[baby, midlife crisis, confidence, aging, daug...",9.0,"[Steve Martin, Diane Keaton, Martin Short, Kim...","[Alan Silvestri, Elliot Davis, Nancy Meyers, N...",12.0,7.0,1995,2,10,1229


In [24]:
combined_data.columns

Index(['id', 'original_title', 'genres', 'overview', 'release_date',
       'popularity', 'vote_average', 'vote_count', 'revenue', 'budget',
       'profit', 'profit_margin', 'keywords', 'num_keywords', 'cast', 'crew',
       'num_cast', 'num_crew', 'release_year', 'release_month', 'release_day',
       'genres_encoded'],
      dtype='object')

In [25]:
combined_data.dtypes

id                         int64
original_title            object
genres                    object
overview                  object
release_date      datetime64[ns]
popularity               float64
vote_average             float64
vote_count               float64
revenue                  float64
budget                   float64
profit                   float64
profit_margin            float64
keywords                  object
num_keywords             float64
cast                      object
crew                      object
num_cast                 float64
num_crew                 float64
release_year               int64
release_month              int64
release_day                int64
genres_encoded             int16
dtype: object

Proposed Model #1: content based filtering based on textual similarities (overview, genre, keywords)

In [26]:
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['revenue', 'budget', 'popularity', 'vote_average', 'vote_count', 'num_cast', 'num_keywords']
combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])

# Combine text features
combined_data['combined_features'] = combined_data['overview'].fillna('') + " " + combined_data['genres'] + " " + combined_data['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Vectorize text data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(combined_data['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = combined_data[combined_data['original_title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return combined_data['original_title'].iloc[movie_indices]

# Example usage
print(get_recommendations('Toy Story'))

15444                                    Toy Story 3
3012                                     Toy Story 2
1823                                  Small Soldiers
24671                                      Small Fry
25849    Silent Night, Deadly Night 5: The Toy Maker
37875                                          玩具修理者
10349                         The 40 Year Old Virgin
7593                                           Dolls
2154                                            Toys
1896                                  Child's Play 3
Name: original_title, dtype: object


In [28]:
print(get_recommendations('Money Train'))

13864            The Taking of Pelham 1 2 3
8653                           The Incident
3280     The Taking of Pelham One Two Three
45894                       New York Subway
19528                            Stag Night
13537                   Adrift in Manhattan
334                 While You Were Sleeping
12152                      We Own the Night
11751                                  TMNT
41647                              Marathon
Name: original_title, dtype: object


In [28]:
# Function to get movie recommendations with similarity scores
def get_recommendations_with_scores(title, combined_data):
    if title not in combined_data['original_title'].values:
        return f"Movie '{title}' not found in the dataset."
    
    idx = combined_data[combined_data['original_title'] == title].index[0]
    tfidf = TfidfVectorizer(stop_words='english')
    tfidf_matrix = tfidf.fit_transform(combined_data['combined_features'])

    cosine_sim = cosine_similarity(tfidf_matrix[idx], tfidf_matrix).flatten()
    sim_scores = sorted(list(enumerate(cosine_sim)), key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the movie itself
    
    recommendations = [(combined_data['original_title'].iloc[i[0]], i[1]) for i in sim_scores]
    return recommendations

# Example usage
recommendations_with_scores = get_recommendations_with_scores('Toy Story', combined_data)
for movie, score in recommendations_with_scores:
    print(f"{movie}: {score:.4f}")

Toy Story 3: 0.5042
Toy Story 2: 0.4810
Small Soldiers: 0.2875
Small Fry: 0.2788
Silent Night, Deadly Night 5: The Toy Maker: 0.2716
玩具修理者: 0.2668
The 40 Year Old Virgin: 0.2430
Dolls: 0.2255
Toys: 0.2193
Child's Play 3: 0.2168


In [29]:
recommendations_with_scores = get_recommendations_with_scores('Money Train', combined_data)
for movie, score in recommendations_with_scores:
    print(f"{movie}: {score:.4f}")

The Taking of Pelham 1 2 3: 0.4894
The Incident: 0.4214
The Taking of Pelham One Two Three: 0.3911
New York Subway: 0.3394
Stag Night: 0.3295
Adrift in Manhattan: 0.2907
While You Were Sleeping: 0.2792
We Own the Night: 0.2646
TMNT: 0.2564
Marathon: 0.2491


Proposed Model #2: collaborative filtering using ratings for each movie id (user item interaction)

In [26]:
ratings.dtypes

userId                int64
movieId               int64
rating              float64
timestamp    datetime64[ns]
dtype: object

In [42]:
# Take the first X entries from combined_data
sample_combined_data = combined_data.head(1000)

# Ensure the ratings dataset only includes ratings for the sampled movies
sample_movie_ids = sample_combined_data['id'].tolist()
sample_ratings = ratings[ratings['movieId'].isin(sample_movie_ids)]

# Check for duplicates (if any)
sample_ratings.drop_duplicates(subset=['userId', 'movieId'], inplace=True)

# Print the shapes of the sampled datasets to verify
print("Sampled combined_data shape:", sample_combined_data.shape)
print("Sampled ratings shape:", sample_ratings.shape)

Sampled combined_data shape: (1000, 23)
Sampled ratings shape: (925598, 4)


In [27]:
# Create the user-item matrix for the sampled ratings
ratings_matrix = sample_ratings.pivot(index='userId', columns='movieId', values='rating')

# Fill missing values with 0 (assuming missing values mean the user hasn't rated the movie)
ratings_matrix.fillna(0, inplace=True)

# Normalize ratings by subtracting the mean rating of each user
ratings_matrix_normalized = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)

# Convert the normalized ratings matrix to a sparse matrix
ratings_sparse_matrix = csr_matrix(ratings_matrix_normalized.values)

# Function to compute cosine similarity in batches
def compute_cosine_similarity_in_batches(matrix, batch_size=1000):
    n_items = matrix.shape[1]
    sim_matrix = np.zeros((n_items, n_items))

    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch_matrix = matrix[:, start:end]
        batch_sim_matrix = cosine_similarity(batch_matrix.T)
        sim_matrix[start:end, :] = batch_sim_matrix

    return sim_matrix

# Compute cosine similarity in batches for the sampled data
cosine_sim_ratings = compute_cosine_similarity_in_batches(ratings_sparse_matrix)

# Function to get collaborative filtering movie recommendations based on ratings
def get_collaborative_recommendations(title, combined_data=sample_combined_data, cosine_sim_ratings=cosine_sim_ratings):
    if title not in combined_data['original_title'].values:
        return f"Movie '{title}' not found in the sampled data."
    
    movie_id = combined_data[combined_data['original_title'] == title]['id'].values[0]
    idx = ratings_matrix.columns.get_loc(movie_id)
    sim_scores = list(enumerate(cosine_sim_ratings[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the movie itself
    
    recommendations = [(combined_data['original_title'].iloc[i[0]], i[1]) for i in sim_scores]
    return recommendations

# Example usage for collaborative filtering recommendations on the sampled data
collaborative_recommendations = get_collaborative_recommendations('Toy Story')

print("Collaborative Filtering Recommendations:")
for movie, score in collaborative_recommendations:
    print(f"{movie}: {score:.4f}")

Collaborative Filtering Recommendations:
Pocahontas: 0.4528
Just Cause: 0.4513
A Kid in King Arthur's Court: 0.4513
Mi Vida Loca: 0.4511
Kiss of Death: 0.4504
Legends of the Fall: 0.4503
Love Affair: 0.4501
Little Women: 0.4501
L'Enfer: 0.4500
Star Wars: 0.4498


Proposed Model #3: hybrid approach (content based and collaborative filtering)

In [30]:
# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['revenue', 'budget', 'popularity', 'vote_average', 'vote_count', 'num_cast', 'num_keywords']
sample_combined_data[numerical_features] = scaler.fit_transform(sample_combined_data[numerical_features])

# Combine text features
sample_combined_data['combined_features'] = sample_combined_data['overview'].fillna('') + " " + sample_combined_data['genres'] + " " + sample_combined_data['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Vectorize text data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(sample_combined_data['combined_features'])

# Compute the cosine similarity matrix for content-based filtering
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Create the user-item matrix for the sampled ratings
ratings_matrix = sample_ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_matrix.fillna(0, inplace=True)

# Normalize ratings by subtracting the mean rating of each user
ratings_matrix_normalized = ratings_matrix.sub(ratings_matrix.mean(axis=1), axis=0)

# Convert the normalized ratings matrix to a sparse matrix
ratings_sparse_matrix = csr_matrix(ratings_matrix_normalized.values)

def compute_cosine_similarity_in_batches(matrix, batch_size=100):
    n_items = matrix.shape[1]
    sim_matrix = np.zeros((n_items, n_items))

    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch_matrix = matrix[:, start:end]
        batch_sim_matrix = cosine_similarity(batch_matrix.T)
        
        sim_matrix[start:end, start:end] = batch_sim_matrix

        for inner_start in range(0, start, batch_size):
            inner_end = min(inner_start + batch_size, n_items)
            inner_batch_matrix = matrix[:, inner_start:inner_end]
            cross_sim_matrix = cosine_similarity(batch_matrix.T, inner_batch_matrix.T)
            sim_matrix[start:end, inner_start:inner_end] = cross_sim_matrix
            sim_matrix[inner_start:inner_end, start:end] = cross_sim_matrix.T

    return sim_matrix

# Compute cosine similarity in batches for the sampled data
cosine_sim_ratings = compute_cosine_similarity_in_batches(ratings_sparse_matrix)

# Ensure the dimensions of cosine similarity matrices match
min_dim = min(cosine_sim_content.shape[0], ratings_matrix.shape[1])
cosine_sim_content = cosine_sim_content[:min_dim, :min_dim]
cosine_sim_ratings = cosine_sim_ratings[:min_dim, :min_dim]

# Normalize the similarity matrices
cosine_sim_content_normalized = cosine_sim_content / np.max(cosine_sim_content)
cosine_sim_ratings_normalized = cosine_sim_ratings / np.max(cosine_sim_ratings)

alpha = 0.5  # Weight for content-based filtering
beta = 1 - alpha  # Weight for collaborative filtering

combined_sim = alpha * cosine_sim_content_normalized + beta * cosine_sim_ratings_normalized

# Function to get hybrid recommendations
def get_hybrid_recommendations(title, combined_data=sample_combined_data, combined_sim=combined_sim):
    if title not in combined_data['original_title'].values:
        return f"Movie '{title}' not found in the data."
    
    idx = combined_data[combined_data['original_title'] == title].index[0]
    sim_scores = list(enumerate(combined_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]  # Exclude the movie itself
    
    recommendations = [(combined_data['original_title'].iloc[i[0]], i[1]) for i in sim_scores]
    return recommendations

# Example usage for hybrid recommendations on the data
hybrid_recommendations = get_hybrid_recommendations('Toy Story')

print("Hybrid Recommendations:")
for movie, score in hybrid_recommendations:
    print(f"{movie}: {score:.4f}")

Hybrid Recommendations:
Heat: 0.1763
Dunston Checks In: 0.1478
The Indian in the Cupboard: 0.1078
Grumpier Old Men: 0.1058
Othello: 0.0966
Jumanji: 0.0960
From Dusk Till Dawn: 0.0875
Dangerous Minds: 0.0666
Twelve Monkeys: 0.0665
Four Rooms: 0.0637


Underfitting/Overfitting Checks w/ Proposed Models

In [29]:
#for content based filtering
# Reduce data size for testing
sample_size = 20000
sample_combined_data = combined_data.sample(n=sample_size, random_state=42)

# Split the data
train_data, test_data = train_test_split(sample_combined_data, test_size=0.2, random_state=42)

# Vectorize text data for training set
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix_train = tfidf.fit_transform(train_data['combined_features'])

# Compute cosine similarity matrix for training set in chunks
def compute_cosine_similarity_in_chunks(matrix, chunk_size=100):
    n = matrix.shape[0]
    cosine_sim_matrix = np.zeros((n, n))
    for start in range(0, n, chunk_size):
        end = min(start + chunk_size, n)
        chunk_sim = cosine_similarity(matrix[start:end], matrix)
        cosine_sim_matrix[start:end] = chunk_sim
    return cosine_sim_matrix

cosine_sim_train = compute_cosine_similarity_in_chunks(tfidf_matrix_train)

# Function to get movie recommendations
def get_recommendations(title, combined_data=train_data, cosine_sim=cosine_sim_train, top_n=10):
    try:
        idx = combined_data[combined_data['original_title'] == title].index[0]
        sim_scores = list(enumerate(cosine_sim[idx]))
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
        sim_scores = sim_scores[1:top_n + 1]
        movie_indices = [i[0] for i in sim_scores]
        return combined_data['original_title'].iloc[movie_indices]
    except IndexError:
        return []

# Evaluate on test set in batches
def evaluate_model(test_data, combined_data=train_data, cosine_sim=cosine_sim_train, batch_size=100):
    predictions = []
    n = len(test_data)
    
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        batch = test_data.iloc[start:end]
        
        for idx, row in batch.iterrows():
            title = row['original_title']
            true_ratings = test_data[test_data['original_title'] == title]['vote_average'].values
            pred_titles = get_recommendations(title)
            if len(pred_titles) == 0:
                continue
            pred_ratings = train_data[train_data['original_title'].isin(pred_titles)]['vote_average'].values
            if len(pred_ratings) == 0:
                continue
            predictions.append((true_ratings.mean(), pred_ratings.mean()))
    
    if not predictions:
        return float('nan'), float('nan')
    
    true_values, pred_values = zip(*predictions)
    return mean_squared_error(true_values, pred_values), mean_absolute_error(true_values, pred_values)

mse, mae = evaluate_model(test_data)
print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")

Mean Squared Error: 0.7860080135576667
Mean Absolute Error: 0.6454377999287542


In [32]:
def evaluate_model_train(train_data, combined_data=train_data, cosine_sim=cosine_sim_train, batch_size=100):
    predictions = []
    n = len(train_data)
    
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        batch = train_data.iloc[start:end]
        
        for idx, row in batch.iterrows():
            title = row['original_title']
            true_ratings = train_data[train_data['original_title'] == title]['vote_average'].values
            pred_titles = get_recommendations(title)
            if len(pred_titles) == 0:
                continue
            pred_ratings = train_data[train_data['original_title'].isin(pred_titles)]['vote_average'].values
            if len(pred_ratings) == 0:
                continue
            predictions.append((true_ratings.mean(), pred_ratings.mean()))
    
    # Filter out NaN values
    predictions = [(true, pred) for true, pred in predictions if not np.isnan(pred)]
    
    if not predictions:
        return float('nan'), float('nan')
    
    true_values, pred_values = zip(*predictions)
    return mean_squared_error(true_values, pred_values), mean_absolute_error(true_values, pred_values)

# Calculate MSE and MAE for the training data
mse_train, mae_train = evaluate_model_train(train_data)
print(f"Mean Squared Error (Training): {mse_train}")
print(f"Mean Absolute Error (Training): {mae_train}")


Mean Squared Error (Training): 0.7517367636940092
Mean Absolute Error (Training): 0.6620983245054448


In [26]:
def compute_cosine_similarity_in_batches(matrix, batch_size=100):
    n_items = matrix.shape[1]
    sim_matrix = np.zeros((n_items, n_items))

    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        print(f"Processing batch from {start} to {end}")

        batch_matrix = matrix[:, start:end].toarray()  # Convert only the batch to dense
        batch_sim_matrix = cosine_similarity(batch_matrix.T)

        # Place the batch similarity in the correct position in the full similarity matrix
        sim_matrix[start:end, start:end] = batch_sim_matrix

    return sim_matrix

# Limit the dataset by selecting a subset of users and movies
def limit_dataset(ratings, num_users=1000, num_movies=1000):
    top_users = ratings['userId'].value_counts().head(num_users).index
    top_movies = ratings['movieId'].value_counts().head(num_movies).index
    limited_ratings = ratings[ratings['userId'].isin(top_users) & ratings['movieId'].isin(top_movies)]
    return limited_ratings

# Limit the dataset
limited_ratings = limit_dataset(ratings, num_users=1000, num_movies=1000)

# Create a pivot table
ratings_matrix = limited_ratings.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Convert the ratings matrix to a sparse matrix format
ratings_sparse_matrix = csr_matrix(ratings_matrix.values)

# Compute cosine similarity in batches for the sampled data
cosine_sim_ratings = compute_cosine_similarity_in_batches(ratings_sparse_matrix)

# Function to get collaborative filtering movie recommendations based on ratings
def get_collaborative_recommendations(title, combined_data, cosine_sim_ratings, ratings_matrix):
    if title not in combined_data['original_title'].values:
        return f"Movie '{title}' not found in the sampled data."
    movie_id = combined_data[combined_data['original_title'] == title]['id'].values[0]
    idx = ratings_matrix.columns.get_loc(movie_id)
    sim_scores = list(enumerate(cosine_sim_ratings[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [ratings_matrix.columns[i[0]] for i in sim_scores]
    return combined_data[combined_data['id'].isin(movie_indices)]['original_title']

# Step 1: Split the data
ratings_matrix = ratings_matrix.astype(np.float32)
X_train, X_test = train_test_split(ratings_matrix, test_size=0.2, random_state=42)

# Step 2: Calculate user-user similarities
def compute_user_similarity(matrix):
    return cosine_similarity(matrix)

train_similarities = compute_user_similarity(X_train)
test_similarities = compute_user_similarity(X_test)

# Function to predict ratings using collaborative filtering
def predict_ratings(ratings_matrix, similarities, k=20):
    pred = np.zeros(ratings_matrix.shape)
    for i in range(ratings_matrix.shape[0]):
        top_k_users = np.argsort(similarities[i])[-k:]
        sim_sum = np.sum(np.abs(similarities[i, top_k_users]))
        if sim_sum == 0:
            continue  # Skip users with no similarities
        for j in range(ratings_matrix.shape[1]):
            pred[i, j] = similarities[i, top_k_users].dot(ratings_matrix[top_k_users, j])
            pred[i, j] /= sim_sum
    return pred

# Step 3: Make predictions
train_preds = predict_ratings(X_train.values, train_similarities)
test_preds = predict_ratings(X_test.values, test_similarities)

# Step 4: Compute evaluation metrics
train_mse = mean_squared_error(X_train.values, train_preds)
train_mae = mean_absolute_error(X_train.values, train_preds)

test_mse = mean_squared_error(X_test.values, test_preds)
test_mae = mean_absolute_error(X_test.values, test_preds)

print(f"Training MSE: {train_mse}, MAE: {train_mae}")
print(f"Test MSE: {test_mse}, MAE: {test_mae}")


Processing batch from 0 to 100
Processing batch from 100 to 200
Processing batch from 200 to 300
Processing batch from 300 to 400
Processing batch from 400 to 500
Processing batch from 500 to 600
Processing batch from 600 to 700
Processing batch from 700 to 800
Processing batch from 800 to 900
Processing batch from 900 to 1000
Training MSE: 2.3326603003281, MAE: 1.1929343495833729
Test MSE: 2.2405352118303297, MAE: 1.2267159297928134


In [1]:
#mse mae calculation for hybrid approach

sample_combined_data = combined_data.head(10000)  # Reduced from 20000 to 10000

# Ensure the ratings dataset only includes ratings for the sampled movies
sample_movie_ids = sample_combined_data['id'].tolist()
sample_ratings = ratings[ratings['movieId'].isin(sample_movie_ids)]

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['revenue', 'budget', 'popularity', 'vote_average', 'vote_count', 'num_cast', 'num_keywords']
sample_combined_data[numerical_features] = scaler.fit_transform(sample_combined_data[numerical_features])

# Combine text features
sample_combined_data['combined_features'] = (
    sample_combined_data['overview'].fillna('') + " " +
    sample_combined_data['genres'] + " " +
    sample_combined_data['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')
)

# Vectorize text data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(sample_combined_data['combined_features'])

# Compute content-based cosine similarity matrix
cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Check for duplicates in ratings
sample_ratings.drop_duplicates(subset=['userId', 'movieId'], inplace=True)

# Split the data into training and testing sets
train_data, test_data = train_test_split(sample_ratings, test_size=0.2, random_state=42)

# Create user-item matrices for training and testing sets
train_ratings_matrix = train_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)
test_ratings_matrix = test_data.pivot(index='userId', columns='movieId', values='rating').fillna(0)

# Normalize ratings by subtracting the mean rating of each user in the training set
train_ratings_matrix_normalized = train_ratings_matrix.sub(train_ratings_matrix.mean(axis=1), axis=0)

# Convert the normalized ratings matrices to sparse matrices
train_ratings_sparse_matrix = csr_matrix(train_ratings_matrix_normalized.values)

# Function to compute cosine similarity in batches
def compute_cosine_similarity_in_batches(matrix, batch_size=100):
    n_items = matrix.shape[1]
    sim_matrix = np.zeros((n_items, n_items))

    for start in range(0, n_items, batch_size):
        end = min(start + batch_size, n_items)
        batch_matrix = matrix[:, start:end].toarray()
        batch_sim_matrix = cosine_similarity(batch_matrix.T)
        
        sim_matrix[start:end, start:end] = batch_sim_matrix

        for inner_start in range(0, start, batch_size):
            inner_end = min(inner_start + batch_size, n_items)
            inner_batch_matrix = matrix[:, inner_start:inner_end].toarray()
            cross_sim_matrix = cosine_similarity(batch_matrix.T, inner_batch_matrix.T)
            sim_matrix[start:end, inner_start:inner_end] = cross_sim_matrix
            sim_matrix[inner_start:inner_end, start:end] = cross_sim_matrix.T

    return sim_matrix

# Compute cosine similarity in batches for the sampled data
cosine_sim_ratings = compute_cosine_similarity_in_batches(train_ratings_sparse_matrix, batch_size=50)  # Adjusted batch size

# Ensure the dimensions of cosine similarity matrices match
min_dim = min(cosine_sim_content.shape[0], train_ratings_matrix.shape[1])
cosine_sim_content = cosine_sim_content[:min_dim, :min_dim]
cosine_sim_ratings = cosine_sim_ratings[:min_dim, :min_dim]

# Normalize the similarity matrices
cosine_sim_content_normalized = cosine_sim_content / np.max(cosine_sim_content)
cosine_sim_ratings_normalized = cosine_sim_ratings / np.max(cosine_sim_ratings)

# Combine similarity matrices with weights
alpha = 0.5  # Weight for content-based filtering
beta = 1 - alpha  # Weight for collaborative filtering
combined_sim = alpha * cosine_sim_content_normalized + beta * cosine_sim_ratings_normalized

# Predict ratings for the test set
def predict_ratings_hybrid(user_id, movie_id, combined_sim, train_ratings_matrix):
    if movie_id not in train_ratings_matrix.columns:
        return 0  # Default rating if movie not in training set
    movie_idx = train_ratings_matrix.columns.get_loc(movie_id)
    user_idx = train_ratings_matrix.index.get_loc(user_id)
    sim_scores = combined_sim[movie_idx]
    user_ratings = train_ratings_matrix.iloc[user_idx]
    weighted_sum = np.dot(sim_scores, user_ratings)
    sum_of_weights = np.sum(sim_scores)
    if sum_of_weights == 0:
        return 0  # Avoid division by zero
    predicted_rating = weighted_sum / sum_of_weights
    return predicted_rating

# Filter test data to include only movies in the training set
train_movie_ids = set(train_ratings_matrix.columns)
test_data_filtered = test_data[test_data['movieId'].isin(train_movie_ids)]

# Apply predictions only to the filtered test data
test_data_filtered['predicted_rating'] = test_data_filtered.apply(
    lambda x: predict_ratings_hybrid(x['userId'], x['movieId'], combined_sim, train_ratings_matrix),
    axis=1
)

# Compute MSE and MAE using the filtered test data
mse = mean_squared_error(test_data_filtered['rating'], test_data_filtered['predicted_rating'])
mae = mean_absolute_error(test_data_filtered['rating'], test_data_filtered['predicted_rating'])

# print("MSE for hybrid recommendations:", mse)
# print("MAE for hybrid recommendations:", mae)

Collaborative Filtering Model w/ Regularization

- Content based does not have regularization like collaborative filtering

In [41]:
#trying on downsized sample

# Normalize numerical features
scaler = StandardScaler()
numerical_features = ['revenue', 'budget', 'popularity', 'vote_average', 'vote_count', 'num_cast', 'num_keywords']
sample_combined_data[numerical_features] = scaler.fit_transform(sample_combined_data[numerical_features])

# Combine text features
sample_combined_data['combined_features'] = sample_combined_data['overview'].fillna('') + " " + sample_combined_data['genres'] + " " + sample_combined_data['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Vectorize text data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(sample_combined_data['combined_features'])

cosine_sim_content = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Concatenate numerical features and TF-IDF features
numerical_matrix = sample_combined_data[numerical_features].values
combined_matrix = np.hstack([numerical_matrix, tfidf_matrix.toarray()])

# Create the user-item matrix for the ratings
ratings_matrix = sample_ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_matrix.fillna(0, inplace=True)

# Convert ratings_matrix to a sparse matrix
sparse_ratings_matrix = csr_matrix(ratings_matrix.values)

# Train ALS model with regularization
als_model = AlternatingLeastSquares(factors=100, regularization=0.1)
als_model.fit(sparse_ratings_matrix)

# Get item factors and user factors
item_factors = als_model.item_factors
user_factors = als_model.user_factors

# Compute similarity between items using item factors
item_similarity_matrix = cosine_similarity(item_factors)

# Function to get item recommendations for a given item
def get_item_recommendations(item_id, item_similarity_matrix=item_similarity_matrix, combined_data=sample_combined_data):
    if item_id not in sample_combined_data['id'].values:
        return f"Item '{item_id}' not found in the data."
    
    item_idx = sample_combined_data[sample_combined_data['id'] == item_id].index[0]
    
    # Filter out invalid entries from the similarity matrix
    sim_scores = [(idx, score) for idx, score in enumerate(item_similarity_matrix[item_idx]) if not np.isnan(score)]
    
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    recommended_items = []
    for similar_item_idx, score in sim_scores:
        if similar_item_idx != item_idx:
            recommended_items.append((sample_combined_data['original_title'].iloc[similar_item_idx], score))
            if len(recommended_items) >= 10:
                break
    
    return recommended_items

# Example usage for item recommendations
item_id = 862 # Replace with an actual item ID from your data
item_recommendations = get_item_recommendations(item_id)

print("Item Recommendations:")
for item in item_recommendations:
    print(f"{item}")

  0%|          | 0/15 [00:00<?, ?it/s]

Item Recommendations:
('A Little Princess', 0.107552685)
('Heat', 0.07591191)
('Dunston Checks In', 0.07097376)
('Legends of the Fall', 0.06286171)
('Exotica', 0.05749233)
('The Glass Shield', 0.056265943)
('Balto', 0.04396431)
('Bed of Roses', 0.03668877)
('Just Cause', 0.034602586)
("A Kid in King Arthur's Court", 0.03280032)


- Content based filtering so we can see what features of movies contribute to movie popularity
- Although collaborative filtering allows us to determine how ratings influence movie popularity, content based also contains popularity and voting count

Accuracy for Content Based Filtering

In [26]:
scaler = StandardScaler()
numerical_features = ['revenue', 'budget', 'popularity', 'vote_average', 'vote_count', 'num_cast', 'num_keywords']
combined_data[numerical_features] = scaler.fit_transform(combined_data[numerical_features])

# Combine text features
combined_data['combined_features'] = combined_data['overview'].fillna('') + " " + combined_data['genres'] + " " + combined_data['keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else '')

# Vectorize text data
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(combined_data['combined_features'])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Function to get movie recommendations
def get_recommendations(title, cosine_sim=cosine_sim):
    idx = combined_data[combined_data['original_title'] == title].index[0]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    movie_indices = [i[0] for i in sim_scores]
    return combined_data['original_title'].iloc[movie_indices]

def intra_similarity(recommended_movies, cosine_sim):
    # Calculate cosine similarity matrix for the recommended movies
    recommended_indices = [combined_data[combined_data['original_title'] == movie].index[0] for movie in recommended_movies]
    recommended_cosine_sim = cosine_sim[recommended_indices][:, recommended_indices]
    
    # Compute average similarity among recommended movies
    avg_similarity = np.mean(recommended_cosine_sim[np.triu_indices(len(recommended_movies), k=1)])
    
    return avg_similarity

# Example usage
recommended_movies = get_recommendations('Toy Story')
accuracy = intra_similarity(recommended_movies, cosine_sim)
print("Accuracy Score (Intra-similarity):", accuracy)

Accuracy Score (Intra-similarity): 0.24958866394386076


In [27]:
# Define a simple baseline method (e.g., recommending the most popular movies)
def baseline_recommendation():
    # Example: Recommend the top 10 most popular movies
    popular_movies = combined_data['original_title'].value_counts().head(10).index.tolist()
    return popular_movies

# Evaluate the baseline method
baseline_recommendations = baseline_recommendation()
baseline_accuracy = intra_similarity(baseline_recommendations, cosine_sim)
print("Baseline Accuracy Score (Intra-similarity):", baseline_accuracy)

# Evaluate the recommendation system
recommended_movies = get_recommendations('Toy Story')
system_accuracy = intra_similarity(recommended_movies, cosine_sim)
print("Recommendation System Accuracy Score (Intra-similarity):", system_accuracy)

Baseline Accuracy Score (Intra-similarity): 0.006864446227002278
Recommendation System Accuracy Score (Intra-similarity): 0.24958866394386076
