# <span style="color:#cd0057; font-weight:bold">⬇️Step 1: Import Essential Libraries</span>
<hr>

In [1]:
import numpy as np              
import pandas as pd              
import matplotlib.pyplot as plt  
import seaborn as sns 

# <span style="color:#cd0057; font-weight:bold">🚚Step 2: Load Dataset</span>
<hr>

In [2]:
df = pd.read_csv('../data/top10K-TMDB-movies.csv')
df.head()

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,278,The Shawshank Redemption,"Drama,Crime",en,Framed in the 1940s for the double murder of h...,94.075,1994-09-23,8.7,21862
1,19404,Dilwale Dulhania Le Jayenge,"Comedy,Drama,Romance",hi,"Raj is a rich, carefree, happy-go-lucky second...",25.408,1995-10-19,8.7,3731
2,238,The Godfather,"Drama,Crime",en,"Spanning the years 1945 to 1955, a chronicle o...",90.585,1972-03-14,8.7,16280
3,424,Schindler's List,"Drama,History,War",en,The true story of how businessman Oskar Schind...,44.761,1993-12-15,8.6,12959
4,240,The Godfather: Part II,"Drama,Crime",en,In the continuing saga of the Corleone crime f...,57.749,1974-12-20,8.6,9811


# <span style="color:#cd0057; font-weight:bold">🩺Step03: Understand Data frame</span>
<hr>

In [3]:
df.shape

(10000, 9)

In [4]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

### <span style="color:#0076cd; font-weight:bold">Understand Missing values</span>

In [5]:
df.isnull().sum()        

id                    0
title                 0
genre                 3
original_language     0
overview             13
popularity            0
release_date          0
vote_average          0
vote_count            0
dtype: int64

### <span style="color:#0076cd; font-weight:bold">Find Duplicates</span>

In [6]:
df.duplicated().sum()         

0

### <span style="color:#0076cd; font-weight:bold">Understand Object Data</span>

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 10000 non-null  int64  
 1   title              10000 non-null  object 
 2   genre              9997 non-null   object 
 3   original_language  10000 non-null  object 
 4   overview           9987 non-null   object 
 5   popularity         10000 non-null  float64
 6   release_date       10000 non-null  object 
 7   vote_average       10000 non-null  float64
 8   vote_count         10000 non-null  int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 703.3+ KB


In [8]:
df['genre'].value_counts()

genre
Comedy                                    744
Drama                                     611
Drama,Romance                             290
Comedy,Drama                              262
Comedy,Romance                            255
                                         ... 
Fantasy,Animation,Romance,Family            1
Drama,Thriller,Crime,Western                1
Comedy,Drama,Romance,Fantasy,Adventure      1
Drama,History,Action                        1
Adventure,Fantasy,Action,Drama              1
Name: count, Length: 2123, dtype: int64

# <span style="color:#cd0057; font-weight:bold">❓Step 5: Handle Missing Values</span>
<hr>

In [9]:
df.shape

(10000, 9)

In [10]:
df.dropna(inplace=True)
df.shape

(9985, 9)

# <span style="color:#cd0057; font-weight:bold">🛠️Step 6: Feature Eng</span>
<hr>

In [11]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')

overview_texts = df['overview'].fillna('').tolist()
vectors = model.encode(overview_texts, show_progress_bar=True)

Batches:   0%|          | 0/313 [00:00<?, ?it/s]

In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Define a named tokenizer function instead of using lambda
def custom_tokenizer(x):
    return x.split(', ')

# Use the named function in CountVectorizer
count_vectorizer = CountVectorizer(tokenizer=custom_tokenizer)
genre_matrix = count_vectorizer.fit_transform(df['genre'].fillna(''))



In [13]:
from sklearn.preprocessing import OneHotEncoder

lang_encoder = OneHotEncoder(sparse_output=False)
lang_matrix = lang_encoder.fit_transform(df[['original_language']])



In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
num_matrix = scaler.fit_transform(df[['vote_average', 'popularity', 'vote_count']])

In [15]:
import numpy as np
from scipy.sparse import hstack

# Combine all features (overview is sparse, so use hstack)
final_matrix = hstack([vectors, genre_matrix, lang_matrix, num_matrix])


In [16]:
df.columns

Index(['id', 'title', 'genre', 'original_language', 'overview', 'popularity',
       'release_date', 'vote_average', 'vote_count'],
      dtype='object')

In [17]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert once outside function if possible, to avoid repeated conversion
final_matrix_csr = final_matrix.tocsr()

def recommend_from_three_movies(input_titles):
    input_titles = [title.lower() for title in input_titles]
    movie_index = pd.Series(df.index, index=df['title'].str.lower())

    selected_indexes = [movie_index[title] for title in input_titles if title in movie_index]
    if len(selected_indexes) < 1:
        return "None of the movies were found."

    selected_vectors = final_matrix_csr[selected_indexes]

    avg_vector = selected_vectors.mean(axis=0)

    # Convert avg_vector to array for cosine similarity
    if hasattr(avg_vector, "toarray"):
        avg_vector = avg_vector.toarray()
    else:
        avg_vector = np.asarray(avg_vector)

    similarities = cosine_similarity(avg_vector, final_matrix_csr).flatten()

    similar_indices = similarities.argsort()[::-1]
    recommended_indices = [i for i in similar_indices if i not in selected_indexes][:5]

    return df['title'].iloc[recommended_indices]

# Example usage:
print(recommend_from_three_movies(['Inception', 'Interstellar', 'The Matrix']))


443     Guardians of the Galaxy
969                    Iron Man
33                   Fight Club
374            The Tomorrow War
1045           Edge of Tomorrow
Name: title, dtype: object


In [18]:
print(recommend_from_three_movies(['Moneyball', 'The Social Network', 'The Big Short']))

2975    Pirates of Silicon Valley
2336           The Great Debaters
897                The Blind Side
3409                 The Believer
33                     Fight Club
Name: title, dtype: object


In [None]:
import joblib
import scipy.sparse

# Save vectorizers and scalers
joblib.dump(count_vectorizer, 'count_vectorizer.pkl')
joblib.dump(lang_encoder, 'lang_encoder.pkl')
joblib.dump(scaler, 'minmax_scaler.pkl')

# Save overview embeddings (dense)
np.save('overview_vectors.npy', vectors)

# Save final_matrix (sparse)
scipy.sparse.save_npz('final_matrix.npz', final_matrix)

# Save the dataframe
df.to_csv('movies_df.csv', index=False)
