In [19]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Machine Learning - scikit-learn
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, mean_absolute_error, accuracy_score

# For recommendation systems specifically
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import TruncatedSVD  # for matrix factorization

# If using collaborative filtering
from scipy.sparse import csr_matrix
from scipy.sparse.linalg import svds

# Warnings control
import warnings
warnings.filterwarnings('ignore')

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
%matplotlib inline

### Load data

In [20]:
df = pd.read_csv("movies_data.csv")

### Data analysis

In [21]:
df.head()

Unnamed: 0.1,Unnamed: 0,title,release_date,original_language,popularity,vote_count,vote_average,overview
0,0,Our Fault,10/15/25,es,1096.6654,302,7.869,Jenna and Lion's wedding brings about the long...
1,1,Inside Furioza,10/14/25,pl,384.322,27,6.593,"In the wake of murder, new Furioza leader Gold..."
2,2,Captain Hook - The Cursed Tides,7/11/25,en,370.1091,15,5.0,In the aftermath of a devastating defeat by hi...
3,3,War of the Worlds,7/29/25,en,327.876,642,4.4,Will Radford is a top analyst for Homeland Sec...
4,4,The Conjuring: Last Rites,9/3/25,en,284.0186,1065,6.972,Paranormal investigators Ed and Lorraine Warre...


In [22]:
df.columns.tolist()

['Unnamed: 0',
 'title',
 'release_date',
 'original_language',
 'popularity',
 'vote_count',
 'vote_average',
 'overview']

In [31]:
### missing values

df['overview'].fillna('', inplace=True)

In [32]:
### clean the data

print(f"Before: {len(df)} movies")

# Remove duplicates (keep first occurrence)
df = df.drop_duplicates(subset='title', keep='first')

# After cleaning
print(f"After: {len(df)} movies")
print(f"Removed: {2415} duplicates")

Before: 10000 movies
After: 7585 movies
Removed: 2415 duplicates


In [33]:
df['overview'] = df['overview'].fillna('')



### TF-IDF Vectorization

In [34]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
tfidf_matrix = tfidf.fit_transform(df['overview'])

# Compute similarity
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)

### Recommendation Function

In [35]:
def get_recommendations(title, top_n=10):
    # Find movie index
    idx = df[df['title'] == title].index[0]
    
    # Get similarity scores
    sim_scores = list(enumerate(cosine_sim[idx]))
    
    # Sort by similarity
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N (skip first - it's the movie itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Return movie titles
    movie_indices = [i[0] for i in sim_scores]
    return df['title'].iloc[movie_indices]

### Test the process!!

In [37]:
print(get_recommendations('Vicious'))

7990                              Final Summer
8498                          Extraterrestrial
1977            FernGully: The Last Rainforest
8275                  Lumaaq: An Eskimo Legend
6214        Friday the 13th: The Final Chapter
576                          Scarlet Innocence
9226    Doraemon: Nobita and the Winged Braves
3058                        Fun and Fancy Free
6655                          Three Blind Mice
71                         The Ugly Stepsister
Name: title, dtype: object


In [None]:
### Save the Process

import joblib

# Save cleaned data
df.to_csv('movies_clean.csv', index=False, sep='|')

# Save similarity matrix
joblib.dump(cosine_sim, 'cosine_similarity.pkl')

# Save indices (for faster lookup)
indices = pd.Series(df.index, index=df['title'])
joblib.dump(indices, 'movie_indices.pkl')