# REQUIREMENT #1


Requirement #1 

Steps we will take throughout this ipynb. 

1. Set up & clean the data
2. Build content-based recommender
3. Build collaborative filtering recommender
4. Combine them in a hybrid system
5. Test recommendations for a given show/user

# DATA IMPORT

In [1]:
# Step 1: Import libraries
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

# Don't forget to mount Google Drive if you haven't already:
# from google.colab import drive
# drive.mount('/content/drive')

df = pd.read_csv('movies_titles.csv')
df.head()

Unnamed: 0,show_id,type,title,director,cast,country,release_year,rating,duration,description,...,Language TV Shows,Musicals,Nature TV,Reality TV,Spirituality,TV Action,TV Comedies,TV Dramas,Talk Shows TV Comedies,Thrillers
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,"Michael Hilow, Ana Hoffman, Dick Johnson, Kirs...",United States,2020,PG-13,90 min,As her father nears the end of his life filmma...,...,0,0,0,0,0,0,0,0,0,0
1,s2,TV Show,Blood & Water,,Ama Qamata Khosi Ngema Gail Mabalane Thabang M...,South Africa,2021,TV-MA,2 Seasons,After crossing paths at a party a Cape Town te...,...,0,0,0,0,0,0,0,1,0,0
2,s3,TV Show,Ganglands,Julien Leclercq,Sami Bouajila Tracy Gotoas Samuel Jouy Nabiha ...,,2021,TV-MA,1 Season,To protect his family from a powerful drug lor...,...,0,0,0,0,0,1,0,0,0,0
3,s4,TV Show,Jailbirds New Orleans,,,,2021,TV-MA,1 Season,Feuds flirtations and toilet talk go down amon...,...,0,0,0,1,0,0,0,0,0,0
4,s5,TV Show,Kota Factory,,Mayur More Jitendra Kumar Ranjan Raj Alam Khan...,India,2021,TV-MA,2 Seasons,In a city of coaching centers known to train I...,...,0,0,0,0,0,0,1,0,0,0


# DATA UNDERSTANDING


In [2]:
print(df.shape)
df.isna().sum()

(8508, 42)


show_id                                                  0
type                                                     0
title                                                    0
director                                              2114
cast                                                   119
country                                                814
release_year                                             0
rating                                                   4
duration                                                 1
description                                              0
Action                                                   0
Adventure                                                0
Anime Series International TV Shows                      0
British TV Shows Docuseries International TV Shows       0
Children                                                 0
Comedies                                                 0
Comedies Dramas International Movies                    

# DATA PREPARATION


In [7]:
# Replace missing text fields with 'unknown'
df['director'].fillna('unknown', inplace=True)
df['cast'].fillna('unknown', inplace=True)
df['country'].fillna('unknown', inplace=True)

# Fill duration with mode (most common value)
df['duration'].fillna(df['duration'].mode()[0], inplace=True)

# Drop rows missing 'rating'
df.dropna(subset=['rating'], inplace=True)

# Reset the index after dropping
df.reset_index(drop=True, inplace=True)

# Check again
print(df.isna().sum(), '\n')
print(df.shape)


show_id                                               0
type                                                  0
title                                                 0
director                                              0
cast                                                  0
country                                               0
release_year                                          0
rating                                                0
duration                                              0
description                                           0
Action                                                0
Adventure                                             0
Anime Series International TV Shows                   0
British TV Shows Docuseries International TV Shows    0
Children                                              0
Comedies                                              0
Comedies Dramas International Movies                  0
Comedies International Movies                   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['director'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['cast'].fillna('unknown', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw

# MODELING PREP


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create TF-IDF vectorizer and remove stopwords
tfidf = TfidfVectorizer(stop_words='english')

# Fit and transform the 'description' field
tfidf_matrix = tfidf.fit_transform(df['description'])

# Shape of the matrix (shows x unique words)
print(tfidf_matrix.shape)


(8504, 18564)


In [8]:
df_tfidf = pd.DataFrame(tfidf_matrix.T.todense(), 
                        index=tfidf.get_feature_names_out(), 
                        columns=df['title'])

df_tfidf.iloc[2221:2226]


title,Dick Johnson Is Dead,Blood & Water,Ganglands,Jailbirds New Orleans,Kota Factory,Midnight Mass,My Little Pony: A New Generation,Sankofa,The Great British Baking Show,The Starling,...,Zak Storm,Zed Plus,Zenda,Zindagi Gulzar Hai,Zinzana,Zodiac,Zombie Dumb,Zombieland,Zoom,Zubaan
brand,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
branded,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brandi,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brando,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
brandon,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
from sklearn.metrics.pairwise import linear_kernel

# Compute pairwise cosine similarity between all shows
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Optional: view as a DataFrame
pd.DataFrame(cosine_sim)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8494,8495,8496,8497,8498,8499,8500,8501,8502,8503
0,1.000000,0.000000,0.000000,0.000000,0.015283,0.000000,0.000000,0.000000,0.040107,0.017486,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.015485,0.000000
1,0.000000,1.000000,0.000000,0.000000,0.000000,0.030973,0.052236,0.000000,0.000000,0.000000,...,0.03571,0.032478,0.118291,0.000000,0.032402,0.000000,0.041654,0.000000,0.000000,0.000000
2,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.022284
3,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.016315,0.000000
4,0.015283,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000,0.016786,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.035912,0.068620,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8499,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.046180,0.052486,0.000000,0.000000,1.000000,0.000000,0.060082,0.000000,0.000000
8500,0.000000,0.041654,0.000000,0.000000,0.000000,0.054674,0.000000,0.000000,0.000000,0.000000,...,0.00000,0.036839,0.000000,0.000000,0.036753,0.000000,1.000000,0.000000,0.000000,0.000000
8501,0.000000,0.000000,0.000000,0.000000,0.035912,0.000000,0.000000,0.000000,0.000000,0.062403,...,0.00000,0.000000,0.000000,0.042452,0.000000,0.060082,0.000000,1.000000,0.000000,0.000000
8502,0.015485,0.000000,0.000000,0.016315,0.068620,0.000000,0.000000,0.000000,0.000000,0.017008,...,0.00000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000


In [11]:
# Create a title-to-index mapping
indices = pd.Series(df.index, index=df['title'].str.lower())

def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    idx = indices[title.lower()]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:top_n+1]
    show_indices = [i[0] for i in sim_scores]
    return df.iloc[show_indices][['title', 'description']]


In [12]:
get_recommendations("Dick Johnson Is Dead")


Unnamed: 0,title,description
4732,End Game,Facing an inevitable outcome terminally ill pa...
1029,The Soul,While investigating the death of a businessman...
7250,Moon,As he nears the end of a lonely three-year sti...
4894,The Cloverfield Paradox,Orbiting above a planet on the brink of war sc...
5072,The Death and Life of Marsha P. Johnson,As she fights the tide of violence against tra...
