In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.metrics.pairwise import cosine_similarity
import ast
import numpy as np

In [14]:
df = pd.read_csv('NetFlix.csv')
df.shape

(7787, 12)

In [16]:
#tfidf
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['description'])

In [18]:
#put genres into list
df['genres'] = df['genres'].apply(lambda x: [genre.strip() for genre in x.split(',')])

#delete TV, Movies, and TV Shows from genres
def clean_genres(genres):
    filtered_genres = []
    for genre in genres:
        if 'TV' not in genre and 'Movies' not in genre and 'TV Shows' not in genre:
            if '&' in genre:
                filtered_genres.extend([g.strip() for g in genre.split(' & ')])
            else:
                filtered_genres.append(genre.strip())
    return filtered_genres

# take out unique genres
all_genres = set()
df['genres_cleaned'] = df['genres'].apply(clean_genres)
for genres in df['genres_cleaned']:
    all_genres.update(genres)

# one hot encoding
genre_columns = pd.DataFrame({genre: df['genres_cleaned'].apply(lambda x: 1 if genre in x else 0) for genre in all_genres})
df_genres = genre_columns

categorical_columns = ['type', 'director', 'cast', 'country', 'release_year']
other_one_hot = pd.get_dummies(df[categorical_columns])
other_one_hot = other_one_hot.astype(int)

#merge df
df_final = pd.concat([genre_columns, other_one_hot], axis=1)
# df_final['title'] = df['title']

# df_final = df_final[['title'] + [col for col in df_final.columns if col != 'title']]

In [6]:
# import matplotlib.pyplot as plt


# print(df['duration'].describe())

# plt.figure(figsize=(10, 6))
# plt.hist(df['duration'].dropna(), bins=20, edgecolor='k')
# plt.title('Distribution of Duration')
# plt.xlabel('Duration')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.show()

In [20]:
#one hot encoding for duration
def categorize_duration(value):
    if value <= 50:
        return '0-50'
    elif value <= 100:
        return '51-100'
    elif value <= 150:
        return '101-150'
    else:
        return '151+'

df['duration_category'] = df['duration'].apply(categorize_duration)
df_duration_one_hot = pd.get_dummies(df['duration_category'], prefix='duration')

df_duration_one_hot = df_duration_one_hot.astype(int)

df_final = pd.concat([df_final, df_duration_one_hot], axis=1)


In [21]:
df_final.head()

Unnamed: 0,Thrillers,Anime Series,Dramas,Documentaries,Stand-Up Comedy,Anime Features,Spirituality,Musicals,Talk Shows,Action,...,"country_Uruguay, Spain, Mexico",country_Venezuela,"country_Venezuela, Colombia",country_Vietnam,country_West Germany,country_Zimbabwe,duration_0-50,duration_101-150,duration_151+,duration_51-100
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0


In [9]:
#make lists of unique genres
print(all_genres)

{'Anime Series', 'Anime Features', 'Sci-Fi', 'Faith', 'Spirituality', 'Music', 'Talk Shows', 'Action', 'Fantasy', 'Comedies', 'Dramas', 'Adventure', 'Stand-Up Comedy', 'Docuseries', 'Thrillers', 'Musicals', 'Documentaries'}


In [24]:
#textual data processing
tfidf_vectorizer = TfidfVectorizer(stop_words='english')

#description
description_tfidf = tfidf_vectorizer.fit_transform(df['description'])

#title
title_tfidf = tfidf_vectorizer.fit_transform(df['title'])

#categorical data
categorical_data = df_final.to_numpy()

#merge
description_dense = description_tfidf.toarray()
title_dense = title_tfidf.toarray()

#add data holizontally
combined_matrix = np.hstack([categorical_data, description_dense, title_dense])

#cosine similarity
cosine_sim = cosine_similarity(combined_matrix)

#make it df
similarity_df = pd.DataFrame(cosine_sim, index=df['title'], columns=df['title'])

def get_similar_titles(user_title, similarity_matrix, titles, top_n=5):

    if user_title not in titles:
        raise ValueError(f"'{user_title}' does not exist")

    similarities = similarity_matrix[user_title]

#sort similarity
    similar_titles = similarities.sort_values(ascending=False).head(top_n + 1)
    similar_titles = similar_titles.iloc[1:]  #except itself

    return similar_titles


titles = list(df['title'])

# title input
user_title = input("please enter a title ")

try:
    top_similar = get_similar_titles(user_title, similarity_df, titles, top_n=5)
    
    #result
    print(f"similar titles to '{user_title}':")
    print(top_similar)
except ValueError as e:
    print(e)

please enter a title  3%


similar titles to '3%':
title
Find Yourself       1.000000
Alone               1.000000
What If?            0.999999
Back with the Ex    0.999999
Selling Sunset      0.999999
Name: 3%, dtype: float64


In [None]:
get_similar_titles()