In [17]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import neattext.functions as nfx
import warnings

# Ignore warnings
warnings.filterwarnings("ignore")

In [18]:
# Load the dataset
netflix_data = pd.read_csv('netflix_titles.csv')
print(netflix_data.head())


  show_id     type                  title         director  \
0      s1    Movie   Dick Johnson Is Dead  Kirsten Johnson   
1      s2  TV Show          Blood & Water              NaN   
2      s3  TV Show              Ganglands  Julien Leclercq   
3      s4  TV Show  Jailbirds New Orleans              NaN   
4      s5  TV Show           Kota Factory              NaN   

                                                cast        country  \
0                                                NaN  United States   
1  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...   South Africa   
2  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...            NaN   
3                                                NaN            NaN   
4  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...          India   

           date_added  release_year rating   duration  \
0  September 25, 2021          2020  PG-13     90 min   
1  September 24, 2021          2021  TV-MA  2 Seasons   
2  September 24, 2021        

In [19]:
# Rename the 'listed_in' column to 'genres'
netflix_data.rename(columns={'listed_in': 'genres'}, inplace=True)

# Check the distribution of content types
print(netflix_data['type'].value_counts())

type
Movie      6131
TV Show    2676
Name: count, dtype: int64


In [20]:
# Filter movies from the dataset
movies_data = netflix_data[netflix_data['type'] == 'Movie'].reset_index(drop=True)
print(movies_data.head())

# Check for duplicates
print("Number of duplicates in movies data:", movies_data.duplicated().sum())

# Check for missing values
print("Missing values in movies data:\n", movies_data.isnull().sum())


  show_id   type                             title  \
0      s1  Movie              Dick Johnson Is Dead   
1      s7  Movie  My Little Pony: A New Generation   
2      s8  Movie                           Sankofa   
3     s10  Movie                      The Starling   
4     s13  Movie                      Je Suis Karl   

                        director  \
0                Kirsten Johnson   
1  Robert Cullen, José Luis Ucha   
2                   Haile Gerima   
3                 Theodore Melfi   
4            Christian Schwochow   

                                                cast  \
0                                                NaN   
1  Vanessa Hudgens, Kimiko Glenn, James Marsden, ...   
2  Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...   
3  Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...   
4  Luna Wedler, Jannis Niewöhner, Milan Peschel, ...   

                                             country          date_added  \
0                                      United

In [21]:
# Fill missing values in the 'rating' column
movies_data['rating'].fillna('NaN', inplace=True)

# Drop rows with missing values
movies_data.dropna(inplace=True)
movies_data = movies_data.reset_index(drop=True)
print(movies_data.head())


  show_id   type         title             director  \
0      s8  Movie       Sankofa         Haile Gerima   
1     s10  Movie  The Starling       Theodore Melfi   
2     s13  Movie  Je Suis Karl  Christian Schwochow   
3     s25  Movie         Jeans           S. Shankar   
4     s28  Movie     Grown Ups         Dennis Dugan   

                                                cast  \
0  Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...   
1  Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...   
2  Luna Wedler, Jannis Niewöhner, Milan Peschel, ...   
3  Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...   
4  Adam Sandler, Kevin James, Chris Rock, David S...   

                                             country          date_added  \
0  United States, Ghana, Burkina Faso, United Kin...  September 24, 2021   
1                                      United States  September 24, 2021   
2                            Germany, Czech Republic  September 23, 2021   
3                           

In [22]:
# Select relevant features for recommendation
movie_features = movies_data[['title', 'director', 'cast', 'country', 'rating', 'genres']]
print(movie_features.head())

# Describe the selected features
print(movie_features.describe().T)

          title             director  \
0       Sankofa         Haile Gerima   
1  The Starling       Theodore Melfi   
2  Je Suis Karl  Christian Schwochow   
3         Jeans           S. Shankar   
4     Grown Ups         Dennis Dugan   

                                                cast  \
0  Kofi Ghanaba, Oyafunmike Ogunlano, Alexandra D...   
1  Melissa McCarthy, Chris O'Dowd, Kevin Kline, T...   
2  Luna Wedler, Jannis Niewöhner, Milan Peschel, ...   
3  Prashanth, Aishwarya Rai Bachchan, Sri Lakshmi...   
4  Adam Sandler, Kevin James, Chris Rock, David S...   

                                             country rating  \
0  United States, Ghana, Burkina Faso, United Kin...  TV-MA   
1                                      United States  PG-13   
2                            Germany, Czech Republic  TV-MA   
3                                              India  TV-14   
4                                      United States  PG-13   

                                           

In [23]:
# Prepare data for vectorization
# Remove stopwords and special characters
movie_features['director'] = movie_features['director'].apply(nfx.remove_stopwords)
movie_features['cast'] = movie_features['cast'].apply(nfx.remove_stopwords)
movie_features['country'] = movie_features['country'].apply(nfx.remove_stopwords)
movie_features['genres'] = movie_features['genres'].apply(nfx.remove_stopwords)
movie_features['country'] = movie_features['country'].apply(nfx.remove_special_characters)


In [24]:
# Vectorize the data
vectorizer = CountVectorizer(binary=True)
country_matrix = vectorizer.fit_transform(movie_features['country']).toarray()

vectorizer = CountVectorizer(binary=True, tokenizer=lambda x: x.split(','))
director_matrix = vectorizer.fit_transform(movie_features['director']).toarray()
cast_matrix = vectorizer.fit_transform(movie_features['cast']).toarray()
genres_matrix = vectorizer.fit_transform(movie_features['genres']).toarray()


In [25]:
# Convert matrices to DataFrames
director_df = pd.DataFrame(director_matrix).transpose()
cast_df = pd.DataFrame(cast_matrix).transpose()
country_df = pd.DataFrame(country_matrix).transpose()
genres_df = pd.DataFrame(genres_matrix).transpose()

# Combine all DataFrames
combined_features = pd.concat([director_df, cast_df, country_df, genres_df], axis=0, ignore_index=True)


In [26]:
# Compute cosine similarity
movie_similarity = cosine_similarity(combined_features.T)
print("Shape of similarity matrix:", movie_similarity.shape)

# Filter TV shows from the dataset
tv_shows_data = netflix_data[netflix_data['type'] == 'TV Show'].reset_index(drop=True)
print(tv_shows_data.head())

# Check for duplicates
print("Number of duplicates in TV shows data:", tv_shows_data.duplicated().sum())


Shape of similarity matrix: (5186, 5186)
  show_id     type                  title         director  \
0      s2  TV Show          Blood & Water              NaN   
1      s3  TV Show              Ganglands  Julien Leclercq   
2      s4  TV Show  Jailbirds New Orleans              NaN   
3      s5  TV Show           Kota Factory              NaN   
4      s6  TV Show          Midnight Mass    Mike Flanagan   

                                                cast       country  \
0  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...  South Africa   
1  Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...           NaN   
2                                                NaN           NaN   
3  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...         India   
4  Kate Siegel, Zach Gilford, Hamish Linklater, H...           NaN   

           date_added  release_year rating   duration  \
0  September 24, 2021          2021  TV-MA  2 Seasons   
1  September 24, 2021          2021  TV-MA   1 Seas

In [27]:
# KNN with Cosine Similarity
knn = NearestNeighbors(metric='cosine', algorithm='brute')
knn.fit(movies_tfidf_matrix)
# Check for missing values
print("Missing values in TV shows data:\n", tv_shows_data.isnull().sum())

# Fill missing values in the 'director' column
tv_shows_data['director'].fillna('NaN', inplace=True)

# Drop rows with missing values
tv_shows_data.dropna(inplace=True)
tv_shows_data = tv_shows_data.reset_index(drop=True)
print(tv_shows_data.head())

Missing values in TV shows data:
 show_id            0
type               0
title              0
director        2446
cast             350
country          391
date_added        10
release_year       0
rating             2
duration           0
genres             0
description        0
dtype: int64
  show_id     type                          title         director  \
0      s2  TV Show                  Blood & Water              NaN   
1      s5  TV Show                   Kota Factory              NaN   
2      s9  TV Show  The Great British Baking Show  Andy Devonshire   
3     s16  TV Show              Dear White People              NaN   
4     s18  TV Show                Falsa identidad              NaN   

                                                cast         country  \
0  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...    South Africa   
1  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...           India   
2  Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...  United Kin

In [28]:
# Select relevant features for recommendation
tv_features = tv_shows_data[['title', 'director', 'cast', 'country', 'rating', 'genres']]
print(tv_features.head())

# Describe the selected features
print(tv_features.describe().T)


                           title         director  \
0                  Blood & Water              NaN   
1                   Kota Factory              NaN   
2  The Great British Baking Show  Andy Devonshire   
3              Dear White People              NaN   
4                Falsa identidad              NaN   

                                                cast         country rating  \
0  Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...    South Africa  TV-MA   
1  Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...           India  TV-MA   
2  Mel Giedroyc, Sue Perkins, Mary Berry, Paul Ho...  United Kingdom  TV-14   
3  Logan Browning, Brandon P. Bell, DeRon Horton,...   United States  TV-MA   
4  Luis Ernesto Franco, Camila Sodi, Sergio Goyri...          Mexico  TV-MA   

                                              genres  
0    International TV Shows, TV Dramas, TV Mysteries  
1  International TV Shows, Romantic TV Shows, TV ...  
2                       British TV Shows,

In [29]:
# Prepare data for vectorization
# Remove stopwords and special characters
tv_features['cast'] = tv_features['cast'].apply(nfx.remove_stopwords)
tv_features['country'] = tv_features['country'].apply(nfx.remove_stopwords)
tv_features['genres'] = tv_features['genres'].apply(nfx.remove_stopwords)
tv_features['country'] = tv_features['country'].apply(nfx.remove_special_characters)


In [32]:
# Vectorize the data
vectorizer = CountVectorizer(binary=True)
country_matrix = vectorizer.fit_transform(tv_features['country']).toarray()

vectorizer = CountVectorizer(binary=True, tokenizer=lambda x: x.split(','))
cast_matrix = vectorizer.fit_transform(tv_features['cast']).toarray()
genres_matrix = vectorizer.fit_transform(tv_features['genres']).toarray()


In [33]:
# Convert matrices to DataFrames
cast_df = pd.DataFrame(cast_matrix).transpose()
country_df = pd.DataFrame(country_matrix).transpose()
genres_df = pd.DataFrame(genres_matrix).transpose()

# Combine all DataFrames
combined_tv_features = pd.concat([cast_df, country_df, genres_df], axis=0, ignore_index=True)


In [34]:
# Compute cosine similarity
tv_similarity = cosine_similarity(combined_tv_features.T)
print("Shape of TV similarity matrix:", tv_similarity.shape)


Shape of TV similarity matrix: (2013, 2013)


In [35]:
# Recommendation function
def get_recommendations(title):
    if title in movies_data['title'].values:
        index = movies_data[movies_data['title'] == title].index.item()
        scores = dict(enumerate(movie_similarity[index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_indices = [id for id, score in sorted_scores.items()]
        selected_scores = [score for id, score in sorted_scores.items()]

        recommendations = movies_data.iloc[selected_indices]
        recommendations['similarity'] = selected_scores

        return recommendations[1:6]  # Skip the first row (same movie)

    elif title in tv_shows_data['title'].values:
        index = tv_shows_data[tv_shows_data['title'] == title].index.item()
        scores = dict(enumerate(tv_similarity[index]))
        sorted_scores = dict(sorted(scores.items(), key=lambda x: x[1], reverse=True))

        selected_indices = [id for id, score in sorted_scores.items()]
        selected_scores = [score for id, score in sorted_scores.items()]

        recommendations = tv_shows_data.iloc[selected_indices]
        recommendations['similarity'] = selected_scores

        return recommendations[1:6]  # Skip the first row (same show)

    else:
        print("Title not found in the dataset. Please check the spelling.")

In [36]:
# Test the recommendation function
print(get_recommendations("Child's Play"))
print(get_recommendations("Breaking Bad"))


     show_id   type                  title                     director  \
3531   s6416  Movie               Candyman                 Bernard Rose   
4781   s8238  Movie                The Car           Elliot Silverstein   
332     s797  Movie       Hostel: Part III                Scott Spiegel   
2343   s4525  Movie  Tales From the Hood 2  Rusty Cundieff, Darin Scott   
3618   s6545  Movie         Cult of Chucky                  Don Mancini   

                                                   cast  \
3531  Virginia Madsen, Tony Todd, Xander Berkeley, K...   
4781  James Brolin, Kathleen Lloyd, John Marley, R.G...   
332   Kip Pardue, Brian Hallisay, John Hensley, Sara...   
2343  Keith David, Bryan Batt, Alexandria Deberry, B...   
3618  Fiona Dourif, Michael Therriault, Adam Hurtig,...   

                            country        date_added  release_year rating  \
3531  United States, United Kingdom   October 1, 2019          1992      R   
4781                  United States   

In [38]:
import plotly.graph_objects as go

def display_recommendations(df):
    fig = go.Figure(data=[go.Table(
        columnorder=[1, 2, 3, 4, 5],
        columnwidth=[20, 20, 20, 30, 50],
        header=dict(values=list(['Type', 'Title', 'Country', 'Genre(s)', 'Description']),
                    line_color='black', font=dict(color='black', family="Gravitas One", size=20), height=40,
                    fill_color='#FF6865',
                    align='center'),
        cells=dict(values=[df.type, df.title, df.country, df.genres, df.description],
                   font=dict(color='black', family="Lato", size=16),
                   fill_color='#FFB3B2',
                   align='left'))
    ])

    fig.update_layout(height=700,
                      title={'text': "Top 5 Recommendations", 'font': {'size': 22, 'family': 'Gravitas One'}},
                      title_x=0.5
                      )
    fig.show()



In [39]:
# Display recommendations for a sample title
display_recommendations(get_recommendations("Elite"))