In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# @title Package Installation
!pip3 install lightning
!pip3 install ydata-profiling
!pip3 install pycaret[full]
!pip3 install swifter
!pip3 install transformers[torch]
!pip3 install datasets

In [None]:
import os
import pandas as pd
import numpy as np
from pathlib import Path
from ydata_profiling import ProfileReport

from sklearn.decomposition import NMF
from sklearn.preprocessing import MaxAbsScaler, Normalizer, normalize
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

from pycaret.regression import RegressionExperiment
from pycaret.classification import ClassificationExperiment

#### Case Study - Netflix & Chill

In this exercise, you are given 9957 netflix titles with various features.

Your tasks:
1. Convert the features into a layout that ML models can use.
2. Convert the description into a set of features that ML models can use.
3. Perform a SML on the rating
4. Create a recommender system that recommends 10 movies to an user after receiving an input for 3 movies that they like.

In [None]:
netflix = pd.read_csv(usml_data_path / 'netflix_movies.csv')
netflix

Unnamed: 0,title,year,certificate,duration,genre,rating,description,stars,votes
0,Cobra Kai,(2018– ),TV-14,30 min,"Action, Comedy, Drama",8.5,Decades after their 1984 All Valley Karate Tou...,"['Ralph Macchio, ', 'William Zabka, ', 'Courtn...",177031
1,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199885
2,Better Call Saul,(2015–2022),TV-MA,46 min,"Crime, Drama",8.9,The trials and tribulations of criminal lawyer...,"['Bob Odenkirk, ', 'Rhea Seehorn, ', 'Jonathan...",501384
3,Devil in Ohio,(2022),TV-MA,356 min,"Drama, Horror, Mystery",5.9,When a psychiatrist shelters a mysterious cult...,"['Emily Deschanel, ', 'Sam Jaeger, ', 'Gerardo...",9773
4,Cyberpunk: Edgerunners,(2022– ),TV-MA,24 min,"Animation, Action, Adventure",8.6,A Street Kid trying to survive in a technology...,"['Zach Aguilar, ', 'Kenichiro Ohashi, ', 'Emi ...",15413
...,...,...,...,...,...,...,...,...,...
9952,The Imperfects,(2022– ),TV-MA,45 min,"Action, Adventure, Drama",6.3,After an experimental gene therapy turns them ...,"['Morgan Taylor Campbell, ', 'Italia Ricci, ',...",3130
9953,The Walking Dead,(2010–2022),TV-MA,44 min,"Drama, Horror, Thriller",8.1,Sheriff Deputy Rick Grimes wakes up from a com...,"['Andrew Lincoln, ', 'Norman Reedus, ', 'Melis...",970067
9954,The Crown,(2016– ),TV-MA,58 min,"Biography, Drama, History",8.7,Follows the political rivalries and romance of...,"['Claire Foy, ', 'Olivia Colman, ', 'Imelda St...",199898
9955,Supernatural,(2005–2020),TV-14,44 min,"Drama, Fantasy, Horror",8.4,Two brothers follow their father's footsteps a...,"['Jared Padalecki, ', 'Jensen Ackles, ', 'Jim ...",439601


In [None]:
# Convert the year feature into just the starting year
netflix['year'] = netflix.year.str.extract(r'(\d{4})')

In [None]:
# Filter the rows for missing values in duration
netflix[netflix.duration.isna()]
netflix

In [None]:
# Should we drop the column or fill in the missing values?
# ## We can fill in the missing values with the mean value
netflix['duration'] = netflix['duration'].fillna(netflix['duration'].mean)

# ## We can drop the column
netflix = netflix.drop(columns=['duration'])

In [None]:
# Split the genre into multiple columns
# Extract the genre into a separate dataframe using the method explode
# Think of a way to indicate 1 for every genre that each title belongs to
# Don't concat it to the netflix df yet
netflix['genre'] = netflix.genre.str.split(', ')
genre = netflix[['genre']].explode('genre')
genre = pd.get_dummies(genre, columns=['genre'])
genre = genre.groupby(genre.index).sum()
genre

Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,...,genre_News,genre_Reality-TV,genre_Romance,genre_Sci-Fi,genre_Short,genre_Sport,genre_Talk-Show,genre_Thriller,genre_War,genre_Western
0,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9953,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,0
9954,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9955,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Do the same for the stars
# Check the number of columns that it will generate
netflix['stars'] = netflix.stars.str.split(', ')
stars = netflix[['stars']].explode('stars')
stars = pd.get_dummies(stars, columns=['stars'])
stars = stars.groupby(stars.index).sum()
stars

Unnamed: 0,"stars_""","stars_""Ahmad Al'Azzeh","stars_""Ahna O'Reilly","stars_""Alexx O'Nell""]","stars_""Annette O'Toole","stars_""Antonio Durán 'Morris'","stars_""Anya O'Connor","stars_""Anya O'Connor""]","stars_""Ashling O'Shea","stars_""Auli'i Cravalho",...,stars_['Åke Sandgren',stars_['Çagatay Ulusoy,stars_['Ève Landry,stars_['Ólafur Darri Ólafsson,stars_['Ömer Ugur',stars_['Öner Erkan,stars_['Öykü Karayel,stars_['Özcan Alper',stars_['Úrsula Corberó,stars_[]
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9953,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9954,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9955,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Given that there are 27k+ stars, we should consider decomposition techniques
# and represent each film with a few latent features

# Should we have 1 NMF for stars + genre or individual NMFs for stars and genre?
# Think about how latent features are generated, and the possibility of interactions between stars and genres
genre_stars = pd.concat([genre, stars], axis=1)
genre_stars

Unnamed: 0,genre_Action,genre_Adventure,genre_Animation,genre_Biography,genre_Comedy,genre_Crime,genre_Documentary,genre_Drama,genre_Family,genre_Fantasy,...,stars_['Åke Sandgren',stars_['Çagatay Ulusoy,stars_['Ève Landry,stars_['Ólafur Darri Ólafsson,stars_['Ömer Ugur',stars_['Öner Erkan,stars_['Öykü Karayel,stars_['Özcan Alper',stars_['Úrsula Corberó,stars_[]
0,1,0,0,0,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,1,1,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9953,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9954,0,0,0,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
9955,0,0,0,0,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Perform NMF on your choice of stars and genre

# What is the scaler that you should use?
scaler = MaxAbsScaler()

# How many components should we use?
nmf = NMF(n_components=20)

# Create the pipeline and apply it to the genre_stars dataframe
pipeline = make_pipeline(scaler, nmf)
genre_stars_nmf = pipeline.fit_transform(genre_stars)
genre_stars_nmf_df = pd.DataFrame(genre_stars_nmf, columns=[f'genre_stars_{i}' for i in range(20)])
genre_stars_nmf_df

Unnamed: 0,genre_stars_0,genre_stars_1,genre_stars_2,genre_stars_3,genre_stars_4,genre_stars_5,genre_stars_6,genre_stars_7,genre_stars_8,genre_stars_9,genre_stars_10,genre_stars_11,genre_stars_12,genre_stars_13,genre_stars_14,genre_stars_15,genre_stars_16,genre_stars_17,genre_stars_18,genre_stars_19
0,0.0,0.084643,0.121223,0.101446,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.0,0.084225,0.000000,0.000036,0.000000,0.000000,0.000099,0.000000,0.000000,0.000071,0.000000,0.000000,0.0,0.000000,0.000000,0.170823,0.000199,0.0,0.203359,0.000000
2,0.0,0.084399,0.000000,0.000039,0.105095,0.000113,0.000104,0.000085,0.000000,0.000021,0.000000,0.000000,0.0,0.000526,0.000107,0.000109,0.000047,0.0,0.000000,0.000524
3,0.0,0.084192,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.128454,0.000000,0.0,0.000000,0.000000,0.000000,0.168004,0.0,0.000000,0.000000
4,0.0,0.000000,0.119003,0.000000,0.000000,0.000082,0.082118,0.000000,0.084193,0.000000,0.000000,0.000224,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,0.0,0.084485,0.119497,0.000000,0.000000,0.000044,0.000000,0.000000,0.084992,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
9953,0.0,0.084342,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.096046,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.168860,0.0,0.000000,0.000000
9954,0.0,0.084225,0.000000,0.000036,0.000000,0.000000,0.000099,0.000000,0.000000,0.000071,0.000000,0.000000,0.0,0.000000,0.000000,0.170823,0.000199,0.0,0.203359,0.000000
9955,0.0,0.084281,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.153709,0.0,0.000000,0.000000,0.000000,0.168289,0.0,0.000000,0.000000


In [None]:
# Combine the NMF features with the original dataframe
# Drop genre and stars columns
netflix = pd.concat([netflix, genre_stars_nmf_df], axis=1)
netflix.drop(columns=['genre', 'stars'], inplace=True)
netflix

Unnamed: 0,title,year,certificate,rating,description,votes,genre_stars_0,genre_stars_1,genre_stars_2,genre_stars_3,...,genre_stars_10,genre_stars_11,genre_stars_12,genre_stars_13,genre_stars_14,genre_stars_15,genre_stars_16,genre_stars_17,genre_stars_18,genre_stars_19
0,Cobra Kai,2018,TV-14,8.5,Decades after their 1984 All Valley Karate Tou...,177031,0.0,0.084643,0.121223,0.101446,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,The Crown,2016,TV-MA,8.7,Follows the political rivalries and romance of...,199885,0.0,0.084225,0.000000,0.000036,...,0.000000,0.000000,0.0,0.000000,0.000000,0.170823,0.000199,0.0,0.203359,0.000000
2,Better Call Saul,2015,TV-MA,8.9,The trials and tribulations of criminal lawyer...,501384,0.0,0.084399,0.000000,0.000039,...,0.000000,0.000000,0.0,0.000526,0.000107,0.000109,0.000047,0.0,0.000000,0.000524
3,Devil in Ohio,2022,TV-MA,5.9,When a psychiatrist shelters a mysterious cult...,9773,0.0,0.084192,0.000000,0.000000,...,0.128454,0.000000,0.0,0.000000,0.000000,0.000000,0.168004,0.0,0.000000,0.000000
4,Cyberpunk: Edgerunners,2022,TV-MA,8.6,A Street Kid trying to survive in a technology...,15413,0.0,0.000000,0.119003,0.000000,...,0.000000,0.000224,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,The Imperfects,2022,TV-MA,6.3,After an experimental gene therapy turns them ...,3130,0.0,0.084485,0.119497,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
9953,The Walking Dead,2010,TV-MA,8.1,Sheriff Deputy Rick Grimes wakes up from a com...,970067,0.0,0.084342,0.000000,0.000000,...,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.168860,0.0,0.000000,0.000000
9954,The Crown,2016,TV-MA,8.7,Follows the political rivalries and romance of...,199898,0.0,0.084225,0.000000,0.000036,...,0.000000,0.000000,0.0,0.000000,0.000000,0.170823,0.000199,0.0,0.203359,0.000000
9955,Supernatural,2005,TV-14,8.4,Two brothers follow their father's footsteps a...,439601,0.0,0.084281,0.000000,0.000000,...,0.000000,0.153709,0.0,0.000000,0.000000,0.000000,0.168289,0.0,0.000000,0.000000


In [None]:
# Observe that there's other features such as year and votes (and duration if you didn't drop it)
# Should we have combined all these and NMF them together?
# Consider the implications:
#   1. interactions between these features (interpretability)
#   2. how would you scale these features

In [None]:
# Lastly, we need to NMF the description
# Extract the descriptions and clean it
desc = netflix.description.swifter.apply(clean_text)
desc

Pandas Apply:   0%|          | 0/9957 [00:00<?, ?it/s]

0       decade 1984 valley karate tournament bout midd...
1       follows political rivalry romance queen elizab...
2       trial tribulation criminal lawyer jimmy mcgill...
3       psychiatrist shelter mysterious cult escapee w...
4       street kid trying survive technology body modi...
                              ...                        
9952    experimental gene therapy turn monster three t...
9953    sheriff deputy rick grime wake coma learn worl...
9954    follows political rivalry romance queen elizab...
9955    two brother follow father 's footstep hunter f...
9956    psychiatrist shelter mysterious cult escapee w...
Name: description, Length: 9957, dtype: object

In [None]:
# Perform NMF on the descriptions
# What additional preprocessing do you need?
tfidf = TfidfVectorizer()
desc_mat = tfidf.fit_transform(desc)

# What is the scaler that you should use?
scaler = Normalizer()

# How many components should we use?
nmf = NMF(n_components=10)

# Create the pipeline and apply it to the desc dataframe
pipeline = make_pipeline(scaler, nmf)
desc_nmf = pipeline.fit_transform(desc_mat)
desc_nmf_df = pd.DataFrame(desc_nmf, columns=[f'desc_{i}' for i in range(10)])
desc_nmf_df

Unnamed: 0,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9
0,0.0,0.015647,0.000000,0.001081,0.000000,0.000000,0.000772,0.002233,0.001985,0.000423
1,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
2,0.0,0.012024,0.000000,0.000387,0.001111,0.004638,0.000000,0.002194,0.000000,0.000000
3,0.0,0.012890,0.000000,0.000000,0.012765,0.000000,0.014953,0.042532,0.012337,0.004717
4,0.0,0.019644,0.000000,0.002413,0.000000,0.002421,0.004420,0.005979,0.004103,0.002915
...,...,...,...,...,...,...,...,...,...,...
9952,0.0,0.027328,0.000000,0.000000,0.004819,0.000000,0.011676,0.007419,0.000552,0.006878
9953,0.0,0.020934,0.000000,0.000000,0.000000,0.000000,0.002928,0.044594,0.000000,0.000000
9954,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
9955,0.0,0.028379,0.000000,0.000371,0.000000,0.000000,0.009591,0.009479,0.004203,0.002349


In [None]:
# Combine the NMF features with the original dataframe
# Drop the description column
netflix = pd.concat([netflix, desc_nmf_df], axis=1)
netflix.drop(columns=['description'], inplace=True)
netflix

Unnamed: 0,title,year,certificate,rating,votes,genre_stars_0,genre_stars_1,genre_stars_2,genre_stars_3,genre_stars_4,...,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9
0,Cobra Kai,2018,TV-14,8.5,177031,0.0,0.084643,0.121223,0.101446,0.000000,...,0.0,0.015647,0.000000,0.001081,0.000000,0.000000,0.000772,0.002233,0.001985,0.000423
1,The Crown,2016,TV-MA,8.7,199885,0.0,0.084225,0.000000,0.000036,0.000000,...,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
2,Better Call Saul,2015,TV-MA,8.9,501384,0.0,0.084399,0.000000,0.000039,0.105095,...,0.0,0.012024,0.000000,0.000387,0.001111,0.004638,0.000000,0.002194,0.000000,0.000000
3,Devil in Ohio,2022,TV-MA,5.9,9773,0.0,0.084192,0.000000,0.000000,0.000000,...,0.0,0.012890,0.000000,0.000000,0.012765,0.000000,0.014953,0.042532,0.012337,0.004717
4,Cyberpunk: Edgerunners,2022,TV-MA,8.6,15413,0.0,0.000000,0.119003,0.000000,0.000000,...,0.0,0.019644,0.000000,0.002413,0.000000,0.002421,0.004420,0.005979,0.004103,0.002915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,The Imperfects,2022,TV-MA,6.3,3130,0.0,0.084485,0.119497,0.000000,0.000000,...,0.0,0.027328,0.000000,0.000000,0.004819,0.000000,0.011676,0.007419,0.000552,0.006878
9953,The Walking Dead,2010,TV-MA,8.1,970067,0.0,0.084342,0.000000,0.000000,0.000000,...,0.0,0.020934,0.000000,0.000000,0.000000,0.000000,0.002928,0.044594,0.000000,0.000000
9954,The Crown,2016,TV-MA,8.7,199898,0.0,0.084225,0.000000,0.000036,0.000000,...,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
9955,Supernatural,2005,TV-14,8.4,439601,0.0,0.084281,0.000000,0.000000,0.000000,...,0.0,0.028379,0.000000,0.000371,0.000000,0.000000,0.009591,0.009479,0.004203,0.002349


In [None]:
# Check what dtype is votes?
# Do we need to convert it to a numerical value?
netflix.votes = pd.to_numeric(netflix.votes.str.replace(',', ''))
netflix

Unnamed: 0,title,year,certificate,rating,votes,genre_stars_0,genre_stars_1,genre_stars_2,genre_stars_3,genre_stars_4,...,desc_0,desc_1,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9
0,Cobra Kai,2018,TV-14,8.5,177031.0,0.0,0.084643,0.121223,0.101446,0.000000,...,0.0,0.015647,0.000000,0.001081,0.000000,0.000000,0.000772,0.002233,0.001985,0.000423
1,The Crown,2016,TV-MA,8.7,199885.0,0.0,0.084225,0.000000,0.000036,0.000000,...,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
2,Better Call Saul,2015,TV-MA,8.9,501384.0,0.0,0.084399,0.000000,0.000039,0.105095,...,0.0,0.012024,0.000000,0.000387,0.001111,0.004638,0.000000,0.002194,0.000000,0.000000
3,Devil in Ohio,2022,TV-MA,5.9,9773.0,0.0,0.084192,0.000000,0.000000,0.000000,...,0.0,0.012890,0.000000,0.000000,0.012765,0.000000,0.014953,0.042532,0.012337,0.004717
4,Cyberpunk: Edgerunners,2022,TV-MA,8.6,15413.0,0.0,0.000000,0.119003,0.000000,0.000000,...,0.0,0.019644,0.000000,0.002413,0.000000,0.002421,0.004420,0.005979,0.004103,0.002915
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9952,The Imperfects,2022,TV-MA,6.3,3130.0,0.0,0.084485,0.119497,0.000000,0.000000,...,0.0,0.027328,0.000000,0.000000,0.004819,0.000000,0.011676,0.007419,0.000552,0.006878
9953,The Walking Dead,2010,TV-MA,8.1,970067.0,0.0,0.084342,0.000000,0.000000,0.000000,...,0.0,0.020934,0.000000,0.000000,0.000000,0.000000,0.002928,0.044594,0.000000,0.000000
9954,The Crown,2016,TV-MA,8.7,199898.0,0.0,0.084225,0.000000,0.000036,0.000000,...,0.0,0.000920,0.000247,0.002040,0.010946,0.003669,0.006668,0.028911,0.000499,0.004783
9955,Supernatural,2005,TV-14,8.4,439601.0,0.0,0.084281,0.000000,0.000000,0.000000,...,0.0,0.028379,0.000000,0.000371,0.000000,0.000000,0.009591,0.009479,0.004203,0.002349


In [None]:
# Check for missing values
netflix.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9957 entries, 0 to 9956
Data columns (total 35 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   title           9957 non-null   object 
 1   year            9326 non-null   object 
 2   certificate     6504 non-null   object 
 3   rating          8784 non-null   float64
 4   votes           8784 non-null   float64
 5   genre_stars_0   9957 non-null   float64
 6   genre_stars_1   9957 non-null   float64
 7   genre_stars_2   9957 non-null   float64
 8   genre_stars_3   9957 non-null   float64
 9   genre_stars_4   9957 non-null   float64
 10  genre_stars_5   9957 non-null   float64
 11  genre_stars_6   9957 non-null   float64
 12  genre_stars_7   9957 non-null   float64
 13  genre_stars_8   9957 non-null   float64
 14  genre_stars_9   9957 non-null   float64
 15  genre_stars_10  9957 non-null   float64
 16  genre_stars_11  9957 non-null   float64
 17  genre_stars_12  9957 non-null   f

In [None]:
# We are going to predict ratings
# Can we just drop the rows with missing values?
netflix.drop(columns=['certificate'], axis=1, inplace=True)

# Create a copy of the dataframe and name it rating_df
# Proceed with your choice of processing the data
rating_df = netflix.copy()
rating_df.dropna(inplace=True)

In [None]:
# Perform regression or classification on the netflix dataset?
exp = RegressionExperiment()
exp.setup(
    data=rating_df,
    target='rating',
    ignore_features=['title'],
    use_gpu=True
)



[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


Unnamed: 0,Description,Value
0,Session id,8249
1,Target,rating
2,Target type,Regression
3,Original data shape,"(8784, 34)"
4,Transformed data shape,"(8784, 33)"
5,Transformed train set shape,"(6148, 33)"
6,Transformed test set shape,"(2636, 33)"
7,Ignore features,1
8,Numeric features,31
9,Categorical features,1




[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


<pycaret.regression.oop.RegressionExperiment at 0x459c62750>

In [None]:
# Perform model selection
best_model = exp.compare_models()

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,0.6447,0.7727,0.8782,0.4898,0.1287,0.1098,2.386
lightgbm,Light Gradient Boosting Machine,0.6598,0.7917,0.8891,0.4771,0.1301,0.1122,36.945
et,Extra Trees Regressor,0.6597,0.812,0.9002,0.4641,0.1321,0.112,1.072
xgboost,Extreme Gradient Boosting,0.6851,0.846,0.9191,0.441,0.1337,0.1156,9.547
gbr,Gradient Boosting Regressor,0.7005,0.8683,0.9311,0.4268,0.1364,0.1199,3.813
br,Bayesian Ridge,0.8316,1.1466,1.0703,0.2422,0.1531,0.1402,0.793
lr,Linear Regression,0.8316,1.1466,1.0703,0.2421,0.153,0.1401,0.436
lar,Least Angle Regression,0.8316,1.1466,1.0703,0.2421,0.153,0.1401,0.114
ridge,Ridge Regression,0.834,1.1536,1.0735,0.2378,0.1539,0.141,0.092
ada,AdaBoost Regressor,0.9137,1.2788,1.1303,0.1538,0.1581,0.1466,0.812


In [None]:
# Validate best model on test data
exp.predict_model(best_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Random Forest Regressor,0.6475,0.7549,0.8688,0.4549,0.1251,0.1079


Unnamed: 0,year,votes,genre_stars_0,genre_stars_1,genre_stars_2,genre_stars_3,genre_stars_4,genre_stars_5,genre_stars_6,genre_stars_7,...,desc_2,desc_3,desc_4,desc_5,desc_6,desc_7,desc_8,desc_9,rating,prediction_label
8388,2014,1666.0,0.111551,0.084937,0.120296,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.040292,0.000000,0.001460,0.029402,0.000000,0.000000,6.9,7.576000
2924,2020,306.0,0.000000,0.000000,0.000000,0.100511,0.000000,0.000000,0.082292,0.000000,...,0.000042,0.000000,0.000042,0.001781,0.000623,0.000000,0.001880,0.002826,5.7,6.873000
3833,2015,6329.0,0.111716,0.000013,0.000000,0.000037,0.000000,0.122850,0.000005,0.000000,...,0.000000,0.000000,0.006178,0.056534,0.026063,0.000000,0.000000,0.000000,7.4,7.246000
9872,2021,1281.0,0.111514,0.083991,0.000000,0.000000,0.104601,0.000000,0.000000,0.000000,...,0.000000,0.001086,0.000144,0.003022,0.000000,0.002452,0.000970,0.000772,7.3,6.321000
4679,2021,270.0,0.000000,0.000168,0.000000,0.000000,0.000099,0.000406,0.000169,0.000434,...,0.000000,0.009871,0.004767,0.007187,0.000000,0.012609,0.002074,0.002139,5.7,5.686000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8616,2018,3627.0,0.111529,0.084601,0.121613,0.101569,0.000000,0.000000,0.000000,0.000000,...,0.000145,0.000000,0.000000,0.000000,0.000000,0.000000,0.000241,0.000000,8.7,8.243000
7632,2006,131.0,0.111514,0.000000,0.119075,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000262,0.072198,0.000000,0.000000,0.000000,0.000000,0.000444,0.000000,3.1,5.498667
4095,2015,742.0,0.111775,0.084612,0.000000,0.000012,0.000008,0.000000,0.000000,0.000006,...,0.001141,0.002667,0.002147,0.001782,0.000218,0.002058,0.000103,0.006845,7.1,6.804000
8082,2002,226.0,0.111529,0.000000,0.000000,0.101885,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.087604,0.000372,0.008384,0.000000,0.000000,0.000000,0.000000,7.7,7.860000


In [None]:
# Finalize model
final_model = exp.finalize_model(best_model)

# Save the model
exp.save_model(final_model, model_path / f'netflix_rating_{datetime.datetime.now()}')

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=Memory(location=None),
          steps=[('numerical_imputer',
                  TransformerWrapper(include=['votes', 'genre_stars_0',
                                              'genre_stars_1', 'genre_stars_2',
                                              'genre_stars_3', 'genre_stars_4',
                                              'genre_stars_5', 'genre_stars_6',
                                              'genre_stars_7', 'genre_stars_8',
                                              'genre_stars_9', 'genre_stars_10',
                                              'genre_stars_11', 'genre_stars_12',
                                              'genre_stars_13', 'genre_stars_14',
                                              'ge...
                                              'desc_7', 'desc_8', ...],
                                     transformer=SimpleImputer())),
                 ('categorical_imputer',
                  TransformerWrapper(include=['yea

In [None]:
# Will a classification model work better?
# Convert the ratings into classes
# Approach 1: Use 5 ratings
rating_df['rating_rank'] = pd.cut(rating_df['rating'], bins=[0, 4, 6, 8, 10], labels=['very_low', 'low', 'high', 'very_high'])

# Approach 2: Convert the ratings into 10 classes (round to the nearest integer)
rating_df['rating_rank'] = rating_df['rating'].round().astype(int)

In [None]:
# Check for class imbalance
rating_df.rating_rank.value_counts()

rating_rank
7     2680
8     2336
6     2148
5      789
9      371
4      332
3       92
2       22
10      14
Name: count, dtype: int64

In [None]:
# Perform classification on the netflix dataset
cls_exp = ClassificationExperiment()

cls_exp.setup(
    data=rating_df,
    target='rating_rank',
    ignore_features=['title', 'rating'],
    fix_imbalance=True,
    data_split_stratify=True,
    use_gpu=True
)

[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


Unnamed: 0,Description,Value
0,Session id,4826
1,Target,rating_rank
2,Target type,Multiclass
3,Target mapping,"2: 0, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8"
4,Original data shape,"(8784, 35)"
5,Transformed data shape,"(8784, 33)"
6,Transformed train set shape,"(6148, 33)"
7,Transformed test set shape,"(2636, 33)"
8,Ignore features,2
9,Numeric features,31


[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1
[LightGBM] [Info] Number of positive: 1, number of negative: 1


[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1
[LightGBM] [Fatal] GPU Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_GPU=1
[LightGBM] [Fatal] CUDA Tree Learner was not enabled in this build.
Please recompile with CMake option -DUSE_CUDA=1


<pycaret.classification.oop.ClassificationExperiment at 0x457c5a190>

In [None]:
# Perform model selection
best_cls_model = cls_exp.compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
rf,Random Forest Classifier,0.4859,0.7616,0.4859,0.4811,0.4675,0.3036,0.3071,0.268
lightgbm,Light Gradient Boosting Machine,0.4623,0.7519,0.4623,0.4527,0.4479,0.2756,0.2776,3.007
xgboost,Extreme Gradient Boosting,0.4587,0.7487,0.4587,0.4504,0.4455,0.2716,0.2736,2.083
gbc,Gradient Boosting Classifier,0.4585,0.0,0.4585,0.4519,0.4396,0.2666,0.2704,17.129
et,Extra Trees Classifier,0.4561,0.74,0.4561,0.4442,0.4445,0.2718,0.2732,0.172
dt,Decision Tree Classifier,0.3944,0.6058,0.3944,0.3972,0.3951,0.209,0.2092,0.132
ridge,Ridge Classifier,0.3772,0.0,0.3772,0.3083,0.3386,0.1405,0.1435,0.032
lda,Linear Discriminant Analysis,0.365,0.0,0.365,0.3448,0.342,0.1408,0.1428,0.054
ada,Ada Boost Classifier,0.3154,0.0,0.3154,0.2636,0.2466,0.0593,0.0733,0.445
dummy,Dummy Classifier,0.3051,0.5,0.3051,0.0931,0.1427,0.0,0.0,0.031


In [None]:
# Create a recommender system
# Should we create another NMF on the processed netflix dataset?
# Think about the implications of doing so
# If not, how would we incorporate non-NMF features into the recommender system?

In [None]:
# Using your choice, prepare the NMF features for recommender system
title = netflix.title
selected_nmf = netflix.drop(columns=['title', 'year', 'rating', 'votes'])

# Perform the step necessary for applying cosine similarity
nml = Normalizer()

# Apply the normalizer to the selected_nmf
nmf_recommender = nml.fit_transform(selected_nmf)

In [None]:
# Check how the nmf_recommender looks like in a dataframe
nmf_recommender_df = pd.DataFrame(nmf_recommender, index=netflix.title)
nmf_recommender_df

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Cobra Kai,0.0,0.470195,0.673394,0.563531,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.086918,0.00000,0.006004,0.000000,0.000000,0.004291,0.012402,0.011025,0.002350
The Crown,0.0,0.300284,0.000000,0.000130,0.000000,0.000000,0.000351,0.000000,0.000000,0.000253,...,0.0,0.003279,0.00088,0.007274,0.039026,0.013083,0.023774,0.103075,0.001777,0.017054
Better Call Saul,0.0,0.623197,0.000000,0.000292,0.776015,0.000836,0.000768,0.000626,0.000000,0.000151,...,0.0,0.088782,0.00000,0.002855,0.008206,0.034246,0.000000,0.016197,0.000000,0.000000
Devil in Ohio,0.0,0.361137,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.055290,0.00000,0.000000,0.054755,0.000000,0.064138,0.182437,0.052917,0.020232
Cyberpunk: Edgerunners,0.0,0.000000,0.705261,0.000000,0.000000,0.000488,0.486666,0.000000,0.498965,0.000000,...,0.0,0.116420,0.00000,0.014303,0.000000,0.014349,0.026194,0.035433,0.024317,0.017273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
The Imperfects,0.0,0.490648,0.693977,0.000000,0.000000,0.000258,0.000000,0.000000,0.493588,0.000000,...,0.0,0.158709,0.00000,0.000000,0.027985,0.000000,0.067808,0.043084,0.003204,0.039944
The Walking Dead,0.0,0.387854,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.441679,...,0.0,0.096268,0.00000,0.000000,0.000000,0.000000,0.013463,0.205070,0.000000,0.000000
The Crown,0.0,0.300284,0.000000,0.000130,0.000000,0.000000,0.000351,0.000000,0.000000,0.000253,...,0.0,0.003279,0.00088,0.007274,0.039026,0.013083,0.023774,0.103075,0.001777,0.017054
Supernatural,0.0,0.343899,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.115798,0.00000,0.001514,0.000000,0.000000,0.039135,0.038679,0.017150,0.009586


In [None]:
# Randomly select a title from the netflix dataset
random_title = np.random.choice(title)

# Get the title's profile / embedding
random_title_profile = nmf_recommender_df.loc[random_title]

# and find the top 20 most similar title
# Calculate the cosine similarity between the random title and all other titles
nmf_recommender_df.dot(random_title_profile).nlargest(20)

title
Two Graves                     1.000000
Detak                          0.949180
Sutton's Case                  0.947608
Ombra e il poeta               0.947117
Unbridled                      0.945623
Home Is Where the Killer Is    0.944302
House of the Rising Sun        0.909645
The Plan                       0.901055
Ranbhool                       0.892471
Shattered                      0.890503
Why Me?                        0.889702
Identical                      0.888307
Dangerous Lies                 0.885053
I Came By                      0.884435
I Came By                      0.884435
Tau                            0.883489
A Scandall                     0.879249
Till Death                     0.874333
Beneath the Leaves             0.867744
Rebirth                        0.866803
dtype: float64

In [None]:
# Can you improve the recommendation through the use of other features?

# For each title, get the rating and the votes and present the top 5
# highest rated titles

top_20_index = nmf_recommender_df.dot(random_title_profile).nlargest(20).index
netflix[netflix.title.isin(top_20_index)][['title', 'rating', 'votes']].sort_values(['votes', 'rating'], ascending=False)

Unnamed: 0,title,rating,votes
1654,Tau,5.8,31614.0
770,Till Death,5.8,21221.0
1464,Dangerous Lies,5.3,17192.0
9942,I Came By,6.1,16247.0
15,I Came By,6.1,16230.0
2806,Rebirth,4.9,5435.0
3911,House of the Rising Sun,4.3,3567.0
861,Shattered,5.0,3207.0
4771,Why Me?,7.4,2525.0
4159,Beneath the Leaves,4.5,1538.0
