In [1]:
## Import dependencies
import pandas as pd
import numpy as np
import requests 
import matplotlib as plt
import plotly.express as px
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Extract the data

In [2]:
# Read in CSV files as Pandas DataFrames.
netflix1_df = pd.read_csv('../Netflix_Team_1/Resources/netflix_list.csv', low_memory=False)
netflix2_df = pd.read_csv('../Netflix_Team_1/Resources/netflix_titles.csv', low_memory=False)

In [3]:
netflix1_df

Unnamed: 0,imdb_id,title,popular_rank,certificate,startYear,endYear,episodes,runtime,type,orign_country,language,plot,summary,rating,numVotes,genres,isAdult,cast,image_url
0,tt4052886,Lucifer,1,15,2016.0,,93.0,42,tvSeries,United States,English,Lucifer Morningstar has decided he's had enoug...,"Lucifer Morningstar, bored from his sulking li...",8.1,250884.0,"Crime,Drama,Fantasy",0,"['Tom Ellis', 'Lauren German', 'Lesley-Ann Bra...",https://m.media-amazon.com/images/M/MV5BNzY1Yj...
1,tt0993840,Army of the Dead,2,18,2021.0,,,148,movie,United States,English,"Following a zombie outbreak in Las Vegas, a gr...","With the abandoned, walled city of Las Vegas o...",5.8,110780.0,"Action,Crime,Horror",0,"['Dave Bautista', 'Ella Purnell', 'Ana de la R...",https://m.media-amazon.com/images/M/MV5BNGY0Nz...
2,tt7255502,The Kominsky Method,3,18,2018.0,2021.0,22.0,30,tvSeries,United States,English,"An aging actor, who long ago enjoyed a brush w...",Michael Douglas plays an actor who made it big...,8.2,28795.0,"Comedy,Drama",0,"['Michael Douglas', 'Sarah Baker', 'Graham Rog...",https://m.media-amazon.com/images/M/MV5BMzA0YT...
3,tt0108778,Friends,4,13+,1994.0,2004.0,235.0,22,tvSeries,United States,English,Follows the personal and professional lives of...,"Ross Geller, Rachel Green, Monica Geller, Joey...",8.9,861843.0,"Comedy,Romance",0,"['Jennifer Aniston', 'Courteney Cox', 'Lisa Ku...",https://m.media-amazon.com/images/M/MV5BNDVkYj...
4,tt9251798,Ragnarok,5,18,2020.0,,12.0,45,tvSeries,Norway,Norwegian,A small Norwegian town experiencing warm winte...,In the small fictional town of Edda coming of ...,7.5,26606.0,"Action,Drama,Fantasy",0,"['David Stakston', 'Jonas Strand Gravli', 'Her...",https://m.media-amazon.com/images/M/MV5BODM3NT...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7003,tt9777386,Merry Happy Whatever,16543,13,2019.0,,,28,tvEpisode,United States,English,When his daughter arrives home for the holiday...,When his daughter arrives home for the holiday...,6.1,221.0,Comedy,0,"['Dennis Quaid', 'Bridgit Mendler', 'Brent Mor...",https://m.media-amazon.com/images/M/MV5BOTc0ND...
7004,tt9790434,The Morning Show,16569,,2019.0,,,30,tvEpisode,-,-,,-,,,\N,0,"['Bounty Hunters Brothers', 'Bruce Lourie', 'C...",https://m.media-amazon.com/images/G/01/imdb/im...
7005,tt9817600,Losers,16599,16,2019.0,,,\N,tvEpisode,-,-,"The life of Michael Bentt, a champion boxer fo...","The life of Michael Bentt, a champion boxer fo...",8.0,221.0,"Documentary,Sport",0,"['Michael Bentt', 'Harold Perrineau', 'Mickey ...",https://m.media-amazon.com/images/M/MV5BOTY3Zj...
7006,tt9842922,Firefly Lane,16627,18,2021.0,,,51,tvEpisode,United States,English,Kate experiences the force of nature that is T...,Kate experiences the force of nature that is T...,7.3,380.0,"Drama,Romance",0,"['Katherine Heigl', 'Sarah Chalke', 'Ben Lawso...",https://m.media-amazon.com/images/M/MV5BODM3OW...


# Transform the data

In [4]:
#Drop unused Columns
netflix1_df_dropped = netflix1_df.drop(columns=['imdb_id','plot','summary','isAdult','image_url','cast','endYear'])
netflix1_df_dropped.head()

Unnamed: 0,title,popular_rank,certificate,startYear,episodes,runtime,type,orign_country,language,rating,numVotes,genres
0,Lucifer,1,15,2016.0,93.0,42,tvSeries,United States,English,8.1,250884.0,"Crime,Drama,Fantasy"
1,Army of the Dead,2,18,2021.0,,148,movie,United States,English,5.8,110780.0,"Action,Crime,Horror"
2,The Kominsky Method,3,18,2018.0,22.0,30,tvSeries,United States,English,8.2,28795.0,"Comedy,Drama"
3,Friends,4,13+,1994.0,235.0,22,tvSeries,United States,English,8.9,861843.0,"Comedy,Romance"
4,Ragnarok,5,18,2020.0,12.0,45,tvSeries,Norway,Norwegian,7.5,26606.0,"Action,Drama,Fantasy"


In [5]:
#Check for null columns
netflix1_df_dropped.isnull().sum()

title               0
popular_rank        0
certificate      4599
startYear         515
episodes         4704
runtime             2
type                2
orign_country       0
language            0
rating           1008
numVotes         1008
genres              2
dtype: int64

In [6]:
#Count of total number of values in each column
netflix1_df_dropped.count()

title            7008
popular_rank     7008
certificate      2409
startYear        6493
episodes         2304
runtime          7006
type             7006
orign_country    7008
language         7008
rating           6000
numVotes         6000
genres           7006
dtype: int64

In [7]:
#Look at missing data
netflix1_df_dropped[netflix1_df_dropped['runtime'].isnull()]

Unnamed: 0,title,popular_rank,certificate,startYear,episodes,runtime,type,orign_country,language,rating,numVotes,genres
6624,The War Next-Door,9751,,,,,,Mexico,Spanish,,,
6625,Exception,9752,,,,,,-,-,,,


In [8]:
#Look at missing data
netflix1_df_dropped[netflix1_df_dropped['genres'].isnull()]

Unnamed: 0,title,popular_rank,certificate,startYear,episodes,runtime,type,orign_country,language,rating,numVotes,genres
6624,The War Next-Door,9751,,,,,,Mexico,Spanish,,,
6625,Exception,9752,,,,,,-,-,,,


In [9]:
#Look at missing data
netflix1_df_dropped[netflix1_df_dropped['rating'].isnull()]

Unnamed: 0,title,popular_rank,certificate,startYear,episodes,runtime,type,orign_country,language,rating,numVotes,genres
152,Don't Look Up,153,,2021.0,,\N,movie,United States,English,,,Comedy
167,Hotel Transylvania: Transformania,168,,2021.0,,\N,movie,United States,English,,,"Adventure,Animation,Comedy"
173,The Sandman,174,,2021.0,11.0,\N,tvSeries,United States,English,,,"Action,Adventure,Drama"
251,Army of the Dead: Lost Vegas,252,,2021.0,1.0,\N,tvSeries,United States,English,,,"Action,Animation,Crime"
289,The Ice Road,290,PG-13,2021.0,,103,movie,United States,English,,,Thriller
...,...,...,...,...,...,...,...,...,...,...,...,...
6975,Ponysitters Club,16062,,2018.0,,\N,tvEpisode,Canada,English,,,Family
6979,Maya and the Three,16141,,2021.0,,\N,tvEpisode,United States,English,,,"Action,Adventure,Animation"
6984,Cowboy Bebop,16217,,2021.0,,\N,tvEpisode,United States,English,,,"Action,Adventure,Crime"
7001,Pieces of Her,16513,,2021.0,,\N,tvEpisode,United States,English,,,"Drama,Thriller"


In [10]:
#Drop rows with missing data
netflix1_df_dropped_data = netflix1_df_dropped.dropna(subset=['rating','genres','runtime','type'])
netflix1_df_dropped_data 

Unnamed: 0,title,popular_rank,certificate,startYear,episodes,runtime,type,orign_country,language,rating,numVotes,genres
0,Lucifer,1,15,2016.0,93.0,42,tvSeries,United States,English,8.1,250884.0,"Crime,Drama,Fantasy"
1,Army of the Dead,2,18,2021.0,,148,movie,United States,English,5.8,110780.0,"Action,Crime,Horror"
2,The Kominsky Method,3,18,2018.0,22.0,30,tvSeries,United States,English,8.2,28795.0,"Comedy,Drama"
3,Friends,4,13+,1994.0,235.0,22,tvSeries,United States,English,8.9,861843.0,"Comedy,Romance"
4,Ragnarok,5,18,2020.0,12.0,45,tvSeries,Norway,Norwegian,7.5,26606.0,"Action,Drama,Fantasy"
...,...,...,...,...,...,...,...,...,...,...,...,...
7002,Ares,16535,18,2020.0,,32,tvEpisode,Netherlands,Dutch,6.5,191.0,"Drama,Horror,Mystery"
7003,Merry Happy Whatever,16543,13,2019.0,,28,tvEpisode,United States,English,6.1,221.0,Comedy
7005,Losers,16599,16,2019.0,,\N,tvEpisode,-,-,8.0,221.0,"Documentary,Sport"
7006,Firefly Lane,16627,18,2021.0,,51,tvEpisode,United States,English,7.3,380.0,"Drama,Romance"


In [11]:
netflix1_df_dropped_data.value_counts(['certificate'])

certificate
18             613
16             415
13             255
R              226
A              147
UA             110
7              106
U              104
PG-13           92
16+             74
18+             57
PG              53
13+             21
All             19
15+             18
Not Rated       16
12+             12
15              11
G                7
7+               6
(Banned)         3
12               3
Unrated          2
NC-17            1
dtype: int64

In [12]:
netflix1_df_dropped_data.value_counts(['type'])

type        
movie           2517
tvSeries        1880
tvEpisode        639
tvSpecial        366
tvMiniSeries     277
tvMovie          147
short             85
video             83
tvShort            5
videoGame          1
dtype: int64

In [13]:
netflix1_df_dropped_data.value_counts(['language'])

language     
English          3400
Spanish           392
Japanese          366
Korean            281
-                 279
                 ... 
Min Nan             1
Hausa               1
Ibo                 1
Luxembourgish       1
Zulu                1
Length: 70, dtype: int64

In [14]:
netflix2_df

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...
...,...,...,...,...,...,...,...,...,...,...,...,...
8802,s8803,Movie,Zodiac,David Fincher,"Mark Ruffalo, Jake Gyllenhaal, Robert Downey J...",United States,"November 20, 2019",2007,R,158 min,"Cult Movies, Dramas, Thrillers","A political cartoonist, a crime reporter and a..."
8803,s8804,TV Show,Zombie Dumb,,,,"July 1, 2019",2018,TV-Y7,2 Seasons,"Kids' TV, Korean TV Shows, TV Comedies","While living alone in a spooky town, a young g..."
8804,s8805,Movie,Zombieland,Ruben Fleischer,"Jesse Eisenberg, Woody Harrelson, Emma Stone, ...",United States,"November 1, 2019",2009,R,88 min,"Comedies, Horror Movies",Looking to survive in a world taken over by zo...
8805,s8806,Movie,Zoom,Peter Hewitt,"Tim Allen, Courteney Cox, Chevy Chase, Kate Ma...",United States,"January 11, 2020",2006,PG,88 min,"Children & Family Movies, Comedies","Dragged from civilian life, a former superhero..."


In [15]:
#Drop unused Columns
netflix2_df_dropped = netflix2_df.drop(columns=['description','cast','listed_in','show_id'])
netflix2_df_dropped.head()

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,"September 25, 2021",2020,PG-13,90 min
1,TV Show,Blood & Water,,South Africa,"September 24, 2021",2021,TV-MA,2 Seasons
2,TV Show,Ganglands,Julien Leclercq,,"September 24, 2021",2021,TV-MA,1 Season
3,TV Show,Jailbirds New Orleans,,,"September 24, 2021",2021,TV-MA,1 Season
4,TV Show,Kota Factory,,India,"September 24, 2021",2021,TV-MA,2 Seasons


In [16]:
#Check for null columns
netflix2_df_dropped.isnull().sum()

type               0
title              0
director        2634
country          831
date_added        10
release_year       0
rating             4
duration           3
dtype: int64

In [17]:
netflix2_df_dropped[netflix2_df_dropped['duration'].isnull()]

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration
5541,Movie,Louis C.K. 2017,Louis C.K.,United States,"April 4, 2017",2017,74 min,
5794,Movie,Louis C.K.: Hilarious,Louis C.K.,United States,"September 16, 2016",2010,84 min,
5813,Movie,Louis C.K.: Live at the Comedy Store,Louis C.K.,United States,"August 15, 2016",2015,66 min,


In [18]:
netflix2_df_dropped[netflix2_df_dropped['date_added'].isnull()]

Unnamed: 0,type,title,director,country,date_added,release_year,rating,duration
6066,TV Show,A Young Doctor's Notebook and Other Stories,,United Kingdom,,2013,TV-MA,2 Seasons
6174,TV Show,Anthony Bourdain: Parts Unknown,,United States,,2018,TV-PG,5 Seasons
6795,TV Show,Frasier,,United States,,2003,TV-PG,11 Seasons
6806,TV Show,Friends,,United States,,2003,TV-14,10 Seasons
6901,TV Show,Gunslinger Girl,,Japan,,2008,TV-14,2 Seasons
7196,TV Show,Kikoriki,,,,2010,TV-Y,2 Seasons
7254,TV Show,La Familia P. Luche,,United States,,2012,TV-14,3 Seasons
7406,TV Show,Maron,,United States,,2016,TV-MA,4 Seasons
7847,TV Show,Red vs. Blue,,United States,,2015,NR,13 Seasons
8182,TV Show,The Adventures of Figaro Pho,,Australia,,2015,TV-Y7,2 Seasons


In [19]:
netflix2_df_dropped_data = netflix2_df_dropped.dropna(subset=['director','date_added','duration','type'])
netflix2_df_dropped_data.value_counts(['type'])

type   
Movie      5940
TV Show     230
dtype: int64

In [20]:
netflix2_df_dropped_data.rename(columns={"duration":"runtime"}, inplace = True)
netflix2_df_dropped_data

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,type,title,director,country,date_added,release_year,rating,runtime
0,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,"September 25, 2021",2020,PG-13,90 min
2,TV Show,Ganglands,Julien Leclercq,,"September 24, 2021",2021,TV-MA,1 Season
5,TV Show,Midnight Mass,Mike Flanagan,,"September 24, 2021",2021,TV-MA,1 Season
6,Movie,My Little Pony: A New Generation,"Robert Cullen, José Luis Ucha",,"September 24, 2021",2021,PG,91 min
7,Movie,Sankofa,Haile Gerima,"United States, Ghana, Burkina Faso, United Kin...","September 24, 2021",1993,TV-MA,125 min
...,...,...,...,...,...,...,...,...
8801,Movie,Zinzana,Majid Al Ansari,"United Arab Emirates, Jordan","March 9, 2016",2015,TV-MA,96 min
8802,Movie,Zodiac,David Fincher,United States,"November 20, 2019",2007,R,158 min
8804,Movie,Zombieland,Ruben Fleischer,United States,"November 1, 2019",2009,R,88 min
8805,Movie,Zoom,Peter Hewitt,United States,"January 11, 2020",2006,PG,88 min


In [21]:
#Create merged df
netflix1_df_dropped_data.rating=netflix1_df_dropped_data.rating.astype(str)
joined_df = pd.merge(netflix1_df_dropped_data, netflix2_df_dropped_data, how='inner', left_on=['title'], right_on=['title'])
joined_df.drop(columns = ['certificate', 'episodes'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[name] = value


In [22]:
joined_df.drop(columns=['type_y', 'director', 'country', 'date_added', 'release_year', 'runtime_x'], inplace=True)
joined_df.dropna(inplace=True)
joined_df.rename(columns={'runtime_y': 'runtime'}, inplace=True)
joined_df

Unnamed: 0,title,popular_rank,startYear,type_x,orign_country,language,rating_x,numVotes,genres,rating_y,runtime
0,Army of the Dead,2,2021.0,movie,United States,English,5.8,110780.0,"Action,Crime,Horror",R,148 min
1,The Woman in the Window,16,2021.0,movie,United States,English,5.7,46703.0,"Crime,Drama,Mystery",R,102 min
2,Bo Burnham: Inside,17,2021.0,tvSpecial,United States,English,8.9,13128.0,"Comedy,Drama,Music",TV-MA,88 min
3,Blue Miracle,19,2021.0,movie,United States,English,6.7,2962.0,"Adventure,Biography,Drama",TV-PG,97 min
4,The Flash,27,2014.0,tvSeries,United States,English,7.7,314053.0,"Action,Adventure,Drama",TV-14,7 Seasons
...,...,...,...,...,...,...,...,...,...,...,...
1364,Fakkah Fuzz: Almost Banned,6115,2018.0,tvSpecial,Singapore,English,4.2,40.0,Comedy,TV-MA,61 min
1365,Dieter Nuhr: Nuhr in Berlin,6138,2016.0,tvSpecial,Germany,German,4.6,65.0,Comedy,TV-MA,69 min
1366,Fernando Sanjiao: Hombre,6140,2018.0,tvSpecial,Argentina,Spanish,6.3,55.0,Comedy,TV-MA,63 min
1367,Refugee,6156,2016.0,short,-,English,6.5,21.0,"Documentary,Short",TV-PG,24 min


In [23]:
joined_df.value_counts(['type_x'])

type_x      
movie           963
tvSpecial       205
tvSeries         78
short            44
tvMovie          38
tvMiniSeries     27
video            14
dtype: int64

In [24]:
bins = [0 ,1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
joined_df['rating_x'] = joined_df['rating_x'].astype(float)
joined_df['binned'] = pd.cut(joined_df['rating_x'], bins)
#Use regular expressions to cast duration column to ints
def find_number(text):
    num = re.findall(r'[0-9]+', text)
    return " ".join(num)
#cast rating column to good/bad, good = 0, bad = 1
def cast_rating(num):
    if num < 7:
        return 1 
    else:
        return 0
joined_df['rating_x'] = joined_df['rating_x'].apply(lambda x: cast_rating(x))
joined_df['runtime'] = joined_df['runtime'].apply(lambda x: find_number(x))
pd.set_option('max_columns', None)
joined_df

Unnamed: 0,title,popular_rank,startYear,type_x,orign_country,language,rating_x,numVotes,genres,rating_y,runtime,binned
0,Army of the Dead,2,2021.0,movie,United States,English,1,110780.0,"Action,Crime,Horror",R,148,"(5, 6]"
1,The Woman in the Window,16,2021.0,movie,United States,English,1,46703.0,"Crime,Drama,Mystery",R,102,"(5, 6]"
2,Bo Burnham: Inside,17,2021.0,tvSpecial,United States,English,0,13128.0,"Comedy,Drama,Music",TV-MA,88,"(8, 9]"
3,Blue Miracle,19,2021.0,movie,United States,English,1,2962.0,"Adventure,Biography,Drama",TV-PG,97,"(6, 7]"
4,The Flash,27,2014.0,tvSeries,United States,English,0,314053.0,"Action,Adventure,Drama",TV-14,7,"(7, 8]"
...,...,...,...,...,...,...,...,...,...,...,...,...
1364,Fakkah Fuzz: Almost Banned,6115,2018.0,tvSpecial,Singapore,English,1,40.0,Comedy,TV-MA,61,"(4, 5]"
1365,Dieter Nuhr: Nuhr in Berlin,6138,2016.0,tvSpecial,Germany,German,1,65.0,Comedy,TV-MA,69,"(4, 5]"
1366,Fernando Sanjiao: Hombre,6140,2018.0,tvSpecial,Argentina,Spanish,1,55.0,Comedy,TV-MA,63,"(6, 7]"
1367,Refugee,6156,2016.0,short,-,English,1,21.0,"Documentary,Short",TV-PG,24,"(6, 7]"


In [25]:
joined_df_encoded = pd.get_dummies(joined_df, columns = ['binned'])
le = LabelEncoder()
#Cast each column to int/float
joined_df_encoded['popular_rank'] = joined_df_encoded['popular_rank'].str.replace(',', '').apply(pd.to_numeric, errors='coerce')
joined_df_encoded['runtime'] = pd.to_numeric(joined_df_encoded['runtime'], errors='coerce')
joined_df_encoded['title'] = le.fit_transform(joined_df_encoded['title']).astype(float)
joined_df_encoded['type_x'] = le.fit_transform(joined_df_encoded['type_x']).astype(float)
joined_df_encoded['orign_country'] = le.fit_transform(joined_df_encoded['orign_country']).astype(float)
joined_df_encoded['language'] = le.fit_transform(joined_df_encoded['language']).astype(float)
joined_df_encoded['genres'] = le.fit_transform(joined_df_encoded['genres']).astype(float)
joined_df_encoded['rating_y'] = le.fit_transform(joined_df_encoded['rating_y']).astype(float)
joined_df_encoded['runtime'] = joined_df_encoded['runtime'].astype(float)
pd.set_option('max_columns', None)
joined_df_encoded

Unnamed: 0,title,popular_rank,startYear,type_x,orign_country,language,rating_x,numVotes,genres,rating_y,runtime,"binned_(0, 1]","binned_(1, 2]","binned_(2, 3]","binned_(3, 4]","binned_(4, 5]","binned_(5, 6]","binned_(6, 7]","binned_(7, 8]","binned_(8, 9]","binned_(9, 10]"
0,105.0,2,2021.0,0.0,51.0,9.0,1,110780.0,26.0,5.0,148.0,0,0,0,0,0,1,0,0,0,0
1,1234.0,16,2021.0,0.0,51.0,9.0,1,46703.0,143.0,5.0,102.0,0,0,0,0,0,1,0,0,0,0
2,173.0,17,2021.0,5.0,51.0,9.0,0,13128.0,110.0,8.0,88.0,0,0,0,0,0,0,0,0,1,0
3,171.0,19,2021.0,0.0,51.0,9.0,1,2962.0,49.0,9.0,97.0,0,0,0,0,0,0,1,0,0,0
4,1123.0,27,2014.0,4.0,51.0,9.0,0,314053.0,4.0,6.0,7.0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,363.0,6115,2018.0,5.0,41.0,9.0,1,40.0,98.0,8.0,61.0,0,0,0,0,1,0,0,0,0,0
1365,314.0,6138,2016.0,5.0,16.0,14.0,1,65.0,98.0,8.0,69.0,0,0,0,0,1,0,0,0,0,0
1366,375.0,6140,2018.0,5.0,1.0,35.0,1,55.0,98.0,8.0,63.0,0,0,0,0,0,0,1,0,0,0
1367,884.0,6156,2016.0,1.0,0.0,9.0,1,21.0,165.0,9.0,24.0,0,0,0,0,0,0,1,0,0,0


# Create ML RandomForest Model

In [26]:
X = joined_df_encoded.copy()
X = X.drop("rating_x", axis=1)
X.head(10)

Unnamed: 0,title,popular_rank,startYear,type_x,orign_country,language,numVotes,genres,rating_y,runtime,"binned_(0, 1]","binned_(1, 2]","binned_(2, 3]","binned_(3, 4]","binned_(4, 5]","binned_(5, 6]","binned_(6, 7]","binned_(7, 8]","binned_(8, 9]","binned_(9, 10]"
0,105.0,2,2021.0,0.0,51.0,9.0,110780.0,26.0,5.0,148.0,0,0,0,0,0,1,0,0,0,0
1,1234.0,16,2021.0,0.0,51.0,9.0,46703.0,143.0,5.0,102.0,0,0,0,0,0,1,0,0,0,0
2,173.0,17,2021.0,5.0,51.0,9.0,13128.0,110.0,8.0,88.0,0,0,0,0,0,0,0,0,1,0
3,171.0,19,2021.0,0.0,51.0,9.0,2962.0,49.0,9.0,97.0,0,0,0,0,0,0,1,0,0,0
4,1123.0,27,2014.0,4.0,51.0,9.0,314053.0,4.0,6.0,7.0,0,0,0,0,0,0,0,1,0,0
5,1038.0,29,2005.0,4.0,51.0,9.0,399487.0,175.0,6.0,15.0,0,0,0,0,0,0,0,0,1,0
6,440.0,35,2021.0,2.0,51.0,9.0,7747.0,91.0,8.0,1.0,0,0,0,0,0,0,0,1,0,0
7,1241.0,63,2021.0,0.0,51.0,9.0,20762.0,185.0,8.0,121.0,0,0,0,0,0,1,0,0,0,0
8,476.0,73,2011.0,4.0,51.0,9.0,319964.0,143.0,9.0,73.0,0,0,0,0,0,0,0,0,1,0
9,900.0,84,2017.0,4.0,51.0,9.0,124859.0,143.0,6.0,4.0,0,0,0,0,0,0,1,0,0,0


In [28]:
y = joined_df_encoded['rating_x'].values
y[:5]

array([1, 1, 0, 1, 0], dtype=int64)

In [29]:
# Train testing groups
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [30]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(1026, 20)
(343, 20)
(1026,)
(343,)


In [31]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
X_scaler = scaler.fit(X_train)

# Scaling the data.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [32]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [34]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [36]:
# Making predictions using the testing data.
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0,
       0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,

In [37]:
# Calculating the confusion matrix.
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,104,9
Actual 1,0,230


In [38]:
# Calculating the accuracy score.
acc_score = accuracy_score(y_test, predictions)

In [39]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,104,9
Actual 1,0,230


Accuracy Score : 0.9737609329446064
Classification Report
              precision    recall  f1-score   support

           0       1.00      0.92      0.96       113
           1       0.96      1.00      0.98       230

    accuracy                           0.97       343
   macro avg       0.98      0.96      0.97       343
weighted avg       0.97      0.97      0.97       343



In [40]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.02679683, 0.03015725, 0.02269811, 0.01873812, 0.01336034,
       0.01080467, 0.04015744, 0.02945466, 0.01104592, 0.03329256,
       0.        , 0.        , 0.00258267, 0.00788308, 0.03814404,
       0.09688481, 0.09920556, 0.40967628, 0.10693117, 0.00218648])

In [41]:
# We can sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.4096762765730539, 'binned_(7, 8]'),
 (0.10693116897782268, 'binned_(8, 9]'),
 (0.09920556417517684, 'binned_(6, 7]'),
 (0.09688481246630928, 'binned_(5, 6]'),
 (0.0401574415632332, 'numVotes'),
 (0.03814403988212453, 'binned_(4, 5]'),
 (0.033292564259804654, 'runtime'),
 (0.030157253210402513, 'popular_rank'),
 (0.02945466499947716, 'genres'),
 (0.02679682538153356, 'title'),
 (0.02269811478010501, 'startYear'),
 (0.01873811707054984, 'type_x'),
 (0.013360336159584972, 'orign_country'),
 (0.011045916712939263, 'rating_y'),
 (0.010804671646366995, 'language'),
 (0.007883076817259062, 'binned_(3, 4]'),
 (0.002582674140701372, 'binned_(2, 3]'),
 (0.0021864811835550168, 'binned_(9, 10]'),
 (0.0, 'binned_(1, 2]'),
 (0.0, 'binned_(0, 1]')]