In [1]:
## Import dependencies
import pandas as pd
import numpy as np
import requests 
import matplotlib.pyplot as plt
import plotly.express as px
import re
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

# Extract the data

In [2]:
# Read in CSV files as Pandas DataFrames.
# low_memory=False has pandas parse each file in one pass, so a column's dtype is
# inferred from all of its rows rather than chunk by chunk.
# NOTE(review): joined.csv is presumably a saved, cleaned version of the merge built
# further down in this notebook - confirm where it is written.
netflix1_df = pd.read_csv('../Netflix_Team_1/Resources/netflix_list.csv', low_memory=False)
netflix2_df = pd.read_csv('../Netflix_Team_1/Resources/netflix_titles.csv', low_memory=False)
netflix3_df = pd.read_csv('../Netflix_Team_1/Resources/joined.csv', low_memory=False)

In [None]:
# Preview the raw frame loaded from netflix_list.csv.
netflix1_df

# Transform the data

In [None]:
# Drop the columns this analysis does not use.
netflix1_unused_columns = ['imdb_id', 'plot', 'summary', 'isAdult', 'image_url', 'cast', 'endYear']
netflix1_df_dropped = netflix1_df.drop(netflix1_unused_columns, axis=1)
netflix1_df_dropped.head()

In [None]:
#Count the missing values in each remaining column
netflix1_df_dropped.isnull().sum()

In [None]:
#Count of non-null values in each column (compare with the null counts above)
netflix1_df_dropped.count()

In [None]:
#Look at the rows whose runtime is missing
netflix1_df_dropped[netflix1_df_dropped['runtime'].isnull()]

In [None]:
#Look at the rows whose genres value is missing
netflix1_df_dropped[netflix1_df_dropped['genres'].isnull()]

In [None]:
#Look at the rows whose rating is missing
netflix1_df_dropped[netflix1_df_dropped['rating'].isnull()]

In [None]:
#Drop rows with missing data
# Only rating, genres, runtime and type are required; nulls in other columns are kept.
netflix1_df_dropped_data = netflix1_df_dropped.dropna(subset=['rating','genres','runtime','type'])
netflix1_df_dropped_data 

In [None]:
# Frequency of each certificate value among the remaining rows.
netflix1_df_dropped_data.value_counts(['certificate'])

In [None]:
# Frequency of each type value among the remaining rows.
netflix1_df_dropped_data.value_counts(['type'])

In [None]:
# Frequency of each language value among the remaining rows.
netflix1_df_dropped_data.value_counts(['language'])

In [None]:
# Preview the raw frame loaded from netflix_titles.csv.
netflix2_df

In [None]:
# Drop the columns this analysis does not use.
netflix2_unused_columns = ['description', 'cast', 'listed_in', 'show_id']
netflix2_df_dropped = netflix2_df.drop(netflix2_unused_columns, axis=1)
netflix2_df_dropped.head()

In [None]:
#Count the missing values in each remaining column
netflix2_df_dropped.isnull().sum()

In [None]:
# Look at the rows whose duration is missing.
netflix2_df_dropped[netflix2_df_dropped['duration'].isnull()]

In [None]:
# Look at the rows whose date_added is missing.
netflix2_df_dropped[netflix2_df_dropped['date_added'].isnull()]

In [None]:
# Keep only rows that have a director, date_added, duration and type, then show how many
# rows of each type remain.
netflix2_df_dropped_data = netflix2_df_dropped.dropna(subset=['director','date_added','duration','type'])
netflix2_df_dropped_data.value_counts(['type'])

In [None]:
# Call the duration column "runtime" so it lines up with the runtime column of the first dataset.
netflix2_df_dropped_data = netflix2_df_dropped_data.rename(columns={"duration": "runtime"})
netflix2_df_dropped_data

In [None]:
#Create merged df
# Cast the netflix_list rating to str on a new frame. Assigning through attribute access onto
# the dropna() result (frame.rating = ...) raises SettingWithCopyWarning on pandas without
# Copy-on-Write and silently mutates a frame that an earlier cell already displayed.
netflix1_df_dropped_data = netflix1_df_dropped_data.assign(
    rating=netflix1_df_dropped_data['rating'].astype(str)
)
# Inner join on title: only titles present in both datasets survive. Columns that exist in
# both inputs (rating, runtime, type) come back with _x (netflix_list) and _y (netflix_titles)
# suffixes.
joined_df = pd.merge(
    netflix1_df_dropped_data,
    netflix2_df_dropped_data,
    how='inner',
    left_on=['title'],
    right_on=['title'],
).drop(columns=['certificate', 'episodes'])

In [None]:
# Trim the merged frame: drop the columns that are not modelled, drop rows with any remaining
# null, and keep the netflix_titles runtime (runtime_y) under the plain name "runtime".
columns_not_modelled = ['popular_rank', 'type_y', 'director', 'country', 'date_added', 'release_year', 'runtime_x']
joined_df = (
    joined_df
    .drop(columns=columns_not_modelled)
    .dropna()
    .rename(columns={'runtime_y': 'runtime'})
)
joined_df

In [None]:
# Frequency of each type_x value in the merged frame.
joined_df.value_counts(['type_x'])

In [None]:
# rating_x is the netflix_list rating, which was cast to str before the merge; convert it
# back to float so it can be compared against the good/bad cut-off below.
joined_df['rating_x'] = joined_df['rating_x'].astype(float)
# (An earlier experiment bucketed rating_x into 0-10 bins with pd.cut; it is no longer used.)
#Use regular expressions to pull the digits out of the duration text
def find_number(text):
    """Return every run of digits found in ``text``, joined by single spaces.

    The result is a string, not a number: "90 min" gives "90", text with several digit
    runs gives them space-separated, and text without digits gives "".
    """
    digit_runs = (match.group() for match in re.finditer(r'[0-9]+', text))
    return " ".join(digit_runs)
#cast rating column to good/bad, good = 0, bad = 1
def cast_rating(num, threshold=7):
    """Turn a numeric rating into a binary label.

    Parameters
    ----------
    num : number
        The rating to classify.
    threshold : number, default 7
        Ratings strictly below this value are labelled bad.

    Returns
    -------
    int
        1 (bad) when ``num < threshold``; otherwise 0 (good). Anything for which the
        comparison is false, NaN included, is therefore labelled 0.
    """
    return 1 if num < threshold else 0
# Replace the numeric rating with its good/bad label and reduce runtime to its digits.
joined_df['rating_x'] = joined_df['rating_x'].apply(cast_rating)
joined_df['runtime'] = joined_df['runtime'].apply(find_number)
# Use the fully qualified option name: on recent pandas the bare 'max_columns' pattern
# matches more than one option and set_option raises OptionError.
pd.set_option('display.max_columns', None)
joined_df

In [None]:
le = LabelEncoder()
#Cast each column to int/float
# runtime holds digit strings at this point; anything that does not parse becomes NaN.
joined_df['runtime'] = pd.to_numeric(joined_df['runtime'], errors='coerce').astype(float)
# Label-encode every text column. The single encoder is refit for each column, so after the
# loop it only remembers the classes of the last one.
for text_column in ['title', 'type_x', 'orign_country', 'language', 'genres', 'rating_y']:
    joined_df[text_column] = le.fit_transform(joined_df[text_column]).astype(float)
# Use the fully qualified option name: on recent pandas the bare 'max_columns' pattern
# matches more than one option and set_option raises OptionError.
pd.set_option('display.max_columns', None)
joined_df

In [3]:
# Show every column when a frame is displayed. The fully qualified name is required: on
# recent pandas the bare 'max_columns' pattern matches more than one option and set_option
# raises OptionError.
pd.set_option('display.max_columns', None)
# Replace every missing value with 0. This also writes the integer 0 into text columns,
# which is why Genre2/Genre3 hold 0 where no genre is listed.
netflix3_df = netflix3_df.fillna(value=0)
netflix3_df

Unnamed: 0,Title,Start Year,Run Time,Type,Origin Country,Language,Rating,Number of Votes,Genre1,Genre2,Genre3,Release Year,Movie Rating
0,Lucifer,2016,42,TV Show,United States,English,8.1,250884,Crime,Drama,Fantasy,2021,teen
1,Army of the Dead,2021,148,Movie,United States,English,5.8,110780,Action,Crime,Horror,2021,adult
2,The Kominsky Method,2018,30,TV Show,United States,English,8.2,28795,Comedy,Drama,0,2021,adult
3,Ragnarok,2020,45,TV Show,Norway,Norwegian,7.5,26606,Action,Drama,Fantasy,2021,adult
4,StartUp,2016,44,TV Show,United States,English,8.0,16980,Crime,Thriller,0,2018,adult
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2221,Fakkah Fuzz: Almost Banned,2018,60,Movie,Singapore,English,4.2,40,Comedy,0,0,2018,adult
2222,Dieter Nuhr: Nuhr in Berlin,2016,69,Movie,Germany,German,4.6,65,Comedy,0,0,2016,adult
2223,Fernando Sanjiao: Hombre,2018,\N,Movie,Argentina,Spanish,6.3,55,Comedy,0,0,2018,adult
2224,Saverio Raimondo: Il Satiro Parlante,2019,53,Movie,Italy,Italian,6.0,42,Comedy,0,0,2019,adult


In [None]:
# Frequency of each value in the Genre1 column.
netflix3_df.value_counts(['Genre1'])

In [None]:
# Frequency of each value in the Genre2 column.
netflix3_df.value_counts(['Genre2'])

In [None]:
# Frequency of each value in the Genre3 column.
netflix3_df.value_counts(['Genre3'])

In [4]:
genres = ["Comedy", "Drama", "Action", "Documentary",
          "Crime", "Adventure", "Biography", "Animation",
          "Reality-TV", "Horror", "Thriller", "Family",
          "Game-Show", "Mystery", "Romance", "Music",
          "Fantasy", "Musical", "Talk-Show", "Sci-Fi", "Sport",
          "History", "Other"]

# Add one indicator column per genre, starting at 0. Writing the constant directly (rather
# than a copy of Genre1 with the known genres replaced by 0) guarantees the columns are
# numeric even if Genre1 contains a value that is not in the list above.
for genre in genres:
    netflix3_df[genre] = 0

# Genre2/Genre3 hold the placeholder 0 where no genre is listed; label it "Other".
# Assign the result back instead of calling replace(..., inplace=True) on the selected
# column, which does not update the frame when pandas Copy-on-Write is active.
netflix3_df["Genre2"] = netflix3_df["Genre2"].replace(0, "Other")
netflix3_df["Genre3"] = netflix3_df["Genre3"].replace(0, "Other")
netflix3_df.head(10)

Unnamed: 0,Title,Start Year,Run Time,Type,Origin Country,Language,Rating,Number of Votes,Genre1,Genre2,Genre3,Release Year,Movie Rating,Comedy,Drama,Action,Documentary,Crime,Adventure,Biography,Animation,Reality-TV,Horror,Thriller,Family,Game-Show,Mystery,Romance,Music,Fantasy,Musical,Talk-Show,Sci-Fi,Sport,History,Other
0,Lucifer,2016,42,TV Show,United States,English,8.1,250884,Crime,Drama,Fantasy,2021,teen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Army of the Dead,2021,148,Movie,United States,English,5.8,110780,Action,Crime,Horror,2021,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Kominsky Method,2018,30,TV Show,United States,English,8.2,28795,Comedy,Drama,Other,2021,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Ragnarok,2020,45,TV Show,Norway,Norwegian,7.5,26606,Action,Drama,Fantasy,2021,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,StartUp,2016,44,TV Show,United States,English,8.0,16980,Crime,Thriller,Other,2018,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
5,Grey's Anatomy,2005,41,TV Show,United States,English,7.5,260703,Drama,Romance,Other,2020,teen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,Sweet Tooth,2021,\N,TV Show,United States,English,8.2,9622,Action,Adventure,Drama,2021,teen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,The Blacklist,2013,43,TV Show,United States,English,8.0,207174,Crime,Drama,Mystery,2019,teen,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
8,Jupiter's Legacy,2021,56,TV Show,United States,English,6.8,27309,Action,Adventure,Drama,2021,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Dirty John,2018,44,TV Show,United States,English,7.2,16578,Crime,Drama,Other,2020,adult,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [5]:
# Genre names; each one is also the name of an indicator column in netflix3_df.
genres = ["Comedy", "Drama", "Action", "Documentary",
                           "Crime", "Adventure", "Biography", "Animation",
                           "Reality-TV", "Horror", "Thriller", "Family",
                           "Game-Show", "Mystery", "Romance", "Music",
                           "Fantasy", "Musical", "Talk-Show", "Sci-Fi", "Sport",
                           "History", "Other"]
# Work on a copy so netflix3_df keeps its Genre1-3 columns and untouched indicator columns.
netflix_copy = netflix3_df.copy()
def _add_genre_indicator(source_column, genre_names):
    """Add 1 to each genre column of ``netflix_copy`` on rows whose ``source_column`` names it."""
    for name in genre_names:
        # Boolean mask of the rows listing this genre; updates the whole column slice at once.
        netflix_copy.loc[netflix_copy[source_column] == name, name] += 1


def cast_genre_values1(genre):
    """Increment, in the global ``netflix_copy``, the indicator column of each row's Genre1.

    Parameters
    ----------
    genre : iterable of str
        Genre names to look for; each must also be a column of ``netflix_copy``.
    """
    _add_genre_indicator("Genre1", genre)


def cast_genre_values2(genre):
    """Increment, in the global ``netflix_copy``, the indicator column of each row's Genre2.

    Parameters
    ----------
    genre : iterable of str
        Genre names to look for; each must also be a column of ``netflix_copy``.
    """
    _add_genre_indicator("Genre2", genre)


def cast_genre_values3(genre):
    """Increment, in the global ``netflix_copy``, the indicator column of each row's Genre3.

    Parameters
    ----------
    genre : iterable of str
        Genre names to look for; each must also be a column of ``netflix_copy``.
    """
    _add_genre_indicator("Genre3", genre)
# Fill the indicator columns from each of the three genre slots in turn.
for fill_from_slot in (cast_genre_values1, cast_genre_values2, cast_genre_values3):
    fill_from_slot(genres)

# The text genre columns are now redundant with the indicator columns.
netflix_copy = netflix_copy.drop(columns=["Genre1", "Genre2", "Genre3"])

netflix_copy.head(10)

            

Unnamed: 0,Title,Start Year,Run Time,Type,Origin Country,Language,Rating,Number of Votes,Release Year,Movie Rating,Comedy,Drama,Action,Documentary,Crime,Adventure,Biography,Animation,Reality-TV,Horror,Thriller,Family,Game-Show,Mystery,Romance,Music,Fantasy,Musical,Talk-Show,Sci-Fi,Sport,History,Other
0,Lucifer,2016,42,TV Show,United States,English,8.1,250884,2021,teen,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,Army of the Dead,2021,148,Movie,United States,English,5.8,110780,2021,adult,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,The Kominsky Method,2018,30,TV Show,United States,English,8.2,28795,2021,adult,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,Ragnarok,2020,45,TV Show,Norway,Norwegian,7.5,26606,2021,adult,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,StartUp,2016,44,TV Show,United States,English,8.0,16980,2018,adult,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,Grey's Anatomy,2005,41,TV Show,United States,English,7.5,260703,2020,teen,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
6,Sweet Tooth,2021,\N,TV Show,United States,English,8.2,9622,2021,teen,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,The Blacklist,2013,43,TV Show,United States,English,8.0,207174,2019,teen,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,Jupiter's Legacy,2021,56,TV Show,United States,English,6.8,27309,2021,adult,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,Dirty John,2018,44,TV Show,United States,English,7.2,16578,2020,adult,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1


In [6]:
#Cast Rating column to integer, 1 = bad, 0 = good
def cast_rating(num, threshold=7):
    """Turn a numeric rating into a binary label.

    Parameters
    ----------
    num : number
        The rating to classify.
    threshold : number, default 7
        Ratings strictly below this value are labelled bad.

    Returns
    -------
    int
        1 (bad) when ``num < threshold``; otherwise 0 (good). Anything for which the
        comparison is false, NaN included, is therefore labelled 0.
    """
    return 1 if num < threshold else 0

le = LabelEncoder()
#Cast each column to int/float
# NOTE: this cell overwrites netflix_copy in place. Running it a second time without first
# rebuilding netflix_copy would feed the 0/1 labels back through cast_rating and label
# every row 1.
# Numeric columns: values that do not parse (e.g. the "\N" run times) become NaN.
for numeric_column in ['Start Year', 'Release Year', 'Run Time']:
    netflix_copy[numeric_column] = pd.to_numeric(netflix_copy[numeric_column], errors='coerce')
# Text columns: label-encode, refitting the one encoder for each column.
for text_column in ['Title', 'Type', 'Origin Country', 'Language', 'Movie Rating']:
    netflix_copy[text_column] = le.fit_transform(netflix_copy[text_column]).astype(float)
# Target: binary good/bad label derived from the numeric rating.
netflix_copy["Rating"] = netflix_copy["Rating"].apply(cast_rating)
# Drop the rows left with NaN by the numeric coercion above.
netflix_copy.dropna(inplace=True)
netflix_copy.head(10)

Unnamed: 0,Title,Start Year,Run Time,Type,Origin Country,Language,Rating,Number of Votes,Release Year,Movie Rating,Comedy,Drama,Action,Documentary,Crime,Adventure,Biography,Animation,Reality-TV,Horror,Thriller,Family,Game-Show,Mystery,Romance,Music,Fantasy,Musical,Talk-Show,Sci-Fi,Sport,History,Other
0,1052.0,2016,42.0,1.0,57.0,10.0,0,250884,2021,2.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,148.0,2021,148.0,0.0,57.0,10.0,1,110780,2021,0.0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1864.0,2018,30.0,1.0,57.0,10.0,0,28795,2021,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1390.0,2020,45.0,1.0,38.0,35.0,0,26606,2021,0.0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1636.0,2016,44.0,1.0,57.0,10.0,0,16980,2018,0.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,704.0,2005,41.0,1.0,57.0,10.0,0,260703,2020,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
7,1739.0,2013,43.0,1.0,57.0,10.0,0,207174,2019,2.0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,925.0,2021,56.0,1.0,57.0,10.0,1,27309,2021,0.0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,501.0,2018,44.0,1.0,57.0,10.0,0,16578,2020,0.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,1982.0,2010,44.0,1.0,57.0,10.0,0,873752,2019,0.0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0


# Create ML RandomForest Model

In [7]:
# Feature matrix: every column of netflix_copy except the target. drop() returns a new
# frame, so netflix_copy itself is left unchanged.
X = netflix_copy.drop(columns=["Rating"])
X.head(10)

Unnamed: 0,Title,Start Year,Run Time,Type,Origin Country,Language,Number of Votes,Release Year,Movie Rating,Comedy,Drama,Action,Documentary,Crime,Adventure,Biography,Animation,Reality-TV,Horror,Thriller,Family,Game-Show,Mystery,Romance,Music,Fantasy,Musical,Talk-Show,Sci-Fi,Sport,History,Other
0,1052.0,2016,42.0,1.0,57.0,10.0,250884,2021,2.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
1,148.0,2021,148.0,0.0,57.0,10.0,110780,2021,0.0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2,1864.0,2018,30.0,1.0,57.0,10.0,28795,2021,0.0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1390.0,2020,45.0,1.0,38.0,35.0,26606,2021,0.0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0
4,1636.0,2016,44.0,1.0,57.0,10.0,16980,2018,0.0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,704.0,2005,41.0,1.0,57.0,10.0,260703,2020,2.0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1
7,1739.0,2013,43.0,1.0,57.0,10.0,207174,2019,2.0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0
8,925.0,2021,56.0,1.0,57.0,10.0,27309,2021,0.0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,501.0,2018,44.0,1.0,57.0,10.0,16578,2020,0.0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
10,1982.0,2010,44.0,1.0,57.0,10.0,873752,2019,0.0,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Target vector: the binary Rating label (1 = bad, 0 = good) as a NumPy array.
y = netflix_copy['Rating'].to_numpy()
y[:5]

array([0, 1, 0, 0, 0], dtype=int64)

In [11]:
# Train testing groups
# random_state=1 fixes the shuffle so the split is reproducible. No test_size is given;
# the shapes printed in the next cell show roughly a 75/25 train/test split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [12]:
# Sanity-check the split sizes: features and labels must have matching row counts.
for split_part in (X_train, X_test, y_train, y_test):
    print(split_part.shape)

(1486, 32)
(496, 32)
(1486,)
(496,)


In [13]:
# Creating a StandardScaler instance.
scaler = StandardScaler()
# Fitting the Standard Scaler with the training data.
# Only the training split is used for fitting, so the test rows do not influence the
# scaling statistics.
X_scaler = scaler.fit(X_train)

# Scaling the data.
# Both splits are transformed with the statistics learned from X_train.
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [14]:
# Create a random forest classifier.
# 128 trees; random_state=1 makes the fitted forest reproducible between runs.
rf_model = RandomForestClassifier(n_estimators=128, random_state=1)

In [15]:
# Fitting the model
# Train on the scaled training features and their good/bad labels.
rf_model = rf_model.fit(X_train_scaled, y_train)

In [16]:
# Making predictions using the testing data.
# The test features must be the scaled ones, matching what the model was trained on.
predictions = rf_model.predict(X_test_scaled)
predictions

array([1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1,
       0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0,
       0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0,
       1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0,
       1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1,

In [17]:
# Calculating the confusion matrix.
# Label 0 = good (rating of 7 or more), label 1 = bad (rating below 7).
cm = confusion_matrix(y_test, predictions)

# Create a DataFrame from the confusion matrix.
# Rows are the true labels and columns the predicted labels.
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

cm_df

Unnamed: 0,Predicted 0,Predicted 1
Actual 0,152,66
Actual 1,55,223


In [18]:
# Calculating the accuracy score.
# Fraction of test rows whose predicted label equals the true label.
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
# display() is the notebook's rich renderer, used here so the confusion matrix shows as a
# table between the printed lines.
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,152,66
Actual 1,55,223


Accuracy Score : 0.7560483870967742
Classification Report
              precision    recall  f1-score   support

           0       0.73      0.70      0.72       218
           1       0.77      0.80      0.79       278

    accuracy                           0.76       496
   macro avg       0.75      0.75      0.75       496
weighted avg       0.76      0.76      0.76       496



In [20]:
# Calculate feature importance in the Random Forest model.
# One value per feature, in the same order as the columns of X.
importances = rf_model.feature_importances_
importances

array([0.10689633, 0.0756927 , 0.16048386, 0.07014112, 0.04907467,
       0.03646665, 0.1857171 , 0.0631921 , 0.02752276, 0.01961036,
       0.01707992, 0.01231313, 0.03980333, 0.00908064, 0.00773667,
       0.00886167, 0.00668609, 0.00418376, 0.01029559, 0.01025153,
       0.00787608, 0.00157791, 0.00742432, 0.00875981, 0.00728953,
       0.0040628 , 0.00138629, 0.00100254, 0.00484068, 0.00309324,
       0.00505525, 0.02654158])

In [21]:
# We can sort the features by their importance.
# Pair each importance with its column name and list the pairs from most to least important.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1857171020913328, 'Number of Votes'),
 (0.16048386358375483, 'Run Time'),
 (0.10689633031738668, 'Title'),
 (0.0756927016344955, 'Start Year'),
 (0.07014111733085752, 'Type'),
 (0.06319210200260922, 'Release Year'),
 (0.04907466836911393, 'Origin Country'),
 (0.039803331458303566, 'Documentary'),
 (0.03646664741025341, 'Language'),
 (0.027522759343774697, 'Movie Rating'),
 (0.026541583172221737, 'Other'),
 (0.019610360827460573, 'Comedy'),
 (0.017079918728332345, 'Drama'),
 (0.012313125460994112, 'Action'),
 (0.010295588345049704, 'Horror'),
 (0.010251534701345474, 'Thriller'),
 (0.00908064214213677, 'Crime'),
 (0.00886166873641696, 'Biography'),
 (0.008759807543739886, 'Romance'),
 (0.00787607677013907, 'Family'),
 (0.007736666253305888, 'Adventure'),
 (0.007424320613335656, 'Mystery'),
 (0.007289526560953442, 'Music'),
 (0.006686093292048161, 'Animation'),
 (0.005055246285049886, 'History'),
 (0.0048406790228954395, 'Sci-Fi'),
 (0.0041837635923769505, 'Reality-TV'),
 (0.004062796