In [59]:
# First Import the Pandas and Numpy Packages

import pandas as pd
import numpy as np

In [341]:
#Import the imdb dataset and check the first five rows

imdb_source = pd.read_csv(r'C:\Users\MLeif\data_science_practice\datasets\imdb\imdb.csv').set_index('Name')
imdb_source.head()

Unnamed: 0_level_0,Date,Rate,Votes,Genre,Duration,Type,Certificate,Episodes,Nudity,Violence,Profanity,Alcohol,Frightening
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
No Time to Die,2021,7.6,107163,"Action, Adventure, Thriller",163,Film,PG-13,-,Mild,Moderate,Mild,Mild,Moderate
The Guilty,2021,6.3,64375,"Crime, Drama, Thriller",90,Film,R,-,,,Severe,,Moderate
The Many Saints of Newark,2021,6.4,27145,"Crime, Drama",120,Film,R,-,Moderate,Severe,Severe,Moderate,Moderate
Venom: Let There Be Carnage,2021,6.4,30443,"Action, Adventure, Sci-Fi",97,Film,PG-13,-,,Moderate,Moderate,Mild,Moderate
Dune,2021,8.3,84636,"Action, Adventure, Drama",155,Film,PG-13,-,,Moderate,,Mild,Moderate


In [342]:
# Look at the number of Rows and Columns in the Data, to help verify the data later

imdb_source.shape

(6178, 13)

In [343]:
# Check that all of the Data Types Imported Correctly

imdb_source.dtypes

#Duration, Votes and Rate do Not Have the Correct Data Type, so that needs to be fixed

Date            int64
Rate           object
Votes          object
Genre          object
Duration       object
Type           object
Certificate    object
Episodes       object
Nudity         object
Violence       object
Profanity      object
Alcohol        object
Frightening    object
dtype: object

In [344]:
# Thankfully it does not appear we have missing values directly in the source data

imdb_source.isnull().sum()

Date           0
Rate           0
Votes          0
Genre          0
Duration       0
Type           0
Certificate    0
Episodes       0
Nudity         0
Violence       0
Profanity      0
Alcohol        0
Frightening    0
dtype: int64

In [345]:
#The Rate, Episodes and Duration columns use strings for missing values. 
#We need to change these to null so we can change the datatype

print("Total Rate Strings Before: " + str(imdb_source[imdb_source['Rate'] == 'No Rate']['Rate'].count()))
print("Total Duration Strings Before: " + str(imdb_source[imdb_source['Duration'] == 'None']['Duration'].count()))
print("Total Episodes Strings Before: " + str(imdb_source[imdb_source['Episodes'] == '-']['Episodes'].count()))

imdb_source.loc[imdb_source['Rate'] == 'No Rate', 'Rate'] = np.nan
imdb_source.loc[imdb_source['Duration'] == 'None', 'Duration'] = np.nan
imdb_source.loc[imdb_source['Episodes'] == '-', 'Episodes'] = np.nan

print("Total Rate Nulls After: " + str( imdb_source['Rate'].isnull().sum()))
print("Total Duration Nulls After: " + str( imdb_source['Duration'].isnull().sum()))
print("Total Episodes Nulls After: " + str( imdb_source['Episodes'].isnull().sum()))

Total Rate Strings Before: 185
Total Duration Strings Before: 301
Total Episodes Strings Before: 4446
Total Rate Nulls After: 185
Total Duration Nulls After: 301
Total Episodes Nulls After: 4446


In [346]:
#The votes column uses commas in the numbers which means they cannot be turned into numeric variables
#It also uses strings for when the number of votes is 0, so that needs to be changed to 0 as well
print("Total Votes Strings Before: " + str(imdb_source[imdb_source['Votes'] == 'No Votes']['Votes'].count()))

imdb_source.loc[imdb_source['Votes'] == 'No Votes', 'Votes'] = 0
imdb_source['Votes'] = imdb_source['Votes'].str.replace(',','')

print("Total vote Nulls After: " + str(imdb_source['Votes'].isnull().sum()))
print("Sample of Votes:\n" + str(imdb_source['Votes'].sample(5)))


Total Votes Strings Before: 185
Total vote Nulls After: 185
Sample of Votes:
Name
The Fugitive            3696
Rick and Morty        436882
Into the Wild         595966
Soylent Green          61303
RuPaul's Drag Race     17543
Name: Votes, dtype: object


In [354]:
# Set the Columns for Votes, Rate, Duration, and Episodes to floats

imdb_source['Rate'] = imdb_source['Rate'].astype('float64')
imdb_source['Votes'] = imdb_source['Votes'].astype('float64')
imdb_source['Duration'] = imdb_source['Duration'].astype('float64')
imdb_source['Episodes'] = imdb_source['Episodes'].astype('float64')

print(imdb_source[['Rate','Votes','Duration','Episodes']].dtypes)
print(imdb_source[['Rate','Votes','Duration','Episodes']].sample(5))


Rate        float64
Votes       float64
Duration    float64
Episodes    float64
dtype: object
                                         Rate     Votes  Duration  Episodes
Name                                                                       
4400                                      NaN       NaN       NaN       7.0
Friday the 13th Part VII: The New Blood   5.3   35105.0      88.0       NaN
Exam                                      6.8  110417.0     101.0       NaN
Burn Notice                               7.9   70896.0      44.0     111.0
The Division                              NaN       NaN       NaN       NaN


In [358]:
# Recode some Age Ratings Which Represent the same Ratings

null_cert_list = ['(Banned)','Approved','E','None','Not Rated','Passed','Unrated']

print("List Certificate Categories Before: \n" + str(imdb_source['Certificate'].unique()))

imdb_source.loc[imdb_source['Certificate'].isin(['X']), 'Certificate'] = 'NC-17'
imdb_source.loc[imdb_source['Certificate'].isin(['M/PG','M','GP']), 'Certificate'] = 'PG'
imdb_source.loc[imdb_source['Certificate'].isin(['TV-Y7-FV']), 'Certificate'] = 'TV-Y7'

print("List Certificate Categories After: \n" + str(imdb_source['Certificate'].unique()))

List Certificate Categories Before: 
['PG-13' 'R' 'TV-MA' 'None' 'TV-14' 'TV-PG' 'PG' 'TV-G' '(Banned)'
 'Not Rated' 'E' 'NC-17' 'TV-Y7' 'Unrated' 'Approved' 'G' 'TV-Y' 'Passed']
List Certificate Categories After: 
['PG-13' 'R' 'TV-MA' 'None' 'TV-14' 'TV-PG' 'PG' 'TV-G' '(Banned)'
 'Not Rated' 'E' 'NC-17' 'TV-Y7' 'Unrated' 'Approved' 'G' 'TV-Y' 'Passed']


In [359]:
# Here we are adding a new field to change define wether each rating is for TV or Film

def calc_certificate_type(cert):
    
    if cert in null_cert_list:
        return np.nan
    elif 'TV' in cert:
        return 'TV'
    else:
        return 'Film'
    
imdb_source['certificate_type'] = imdb_source['Certificate'].apply(lambda x: calc_certificate_type(x))

print(imdb_source[['certificate_type','Certificate']].drop_duplicates())

                           certificate_type Certificate
Name                                                   
No Time to Die                         Film       PG-13
The Guilty                             Film           R
Ted Lasso                                TV       TV-MA
House of the Dragon                     NaN        None
What If...?                              TV       TV-14
Seinfeld                                 TV       TV-PG
The Addams Family 2                    Film          PG
Ordinary Joe                             TV        TV-G
Family Guy                              NaN    (Banned)
Busanhaeng                              NaN   Not Rated
Luca                                    NaN           E
La vie d'Adèle                         Film       NC-17
Avatar: The Last Airbender               TV       TV-Y7
Requiem for a Dream                     NaN     Unrated
12 Angry Men                            NaN    Approved
Cars                                   Film     

In [360]:
# Define what level of certification the rating is, in order to make the variable Ordinal

def calc_certificate_ordinal(cert_cat):
    
    if (cert_cat == 'G') | (cert_cat == 'TV-Y'):
        return 1
    elif (cert_cat == 'PG') | (cert_cat == 'TV-Y7'):
        return 2
    elif (cert_cat == 'PG-13') | (cert_cat == 'TV-G'):
        return 3
    elif (cert_cat == 'R') | (cert_cat == 'TV-PG'):
        return 4
    elif (cert_cat == 'NC-17') | (cert_cat == 'TV-14'):
        return 5
    elif (cert_cat == 'TV-MA'):
        return 6
    else:
        return np.nan
    
imdb_source['certificate_num'] = imdb_source['Certificate'].apply(lambda x: calc_certificate_ordinal(x))
    
#NOTE: Each of these Levels is not designed to be Equivelent between TV and Film, 
# this and the next UDF are the most efficient way to make the age ratings ordinal

In [361]:
# Change Missing or unclear values in ratings to null
print("Total Rate Strings Before: " + str(imdb_source[imdb_source['Certificate'].isin(null_cert_list)]['Certificate'].count()))

imdb_source.loc[imdb_source['Certificate'].isin(null_cert_list), 'Certificate'] = np.nan

print("Total Rate Nulls After: " + str(imdb_source['Certificate'].isnull().sum()))

Total Rate Strings Before: 810
Total Rate Nulls After: 810


In [362]:
# Recode certification sub-categories to be ordinal

def recode_age_rating_category(cert_cat):
    
    if cert_cat == 'None':
        return 0
    elif cert_cat == 'Mild':
        return 1
    elif cert_cat == 'Moderate':
        return 2
    elif cert_cat == 'Severe':
        return 3
    else:
        return np.nan
    
imdb_source['Nudity'] = imdb_source['Nudity'].apply(lambda x: recode_age_rating_category(x))
imdb_source['Violence'] = imdb_source['Violence'].apply(lambda x: recode_age_rating_category(x))
imdb_source['Profanity'] = imdb_source['Profanity'].apply(lambda x: recode_age_rating_category(x))
imdb_source['Alcohol'] = imdb_source['Alcohol'].apply(lambda x: recode_age_rating_category(x))
imdb_source['Frightening'] = imdb_source['Frightening'].apply(lambda x: recode_age_rating_category(x))


In [364]:
#Trim and Lowercase all type and genre columns to standardize them

imdb_source['Type'] = imdb_source['Type'].str.strip().str.lower()
imdb_source['Genre'] = imdb_source['Genre'].str.strip().str.lower()
