In [5]:
'''
Importing...
'''
# importing math and numpy
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# importing scipy for stats package
import scipy
# importing sklearn's ensemble module (provides RandomForestClassifier)
from sklearn import ensemble
# importing some sklearn tools
from sklearn.model_selection import train_test_split

In [25]:
'''
Function for creating our classes
'''
def create_classes(data, num_class):
    '''
    Label every row with a 'Revenue Class' derived from its inflation-adjusted revenue.

    The bins are contiguous, so every non-missing revenue value receives exactly one class:

        num_class == 2:
            0 -> revenue <= 50,000,000
            1 -> revenue >  50,000,000
        any other value (three classes):
            0 -> revenue <= 25,000,000
            1 -> 25,000,000 < revenue <= 120,000,000
            2 -> revenue >  120,000,000

    Input:
        - data -> the pandas dataset that we are altering; it must contain the column
                  'Revenue ( USD, Adjusted for 2024 Inflation)'
        - num_class -> (int) the number of classes we want to split the data into

    Output: The same pandas dataset (modified in place) with the 'Revenue Class' column set.
            Rows whose revenue is NaN match none of the masks, so they are not assigned a class.
    '''
    revenue = data['Revenue ( USD, Adjusted for 2024 Inflation)']

    if num_class == 2:
        data.loc[revenue <= 50_000_000, 'Revenue Class'] = 0
        data.loc[revenue > 50_000_000, 'Revenue Class'] = 1
    else:
        # The bounds must meet exactly. The previous masks (>= 25000001, < 120000000,
        # >= 120000001) left gaps, so a revenue of exactly 120,000,000 or a fractional
        # value such as 25,000,000.50 was never assigned a class.
        data.loc[revenue <= 25_000_000, 'Revenue Class'] = 0
        data.loc[(revenue > 25_000_000) & (revenue <= 120_000_000), 'Revenue Class'] = 1
        data.loc[revenue > 120_000_000, 'Revenue Class'] = 2

    return data

In [26]:
'''
Loading in the dataset
'''
def load_data(csv_path='IMDB_MovieListData_Normalized.csv', num_class=2, test_size=0.3, random_state=42):
    '''
    Load the movie dataset, derive the 'Revenue Class' target, keep only the numerical feature
    columns, drop incomplete rows and split the result into train and test sets.

    Input:
        - csv_path -> (str) path of the CSV file to read
        - num_class -> (int) number of revenue classes, forwarded to create_classes
        - test_size -> (float) fraction of rows held out for the test set
        - random_state -> (int) seed for the train/test shuffle, so the split is repeatable

    Called with no arguments, the function behaves exactly as it did before these
    parameters existed.

    Output: A dict with the keys 'X_train', 'X_test', 'y_train' and 'y_test'
    '''
    target_column = 'Revenue Class'
    feature_columns = ['Vote Average',
                       'Vote Count',
                       'Runtime (mins)',
                       'Budget (USD, Adjusted for 2024 Inflation)',
                       'Release Year',
                       'Popularity',
                       'Average Rating',
                       'IMDB Rating',
                       'Meta Score']

    raw_data = pd.read_csv(csv_path)

    # Creating Classes (adds the target column to the frame)
    labelled_data = create_classes(raw_data, num_class)

    # Keep only the numerical features plus the target, then drop rows with any NaN.
    # This also removes rows that received no class because their revenue was missing.
    model_data = labelled_data[feature_columns + [target_column]].dropna()

    # Setting Data and Target variables
    X = model_data[feature_columns]
    y = model_data[target_column]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test,
    }

In [27]:
'''
Running Random Forest Classifier
'''
datasets = load_data()

# Seed the forest as well as the split: without random_state the bootstrap samples and
# feature sub-sampling differ on every run, so the reported accuracy is not reproducible.
classifier = ensemble.RandomForestClassifier(random_state=42)
classifier.fit(datasets['X_train'], datasets['y_train'])

# Mean accuracy on the held-out test set; kept as the last expression so the cell displays it.
classifier.score(datasets['X_test'], datasets['y_test'])

0.8201438848920863