In [81]:
'''
Importing...
'''
# importing math and numpy
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# importing scipy for stats package
import scipy
# importing random forest classifier
from sklearn import ensemble
# importing some sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import multilabel_confusion_matrix

In [82]:
'''
Function for creating our classes
'''
def create_classes(data, num_class):
    '''
    This function creates classes by splitting the Revenue data into different ranges depending on how
    classes are being requested

    Input: 
        - num_class -> (int) the number of classes we want to split the data into
        - data -> the pandas dataset that we are altering

    Output: The pandas dataset with new classes
    '''
    if num_class == 2:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 50000000, 
        'Revenue Class'] = 0
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] > 50000000, 
        'Revenue Class'] = 1
    else:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 25000000, 'Revenue Class'] = 0
        data.loc[(data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 25000001) & (data['Revenue ( USD, Adjusted for 2024 Inflation)'] < 120000000), 'Revenue Class'] = 1
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 120000001, 'Revenue Class'] = 2

    return data

In [83]:
'''
Plotting the Data
'''
def plot_data(X, X_train, X_test, y_train, y_test, feature1, feature2, rf):
    x_min, x_max = X.iloc[:, 0].min() - 1, X.iloc[:, 0].max() + 1
    y_min, y_max = X.iloc[:, 1].min() - 1, X.iloc[:, 1].max() + 1
    # x_min, x_max = max(x_min, -10), min(x_max, 10)
    # y_max = min(y_max, 10)
    xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1))
    
    Z = rf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    
    cs = plt.contourf(xx, yy, Z, alpha=0.75, cmap=plt.cm.RdYlBu)

    plt.scatter(X_train[:, 0], X_train[:, 1], c=y_train, cmap=plt.cm.RdYlBu, edgecolors='k', marker='o', s=100, label="Train")
    plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=plt.cm.RdYlBu, edgecolors='k', marker='^', s=100, label="Test")
    
    # Step 8: Set plot labels and show
    plt.xlabel(feature1)
    plt.ylabel(feature2)
    plt.title('Random Forest Decision Surface with Top 2 Features')
    plt.legend()
    # plt.show()
    plt.savefig("RF_Decision Surface_plot.png")
    plt.show()
    

In [92]:
'''
Loading in the dataset
'''
def load_data(num_classes):
    '''
    The function loads the dataset, removes rows with N/A values, selects numerical and categorical 
    columns.
    
    Input: None
    
    Output: Train and test datasets
    '''
    previous_data = pd.read_csv('IMDB_MovieListData_Normalized.csv')

    numerical_features = ['Vote Average', 
                          'Vote Count', 
                          'Runtime (mins)', 
                          'Budget (USD, Adjusted for 2024 Inflation)', 
                          'Release Year', 
                          'Popularity', 
                          'Average Rating', 
                          'IMDB Rating', 
                          'Revenue Class']

    # Creating Classes
    previous_data = create_classes(previous_data, num_classes)
    # Select only numerical 
    clean_data = previous_data[numerical_features]
    # Dropping NaN rows
    data = clean_data.dropna()
    

    # Setting Data and Target variables
    X = data.drop(columns=['Revenue Class'])
    y = data['Revenue Class'] 
    
    return X,y

In [98]:
'''
Helper Function for Running Random Forest Classifier
'''
def run_classifier(features, num_classes):
    X, y = load_data(num_classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    classifier = ensemble.RandomForestClassifier()
    classifier.fit(X_train, y_train)
    
    print(f"Accurcy from Random Forests Classifier: {classifier.score(X_test, y_test)}")
    feature_importance = classifier.feature_importances_
    for (feature, importance) in zip(features, feature_importance):
            print(f"{feature}: {importance:.3f}")
    X = X[["Budget (USD, Adjusted for 2024 Inflation)", "Popularity"]]
    
    # plot_data(X, X_train, X_test, y_train, y_test, "Budget (USD, Adjusted for 2024 Inflation)", "Popularity", classifier)

    # print("Calculating the Precision for 2 Classes")
    # y_pred = classifier.predict(X_test)
    # accuracy = accuracy_score(y_test, y_pred)
    # cm = confusion_matrix(y_test, y_pred)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm,
    #                           display_labels=classifier.classes_)
    # disp.plot()
    
    # plt.show()

In [104]:
'''
Running the classifier...
'''
numerical_features = ['Vote Average', 
                      'Vote Count', 
                      'Runtime (mins)', 
                      'Budget (USD, Adjusted for 2024 Inflation)', 
                      'Release Year', 
                      'Popularity', 
                      'Average Rating', 
                      'IMDB Rating'] 

print("This data is based on 2 classes")
run_classifier(numerical_features, 2)
print()
print("This data is based on 3 classes")
run_classifier(numerical_features, 3)

This data is based on 2 classes
Accurcy from Random Forests Classifier: 0.8473767885532592
Vote Average: 0.090
Vote Count: 0.181
Runtime (mins): 0.085
Budget (USD, Adjusted for 2024 Inflation): 0.220
Release Year: 0.108
Popularity: 0.165
Average Rating: 0.080
IMDB Rating: 0.070

This data is based on 3 classes
Accurcy from Random Forests Classifier: 0.7186009538950715
Vote Average: 0.097
Vote Count: 0.165
Runtime (mins): 0.086
Budget (USD, Adjusted for 2024 Inflation): 0.261
Release Year: 0.100
Popularity: 0.139
Average Rating: 0.082
IMDB Rating: 0.070


In [16]:
'''
Running Random Forest Classifier
'''
X, y = load_data()
classifier = ensemble.RandomForestClassifier()
classifier.fit(datasets['X_train'], datasets['y_train'])
print(f"Accurcy from Random Forests Classifier: {classifier.score(datasets['X_test'], datasets['y_test'])}")
feature_importance = classifier.feature_importances_
numerical_features = ['Vote Average', 
                          'Vote Count', 
                          'Runtime (mins)', 
                          'Budget (USD, Adjusted for 2024 Inflation)', 
                          'Release Year', 
                          'Popularity', 
                          'Average Rating', 
                          'IMDB Rating', 
                          'Meta Score']
for (feature, importance) in zip(numerical_features, feature_importance):
    print(f"{feature}: {importance:.3f}")

plot_data(X, y_train, y_test, feature1, feature2, rf):
# print(f"These are the important features from RF:{}")

Accurcy from Random Forests Classifier: 0.8201438848920863
Vote Average: 0.113
Vote Count: 0.124
Runtime (mins): 0.104
Budget (USD, Adjusted for 2024 Inflation): 0.180
Release Year: 0.098
Popularity: 0.130
Average Rating: 0.095
IMDB Rating: 0.061
Meta Score: 0.095


In [None]:
'''
Calculating 
'''
