In [4]:
'''
Importing...
'''
# importing math and numpy
import math
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
# importing scipy for stats package
import scipy
# importing random forest classifier
from sklearn import ensemble
# importing some sklearn tools
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import validation_curve

In [5]:
'''
Function for creating our classes
'''
def create_classes(data, num_class):
    '''
    This function creates classes by splitting the Revenue data into different ranges depending on how
    classes are being requested

    Input: 
        - num_class -> (int) the number of classes we want to split the data into
        - data -> the pandas dataset that we are altering

    Output: The pandas dataset with new classes
    '''
    if num_class == 2:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 50000000, 
        'Revenue Class'] = 0
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] > 50000000, 
        'Revenue Class'] = 1
    else:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 25000000, 'Revenue Class'] = 0
        data.loc[(data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 25000001) & (data['Revenue ( USD, Adjusted for 2024 Inflation)'] < 120000000), 'Revenue Class'] = 1
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 120000001, 'Revenue Class'] = 2

    return data

In [128]:
'''
Loading in the dataset
'''
def load_data(num_classes):
    '''
    The function loads the dataset, removes rows with N/A values, selects numerical and categorical 
    columns.
    
    Input: None
    
    Output: Train and test datasets
    '''
    previous_data = pd.read_csv('IMDB_MovieListData_Normalized.csv')
    print(f"This is the len before cleaning: {len(previous_data)}")

    numerical_features = ['Vote Average', 
                          'Vote Count', 
                          'Runtime (mins)', 
                          'Budget (USD, Adjusted for 2024 Inflation)', 
                          'Release Year', 
                          'Popularity', 
                          'Average Rating', 
                          'IMDB Rating', 
                          'Revenue Class']

    # Creating Classes
    previous_data = create_classes(previous_data, num_classes)
    print(f"Highest runtime movie info: {previous_data.loc[previous_data['Runtime (mins)'].idxmax()]}")
    print(f"Highest Revenue info: {previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'].idxmax()]}")
    print(f"Lowest Revenue info: {previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'].idxmin()]}")
    print(f"This is the len after the classes: {len(previous_data)}")
    # Select only numerical 
    clean_data = previous_data[numerical_features]
    print(f"This is the len with only numerical features: {len(clean_data)}")
    # Dropping NaN rows
    data = clean_data.dropna()
    print(f"This is the len when we drop rows with na: {len(data)}")
    

    # Setting Data and Target variables
    X = data.drop(columns=['Revenue Class'])
    y = data['Revenue Class'] 
    
    return X,y

In [129]:
import itertools
%matplotlib widget

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [130]:
'''
Helper Function for Running Random Forest Classifier
'''
def run_classifier(features, num_classes):
    X, y = load_data(num_classes)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    classifier = ensemble.RandomForestClassifier()
    classifier.fit(X_train, y_train)
    
    print(f"Accurcy from Random Forests Classifier: {classifier.score(X_test, y_test)}")
    feature_importances = classifier.feature_importances_
    for (feature, importance) in zip(features, feature_importances):
            print(f"{feature}: {importance:.3f}")

    predictions = classifier.predict(X_test)
    cm = confusion_matrix(y_test, predictions)
    # visualize(X, y, classifier, cm, feature_importances, features)
    

    # f_range = [0.01, 0.1, 0.5, 1.0, "sqrt"]
    # random_parameters = {"max_features": f_range}
    # acc_scores = []
    
    # skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)

    # print(f"RUNNING 5-Fold CV on Random Forest Classifier")
    # print("-"*50)
    
    # for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    #     print(f"Fold {i}:")
    #     clf = GridSearchCV(learner, parameters, cv=3)
    #     clf.fit(X[train_index], y[train_index])
    #     print(f"Best parameters: {clf.best_params_}")
    #     print(f"Tuning Set Score: {clf.best_score_:.3f}\n")
    #     score = clf.score(X[test_index], y[test_index])
    #     acc_scores.append(score)
    
    # print("Test Set Results")
    # print("-"*10)
    # print()
    # print(f"Fold, RF, LR")
    # for acc, (r_score, lr_score) in enumerate(zip(r_scores, lr_scores)):
    #     print(f"{acc}, {r_score:.3f}, {lr_score:.3f}")
    # print()
    
    
    # plot_data(X, X_train, X_test, y_train, y_test, "Budget (USD, Adjusted for 2024 Inflation)", "Popularity", classifier)

    # print("Calculating the Precision for 2 Classes")
    # y_pred = classifier.predict(X_test)
    # accuracy = accuracy_score(y_test, y_pred)
    # cm = confusion_matrix(y_test, y_pred)
    # disp = ConfusionMatrixDisplay(confusion_matrix=cm,
    #                           display_labels=classifier.classes_)
    # disp.plot()
    
    # plt.show()

In [131]:
'''
Working on Visualisations
'''
def visualize(X, y, clf, cm, importances, features):
    param_range = [i for i in range(1, 26)]
    # train_scores, test_scores = validation_curve(clf, X, y, param_name="n_estimators", param_range=param_range, cv=5)

    # train_means = np.mean(train_scores, axis=1)
    # test_means = np.mean(test_scores, axis=1)

    # plt.title("RF Validation Curve", fontsize="xx-large")
    # plt.xlabel("Trees",fontsize="large")
    # plt.ylabel("Accuracy",fontsize="large")
    # plt.ylim(0.0, 1.1)
    # plt.xticks(param_range)
    
    # plt.plot(param_range, train_means, label="Training score", color="darkblue")
    for i in range(10):
        train_scores, test_scores = validation_curve(clf, X, y, param_name="n_estimators", param_range=param_range, cv=5)

        train_means = np.mean(train_scores, axis=1)
        test_means = np.mean(test_scores, axis=1)
        print(f"This is the highest accuracy score: {max(test_means)}")
    # plt.plot(param_range, test_means, label="Cross-validation score", color="blue")
    
    # plt.legend(loc="best")
    # plt.savefig("rf_validation curve.png")
    # plt.show()

    # disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
    # disp.plot(cmap="Blues")
    # plt.title("Random Forest Confusion Matrix")
    # plt.show()

    # plt.figure(figsize=(10, 4))
    # plt.barh(features, importances, color="blue")
    # plt.title('Feature Importances')
    # plt.xlim(0, 0.3)
    # plt.xlabel('Relative Importance')
    # plt.show()

In [132]:
'''
Running the classifier...
'''
numerical_features = ['Vote Average', 
                      'Vote Count', 
                      'Runtime (mins)', 
                      'Budget', 
                      'Release Year', 
                      'Popularity', 
                      'Average Rating', 
                      'IMDB Rating'] 

print("This data is based on 2 classes")
run_classifier(numerical_features, 2)
print()
# print("This data is based on 3 classes")
# run_classifier(numerical_features, 3)

This data is based on 2 classes
This is the len before cleaning: 7240
Highest runtime movie info: Title                                                                                 God of War
Vote Average                                                                                10.0
Vote Count                                                                                     1
Runtime (mins)                                                                               999
Budget (USD, Adjusted for 2024 Inflation)                                            100800000.0
Release Year                                                                                2018
Popularity                                                                                   1.4
Production Companies                           Santa Monica Studio, Sony Interactive Entertai...
Production Countries                                                                         NaN
Spoken Languages             

In [96]:
plt.close('all')

In [16]:
'''
Running Random Forest Classifier
'''
X, y = load_data()
classifier = ensemble.RandomForestClassifier()
classifier.fit(datasets['X_train'], datasets['y_train'])
print(f"Accurcy from Random Forests Classifier: {classifier.score(datasets['X_test'], datasets['y_test'])}")
feature_importance = classifier.feature_importances_
numerical_features = ['Vote Average', 
                          'Vote Count', 
                          'Runtime (mins)', 
                          'Budget (USD, Adjusted for 2024 Inflation)', 
                          'Release Year', 
                          'Popularity', 
                          'Average Rating', 
                          'IMDB Rating', 
                          'Meta Score']
for (feature, importance) in zip(numerical_features, feature_importance):
    print(f"{feature}: {importance:.3f}")

plot_data(X, y_train, y_test, feature1, feature2, rf):
# print(f"These are the important features from RF:{}")

Accurcy from Random Forests Classifier: 0.8201438848920863
Vote Average: 0.113
Vote Count: 0.124
Runtime (mins): 0.104
Budget (USD, Adjusted for 2024 Inflation): 0.180
Release Year: 0.098
Popularity: 0.130
Average Rating: 0.095
IMDB Rating: 0.061
Meta Score: 0.095


In [None]:
'''
Calculating 
'''
