In [1]:
#import libraries
from sklearn import neighbors
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

In [2]:
def load_data2Class():
    '''
    The function loads the dataset, removes rows with N/A values, and selects numerical columns for regression.
    Input: None
    Output: Train and test datasets for regression
    '''
    file_path = 'IMDB_MovieListData_Normalized.csv'
    previous_data = pd.read_csv(file_path)
    previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 50000000, 'Revenue Class'] = 0
    previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] > 50000000, 'Revenue Class'] = 1
   
    # Define numerical features
    numerical_features = [
        'Vote Average',
        'Vote Count',
        'Runtime (mins)',
        'Budget (USD, Adjusted for 2024 Inflation)',
        'Release Year',
        'Popularity',
        'Revenue ( USD, Adjusted for 2024 Inflation)',
        'Revenue Class'
    ]

    # Select only numerical 
    clean_data = previous_data[numerical_features]
    
    #Remove rows with missing values
    data = clean_data.dropna()
    unique1 = data['Revenue Class'].value_counts()
    print(unique1)
    
    # Split features and target
    X = data.drop(columns=['Revenue Class'])
    y = data['Revenue Class']

    # Scale numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = normalize(X)
    #y = normalize(y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

    train_test_sets = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }

    X_train, X_test = train_test_sets['X_train'], train_test_sets['X_test']
    y_train, y_test = train_test_sets['y_train'], train_test_sets['y_test']

    movieClassifier = neighbors.KNeighborsClassifier(metric='euclidean', n_neighbors=1, weights='distance')
    movieClassifier.fit(X_train, y_train)
    print("accuraccy: " + str(movieClassifier.score(X_test, y_test)))
    print('Label 0: Revenue is less than $50 million USD.')
    print('Label 1: Revenue is more than $50 million USD.')
    print('load_data2Class finished')
    print('--------------')
    


In [3]:
def load_data3Class():
    '''
    The function loads the dataset, removes rows with N/A values, and selects numerical columns for regression.
    Input: None
    Output: Train and test datasets for regression
    '''
    file_path = 'IMDB_MovieListData_Normalized.csv'
    previous_data = pd.read_csv(file_path)
    previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 25000000, 'Revenue Class'] = 0
    previous_data.loc[(previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 25000001) & (previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] < 120000000), 'Revenue Class'] = 1
    previous_data.loc[previous_data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 120000001, 'Revenue Class'] = 2
   
    # Define numerical features
    numerical_features = [
        'Vote Average',
        'Vote Count',
        'Runtime (mins)',
        'Budget (USD, Adjusted for 2024 Inflation)',
        'Release Year',
        'Popularity',
        'Revenue ( USD, Adjusted for 2024 Inflation)',
        'Revenue Class'
    ]

    # Select only numerical 
    clean_data = previous_data[numerical_features]
    
    #Remove rows with missing values
    data = clean_data.dropna()
    unique1 = data['Revenue Class'].value_counts()
    print(unique1)
    
    # Split features and target
    X = data.drop(columns=['Revenue Class'])
    y = data['Revenue Class']

    # Scale numerical features
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled = normalize(X)
    #y = normalize(y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3)

    train_test_sets = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }

    X_train, X_test = train_test_sets['X_train'], train_test_sets['X_test']
    y_train, y_test = train_test_sets['y_train'], train_test_sets['y_test']

    movieClassifier = neighbors.KNeighborsClassifier(metric='euclidean', n_neighbors=1, weights='distance')
    movieClassifier.fit(X_train, y_train)
    print('Label 0: Revenue is less than or equal to $25 million USD.')
    print('Label 1: Revenue is between $25 million to $120 million USD.')
    print('Label 2: Revenue is more than $120 million USD.')
    print("accuraccy: " + str(movieClassifier.score(X_test, y_test)))
    print('load_data3Class finished')
    print('--------------')


In [4]:
def normalize(X):
    """
    You will get overflow problems when calculating exponentials if 
    your feature values are too large.  This function adjusts all values to be
    in the range of 0 to 1 for each column.
    """         
    X = X - X.min() # shift range to start at 0
    normalizedX = X/X.max() # divide by possible range of values so max is now 1
    return normalizedX

In [5]:
data2Class = load_data2Class()
print(data2Class)

data3Class = load_data3Class()
print(data3Class)

Revenue Class
1.0    3752
0.0    3488
Name: count, dtype: int64
accuraccy: 0.815377532228361
Label 0: Revenue is less than $50 million USD.
Label 1: Revenue is more than $50 million USD.
load_data2Class finished
--------------
None
Revenue Class
1.0    2474
0.0    2469
2.0    2297
Name: count, dtype: int64
Label 0: Revenue is less than or equal to $25 million USD.
Label 1: Revenue is between $25 million to $120 million USD.
Label 2: Revenue is more than $120 million USD.
accuraccy: 0.6767955801104972
load_data3Class finished
--------------
None
