# Logisitic Regression

In [30]:
import pandas as pd
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.metrics import r2_score

In [87]:
def load_data():
    '''
    The function loads the dataset, removes rows with N/A values, and selects numerical columns for regression.
    Input: None
    Output: Train and test datasets for regression
    '''
    file_path = 'IMDB_MovieListData_Normalized.csv'
    previous_data = pd.read_csv(file_path)


    # Define numerical features
    numerical_features = [
        'Vote Average',
        'Vote Count',
        'Runtime (mins)',
        'Budget (USD, Adjusted for 2024 Inflation)',
        'Release Year',
        'Popularity',
        'Average Rating',
        'IMDB Rating',
        'Revenue ( USD, Adjusted for 2024 Inflation)'
    ]

    # Select only numerical 
    clean_data = previous_data[numerical_features]
    
    #Remove rows with missing values
    data = clean_data.dropna()

    # Split features and target
    X = data.drop(columns=['Revenue ( USD, Adjusted for 2024 Inflation)'])
    y = data['Revenue ( USD, Adjusted for 2024 Inflation)']

    # Scale numerical features
    scaler = StandardScaler()
    #X_scaled = scaler.fit_transform(X)
    X_scaled = normalize(X)
    y = normalize(y)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

    train_test_sets = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }
    return train_test_sets

In [88]:
def printWeights(features, weights):
    """
    Pretty-print the model weights.
    features contains the name of each feature and weights is an array type of the same length.
    You may need to modify this function depending on your implementation
    """
    if "bias" not in features:
        #print("Assuming the last weight is bias term; modify the printWeights function if this is not true")
        features = list(features)+["bias"]
    if len(weights) != len(features):
        print("ERROR: printWeights() called with non-matching feature and weight vector lengths")
        return
    print("\t%30s %10s" % ("Feature", "Weight"))
    for i in range(len(features)):
        print("\t%30s %10.3f" % (features[i], weights[i]))


In [89]:
def normalize(X):
    """
    You will get overflow problems when calculating exponentials if 
    your feature values are too large.  This function adjusts all values to be
    in the range of 0 to 1 for each column.
    """         
    X = X - X.min() # shift range to start at 0
    normalizedX = X/X.max() # divide by possible range of values so max is now 1
    return normalizedX

In [90]:
data = load_data()
X_train, X_test = data['X_train'], data['X_test']
y_train, y_test = data['y_train'], data['y_test']

sgd = SGDRegressor(max_iter=100,shuffle=False, tol=None, \
                    penalty=None, learning_rate='constant', eta0 = 0.001)

sgd.fit(X_train,y_train)

In [91]:
accuracy = sgd.score(X_test, y_test) # note that we're testing on the training data here...
print("Accuracy from SKlearn SGDClassifier on movie data: %.2f\n" % (accuracy))

weights = list(sgd.coef_) + list(sgd.intercept_)


test_predictions = sgd.predict(X_test)
test_r2 = r2_score(y_test, test_predictions)
print(f"\nTest Set R^2 Score: {test_r2:.4f}")

numerical_features = [
        'Vote Average',
        'Vote Count',
        'Runtime (mins)',
        'Budget (USD, Adjusted for 2024 Inflation)',
        'Release Year',
        'Popularity',
        'Average Rating',
        'IMDB Rating',
    ]

printWeights(numerical_features, weights)

Accuracy from SKlearn SGDClassifier on movie data: 0.54


Test Set R^2 Score: 0.5409
	                       Feature     Weight
	                  Vote Average      0.016
	                    Vote Count      0.240
	                Runtime (mins)      0.028
	Budget (USD, Adjusted for 2024 Inflation)      0.221
	                  Release Year     -0.123
	                    Popularity      0.048
	                Average Rating      0.036
	                   IMDB Rating     -0.001
	                          bias      0.053
