# Logisitic Regression

In [4]:
import pandas as pd
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
# from sklearn.model_selection import RepeatedKFold
# from sklearn.metrics import r2_score

In [5]:
'''
Function for creating our classes
'''
def create_classes(data, num_class):
    '''
    This function creates classes by splitting the Revenue data into different ranges depending on how
    classes are being requested

    Input: 
        - num_class -> (int) the number of classes we want to split the data into
        - data -> the pandas dataset that we are altering

    Output: The pandas dataset with new classes
    '''
    if num_class == 2:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 50000000, 
        'Revenue Class'] = 0
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] > 50000000, 
        'Revenue Class'] = 1
    else:
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] <= 25000000, 'Revenue Class'] = 0
        data.loc[(data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 25000001) & (data['Revenue ( USD, Adjusted for 2024 Inflation)'] < 120000000), 'Revenue Class'] = 1
        data.loc[data['Revenue ( USD, Adjusted for 2024 Inflation)'] >= 120000001, 'Revenue Class'] = 2

In [6]:
'''
Loading in the dataset
'''
def load_data(classes):
    '''
    The function loads the dataset, removes rows with N/A values, selects numerical and categorical 
    columns.
    
    Input: None
    
    Output: Train and test datasets
    '''
    file_path = 'IMDB_MovieListData_Normalized.csv'
    previous_data = pd.read_csv(file_path)

    numerical_features = ['Vote Average', 
                          'Vote Count', 
                          'Runtime (mins)', 
                          'Budget (USD, Adjusted for 2024 Inflation)', 
                          'Release Year', 
                          'Popularity', 
                          'Average Rating', 
                          'IMDB Rating', 
                          'Meta Score', 
                          'Revenue Class']

    # Creating Classes
    if classes == 2:
        create_classes(previous_data, 2)
    else:
        create_classes(previous_data, 3)
    # Select only numerical 
    clean_data = previous_data[numerical_features]
    # Dropping NaN rows
    data = clean_data.dropna()
    

    # Setting Data and Target variables
    X = data.drop(columns=['Revenue Class'])
    y = data['Revenue Class'] 

    X = normalize(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    train_test_sets = {
        'X_train': X_train,
        'X_test': X_test,
        'y_train': y_train,
        'y_test': y_test
    }
    return train_test_sets

In [7]:
def printWeights(features, weights):
    """
    Pretty-print the model weights.
    features contains the name of each feature and weights is an array type of the same length.
    You may need to modify this function depending on your implementation
    """
    if "bias" not in features:
        #print("Assuming the last weight is bias term; modify the printWeights function if this is not true")
        features = list(features)+["bias"]
    if len(weights) != len(features):
        print("ERROR: printWeights() called with non-matching feature and weight vector lengths")
        return
    print("\t%30s %10s" % ("Feature", "Weight"))
    for i in range(len(features)):
        print("\t%30s %10.3f" % (features[i], weights[i]))


In [8]:
def normalize(X):
    """
    You will get overflow problems when calculating exponentials if 
    your feature values are too large.  This function adjusts all values to be
    in the range of 0 to 1 for each column.
    """         
    X = X - X.min() # shift range to start at 0
    normalizedX = X/X.max() # divide by possible range of values so max is now 1
    return normalizedX

In [9]:
def fit_model(classes):
    """
    Loads in the data with revenue separated into either two or three classes based on the parameter 
    passed in. Then fits the model and returns it after printing accuracy.
    """

    if classes == 2:
        data = load_data(2)
        X_train, X_test = data['X_train'], data['X_test']
        y_train, y_test = data['y_train'], data['y_test']
        sgd = SGDClassifier(max_iter=100,shuffle=False, tol=None, \
                            penalty=None, learning_rate='constant', eta0 = 0.01)
    else:
        data = load_data(3)
        X_train, X_test = data['X_train'], data['X_test']
        y_train, y_test = data['y_train'], data['y_test']
        sgd = SGDClassifier(max_iter=100,shuffle=False, tol=None, \
                            penalty=None, learning_rate='constant', eta0 = 0.01)
        sgd = OneVsRestClassifier(sgd)
    
    sgd.fit(X_train,y_train)

    accuracy = sgd.score(X_test, y_test)
    print("Accuracy from SKlearn SGDClassifier on movie data: %.2f\n" % (accuracy))

    return sgd

In [10]:
numerical_features = [
            'Vote Average',
            'Vote Count',
            'Runtime (mins)',
            'Budget (USD, Adjusted for 2024 Inflation)',
            'Release Year',
            'Popularity',
            'Average Rating',
            'IMDB Rating',
        ]

print("Logisitic Regression on two classes: ")
sgd = fit_model(2)
weights = sgd.coef_ + list(sgd.intercept_)
printWeights(numerical_features, weights[0])
print()


print("Logisitic Regression on three classes: ")
ovr = fit_model(3)
estimators = ovr.estimators_
for model in estimators:
    weights = model.coef_ + list(model.intercept_)
    printWeights(numerical_features, weights[0])
    print()

Logisitic Regression on two classes: 
Accuracy from SKlearn SGDClassifier on movie data: 0.81

	                       Feature     Weight
	                  Vote Average      1.001
	                    Vote Count      1.254
	                Runtime (mins)      1.054
	Budget (USD, Adjusted for 2024 Inflation)      1.278
	                  Release Year      0.978
	                    Popularity      1.124
	                Average Rating      1.026
	                   IMDB Rating      0.892
	                          bias      1.045

Logisitic Regression on three classes: 
Accuracy from SKlearn SGDClassifier on movie data: 0.63

	                       Feature     Weight
	                  Vote Average     -0.996
	                    Vote Count     -1.088
	                Runtime (mins)     -1.067
	Budget (USD, Adjusted for 2024 Inflation)     -1.076
	                  Release Year     -1.046
	                    Popularity     -1.118
	                Average Rating     -1.069
	          