In [5]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
import csv

In [6]:
def _encode_label(value, vocabulary, positions):
    """Return the integer id of `value`, registering it the first time it is seen.

    `vocabulary` is the ordered list of distinct values encountered so far and
    `positions` maps each of those values to its index in `vocabulary`. Both are
    updated in place when `value` is new, so an id is simply the order of first
    appearance (0, 1, 2, ...). The dict makes each lookup O(1) instead of
    scanning the list twice per value.
    """
    if value not in positions:
        positions[value] = len(vocabulary)
        vocabulary.append(value)
    return positions[value]


# Vocabularies: distinct values in order of first appearance. A value's id is
# its position in the corresponding list.
directors = []
actors = []       # shared by all three actor columns
mpaaRatings = []
X = [] # This holds the movie data that the model will use to train.
y = [] # This holds whether the movies won an oscar or not that the model will use to train.

# value -> id lookups kept in sync with the lists above by _encode_label.
_director_ids = {}
_actor_ids = {}
_mpaa_ids = {}

# (column position, vocabulary, lookup) for every label column that is
# immediately followed by a column that gets cast to int.
# NOTE(review): the column positions are hard-coded against the CSV layout;
# confirm them against the file's header before changing the data source.
_LABEL_COLUMNS = [
    (7, directors, _director_ids),
    (11, actors, _actor_ids),
    (15, actors, _actor_ids),
    (19, actors, _actor_ids),
]
_MPAA_COLUMN = 23

dummyData = pd.read_csv("../Data/oscars.csv", encoding = "ISO-8859-1")
n = len(dummyData.index)

for i in range(n):
    movie = list(dummyData.iloc[i])

    for column, vocabulary, positions in _LABEL_COLUMNS:
        # Replace the label with its unique numerical value, then force the
        # neighbouring column to a plain int.
        movie[column] = _encode_label(movie[column], vocabulary, positions)
        movie[column + 1] = int(movie[column + 1])

    # The MPAA rating is encoded the same way but has no int column after it.
    movie[_MPAA_COLUMN] = _encode_label(movie[_MPAA_COLUMN], mpaaRatings, _mpaa_ids)

    # NOTE(review): the ids above are arbitrary first-seen ordinals, yet the
    # modelling cell standard-scales every feature as a numeric magnitude;
    # confirm that treating them as ordinal is intended.
    X.append(movie[3:])       # features: everything from column 3 onward
    y.append(int(movie[1]))   # target: column 1 cast to int

In [9]:
# Grid search over (C, penalty) for logistic regression.
# Every combination is scored on `n_rounds` stratified 80/20 splits; the
# per-round F1 scores plus the mean F1 and mean accuracy form one CSV row.
c_values = [1, 10, 100]
penalty_type = ['l1', 'l2']
n_rounds = 10

# A single header row. It used to be appended inside the loop, which produced
# a duplicate header line before every result row in the CSV.
data = [
    ["C", "penalty"]
    + ["Round {}".format(round_no) for round_no in range(1, n_rounds + 1)]
    + ["Avg F1 Score", "Avg Accuracy"]
]

for c_value in c_values:
    for penalty in penalty_type:
        data_row = [c_value, penalty]
        f1_scores = []
        accuracy = []
        for k in range(n_rounds):
            # train_test_split shuffles the dataset and splits it into training and testing sets.
            # Seeding with the round number keeps the rounds distinct but makes the whole table
            # reproducible, and gives every (C, penalty) pair the same ten splits.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, shuffle=True, stratify=y, random_state=k
            )

            # Fit the scaler on the training split only, then apply it to the test split.
            sc = StandardScaler()
            X_train = sc.fit_transform(X_train)
            X_test = sc.transform(X_test)

            # liblinear is named explicitly because it supports both 'l1' and 'l2';
            # newer scikit-learn releases default to lbfgs, which rejects 'l1'.
            classifier = LogisticRegression(
                random_state=0, C=c_value, penalty=penalty, solver='liblinear'
            )
            classifier.fit(X_train, y_train)

            predictions = classifier.predict(X_test) # This predict function produces yes or no Oscar predictions.
            proba_predictions = classifier.predict_proba(X_test)[:,1] # Probability of the positive class; not used below, kept as a notebook variable.

            # F1 = 2TP / (2TP + FP + FN), computed from the 2x2 confusion matrix.
            tn, fp, fn, tp = confusion_matrix(y_test, predictions).ravel()
            f1_score = 2 * tp / (2 * tp + fp + fn)
            f1_scores.append(f1_score)
            data_row.append(round(f1_score, 3))
            accuracy.append(accuracy_score(y_test, predictions))

        avg_f1 = np.mean(f1_scores)
        avg_accuracy = np.mean(accuracy)
        data_row.append(round(avg_f1, 3))
        data_row.append(round(avg_accuracy, 3))
        data.append(data_row)

# NOTE: after the loop, classifier / X_test / y_test / predictions stay bound
# to the last fit (largest C, 'l2', final round).

# newline="" lets the csv module control line endings, which avoids blank
# lines between rows on platforms that translate "\n".
with open("logistic_regression_data.csv", "w", newline="") as csv_file:
    csvWriter = csv.writer(csv_file, delimiter=',')
    csvWriter.writerows(data)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=0, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [12]:
# Print the predicted label next to the actual label for every test sample.
# Uses whatever `classifier`, `X_test` and `y_test` are currently bound to.
predictions = classifier.predict(X_test)

table_rows = []
for predicted_label, actual_label in zip(predictions, y_test):
    table_rows.append("{}\t\t{}".format(predicted_label, actual_label))
pretty_list = "\n".join(table_rows)

print("Predictions\tActual")
print(pretty_list)

Predictions	Actual
0		0
0		0
0		0
1		1
1		0
1		0
0		0
0		0
1		1
0		0
1		1
0		0
0		0
0		0
0		0
0		0
1		0
0		0
0		0
0		0
0		0
0		0
0		0
0		1
0		0


In [13]:
# Print the predicted probability of class 1 (two decimals) next to the actual
# label for every test sample. Column 1 of predict_proba is the second class.
predictions_percent = classifier.predict_proba(X_test)[:,1]

table_rows = []
for probability, actual_label in zip(predictions_percent, y_test):
    table_rows.append("{:.2f}\t{}".format(probability, actual_label))
pretty_list = "\n".join(table_rows)

print("% of 1 \tActual")
print(pretty_list)

% of 1 	Actual
0.05	0
0.19	0
0.11	0
0.66	1
0.57	0
0.69	0
0.14	0
0.18	0
0.53	1
0.00	0
0.59	1
0.03	0
0.17	0
0.18	0
0.01	0
0.13	0
0.65	0
0.40	0
0.09	0
0.10	0
0.09	0
0.11	0
0.18	0
0.15	1
0.04	0


In [14]:
# Confusion matrix for the current test split: rows are the actual classes,
# columns are the predicted classes.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)
# Left as a bare trailing expression so the notebook displays the accuracy as the cell's output.
accuracy_score(y_true=y_test, y_pred=predictions)

[[18  3]
 [ 1  3]]


0.84