In [6]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
import csv
import numpy as np

In [7]:
# Vocabularies of the categorical values seen so far. A value's position in
# its list is the integer code that replaces it in the feature row.
directors = []
actors = []
mpaaRatings = []

X = []  # Feature rows for movies released before 2017; used to train the model.
y = []  # Oscar label (column 1, cast to int) for every row in X.
X_final_testset = []  # Feature rows for the remaining movies (2017 onward), shown on the website.
y_final_testset = []  # Oscar label for every row in X_final_testset.

data = pd.read_csv("../Data/oscars.csv", encoding = "ISO-8859-1")
n = len(data.index)


def encode_category(value, vocabulary):
    """Return the position of ``value`` in ``vocabulary``, appending it first if unseen."""
    if value not in vocabulary:
        vocabulary.append(value)
    return vocabulary.index(value)


# (categorical column, vocabulary it is encoded against, companion column cast to int).
# All three actor columns share one vocabulary, so the same person gets the
# same code whichever slot they appear in.
# NOTE(review): the meaning of the companion columns 8/12/16/20 is not visible
# here -- confirm against the CSV header.
categorical_columns = [
    (7, directors, 8),
    (11, actors, 12),
    (15, actors, 16),
    (19, actors, 20),
    (23, mpaaRatings, None),
]

for row_number in range(n):
    movie = list(data.iloc[row_number])
    year = movie[2]

    for category_col, vocabulary, int_col in categorical_columns:
        # Swap the raw value for its unique numerical code.
        movie[category_col] = encode_category(movie[category_col], vocabulary)
        if int_col is not None:
            movie[int_col] = int(movie[int_col])

    # Columns 0-2 are dropped from the features; column 1 is the label.
    if year < 2017:
        feature_rows, labels = X, y
    else:
        feature_rows, labels = X_final_testset, y_final_testset
    feature_rows.append(movie[3:])
    labels.append(int(movie[1]))

In [8]:
# Standardise every feature, then fit a class-balanced SVC on the full training
# set. probability=True is required for the predict_proba calls further down.
svc = make_pipeline(
    StandardScaler(),
    SVC(C=1000, gamma=0.01, class_weight='balanced', probability=True),
)
svc.fit(X, y)

Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=1000, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.01, kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [9]:
# Hard yes/no (1/0) Oscar predictions for the final test set, printed next to
# the actual labels, one movie per line.
predictions = svc.predict(X_final_testset)
pretty_list = "\n".join(
    "{}\t\t{}".format(predicted, actual)
    for predicted, actual in zip(predictions, y_final_testset)
)
print("Predictions\tActual")
print(pretty_list)

Predictions	Actual
0		0
0		0
0		0
0		0
1		0
0		0
0		0
1		1
0		0
0		1
0		0
0		0
0		0
0		0
1		0
1		0
0		0
1		0
0		0
0		0
0		0
1		1
0		0
0		0
0		0
0		0


In [10]:
# predict_proba returns one column per class; keep only column 1, the
# estimated probability of a movie winning an Oscar (label 1).
class_probabilities = svc.predict_proba(X_final_testset)
proba_predictions = class_probabilities[:, 1]

% of 1 	Actual
0.15	0
0.08	0
0.12	0
0.16	0
0.19	0
0.10	0
0.05	0
0.30	1
0.16	0
0.10	1
0.13	0
0.17	0
0.09	0
0.18	0
0.22	0
0.18	0
0.10	0
0.21	0
0.08	0
0.06	0
0.07	0
0.21	1
0.05	0
0.16	0
0.10	0
0.14	0


In [11]:
# Show each win probability (three decimals) beside the actual label.
pretty_list = "\n".join(
    "{:.3f}\t{}".format(probability, actual)
    for probability, actual in zip(proba_predictions, y_final_testset)
)
print("% of 1 \tActual")
print(pretty_list)

% of 1 	Actual
0.151	0
0.076	0
0.119	0
0.156	0
0.195	0
0.105	0
0.055	0
0.305	1
0.159	0
0.098	1
0.133	0
0.172	0
0.089	0
0.177	0
0.218	0
0.185	0
0.101	0
0.211	0
0.081	0
0.061	0
0.065	0
0.207	1
0.054	0
0.163	0
0.105	0
0.143	0


In [11]:
# Grid search over the SVC hyper-parameters C and gamma.
#
# Each (C, gamma) pair is evaluated on `n_rounds` fresh stratified 80/20 splits
# of (X, y). The per-round weighted F1 scores, their mean and the mean accuracy
# become one CSV row.
#
# Notes:
# * No random_state is passed anywhere, so the scores change from run to run.
# * `data` rebinds the name that previously held the oscars DataFrame; re-run
#   the loading cell before anything that needs the DataFrame again.
# * The loop leaves `svc`, `X_test`, `y_test`, `predictions` and
#   `proba_predictions` bound to the LAST round of the LAST (C, gamma) pair,
#   not to the best-scoring model. Later cells read that state.
n_rounds = 10
c_values = [0.1, 1, 10, 100, 1000, 1100]
gamma_values = [0.01, 0.1, 1, 10, 100]

# Header row; the "Round k" columns are derived from n_rounds so the header can
# never drift out of step with the number of scores written per row.
data = []
data.append(
    ["C", "gamma"]
    + ["Round {}".format(round_number) for round_number in range(1, n_rounds + 1)]
    + ["Avg F1 Score", "Avg Accuracy"]
)

# Iterate over the values themselves rather than hard-coded index ranges, so
# adding or removing a candidate value cannot cause an IndexError or a silently
# skipped setting.
for c_value in c_values:
    for gamma_value in gamma_values:
        data_row = [c_value, gamma_value]
        f1_scores = []
        accuracy = []
        for round_number in range(n_rounds):
            # train_test_split shuffles the dataset and splits it into training
            # and testing sets, keeping the class ratio of y in both parts.
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, shuffle=True, stratify=y
            )

            svc = make_pipeline(
                StandardScaler(),
                SVC(probability=True, class_weight='balanced', C=c_value, gamma=gamma_value),
            )
            svc.fit(X_train, y_train)

            # Yes/no Oscar predictions and the probability of winning an Oscar.
            predictions = svc.predict(X_test)
            proba_predictions = svc.predict_proba(X_test)[:, 1]

            # Score once and reuse the value for both the running list and the CSV row.
            round_f1 = f1_score(y_test, predictions, average='weighted')
            f1_scores.append(round_f1)
            data_row.append(round(round_f1, 3))
            accuracy.append(accuracy_score(y_test, predictions))

        avg_f1 = np.mean(f1_scores)
        avg_accuracy = np.mean(accuracy)
        data_row.append(round(avg_f1, 3))
        data_row.append(round(avg_accuracy, 3))
        data.append(data_row)

# The file is only written, never read back, so plain "w" is sufficient.
# newline='' lets the csv module control line endings itself.
with open("svc_data_parametertuning2sample.csv", "w", newline='') as csv_file:
    csvWriter = csv.writer(csv_file, delimiter=',')
    csvWriter.writerows(data)

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision'

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [44]:
# Hard yes/no (1/0) Oscar predictions for the current X_test split, printed
# next to the actual labels, one movie per line.
predictions = svc.predict(X_test)
pretty_list = "\n".join(
    "{}\t\t{}".format(predicted, actual)
    for predicted, actual in zip(predictions, y_test)
)
print("Predictions\tActual")
print(pretty_list)

Predictions	Actual
0		0
0		0
0		0
0		0
0		1
0		0
0		0
1		1
0		0
0		0
0		0
1		1
1		0
0		0
0		0
0		0
0		0
1		1
1		0
0		0
0		0
1		0
0		0
0		0
1		0


In [45]:
# predict_proba returns one column per class; column 1 is the estimated
# probability of a movie winning an Oscar. Print it (two decimals) beside the
# actual label.
class_probabilities = svc.predict_proba(X_test)
proba_predictions = class_probabilities[:, 1]
pretty_list = "\n".join(
    "{:.2f}\t{}".format(probability, actual)
    for probability, actual in zip(proba_predictions, y_test)
)
print("% of 1 \tActual")
print(pretty_list)

% of 1 	Actual
0.12	0
0.13	0
0.15	0
0.11	0
0.17	1
0.11	0
0.09	0
0.18	1
0.17	0
0.11	0
0.13	0
0.17	1
0.18	0
0.12	0
0.12	0
0.14	0
0.17	0
0.27	1
0.21	0
0.13	0
0.12	0
0.18	0
0.16	0
0.12	0
0.21	0


In [46]:
# Confusion matrix for the current split: rows are the actual labels, columns
# the predicted labels. The accuracy is left as the cell's last expression so
# the notebook displays it.
cm = confusion_matrix(y_true=y_test, y_pred=predictions)
print(cm)
accuracy_score(y_true=y_test, y_pred=predictions)

[[17  4]
 [ 1  3]]


0.8

In [10]:
# Ten candidate values for C, evenly spaced on a log scale from 10**0 to 10**4.
C = np.logspace(start=0, stop=4, num=10)
print(C)

[1.00000000e+00 2.78255940e+00 7.74263683e+00 2.15443469e+01
 5.99484250e+01 1.66810054e+02 4.64158883e+02 1.29154967e+03
 3.59381366e+03 1.00000000e+04]
