# Logistic Regression with Principal Component Analysis

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
#Read the fight records from the CSV file into a DataFrame
fight_data = pd.read_csv(filepath_or_buffer="preprocessed_data.csv")

#Preview the first five rows as a quick sanity check of the load
fight_data.head(n=5)

Unnamed: 0,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,weight_class_Women's Strawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,Red,True,5,0.0,4.0,0.0,9.2,6.0,0.2,0.0,...,0,0,1,0,0,0,0,1,0,0
1,Red,True,5,0.0,3.0,0.0,14.6,9.1,11.8,7.3,...,0,0,1,0,0,0,0,0,1,0
2,Red,False,3,0.0,3.0,0.0,15.354839,11.322581,6.741935,4.387097,...,0,0,1,0,0,0,0,1,0,0
3,Blue,False,3,0.0,4.0,0.0,17.0,14.0,13.75,11.0,...,0,0,0,0,0,1,0,1,0,0
4,Blue,False,3,0.0,1.0,0.0,17.0,14.5,2.5,2.0,...,0,0,0,0,1,0,0,0,1,0


In [3]:
#Encode the target and the boolean flag as integers, then keep only the columns
#that logistic regression can consume directly.
#
#The encoding is written to a copy (DataFrame.assign) rather than back into
#fight_data. Overwriting "Winner" in place made this cell non-idempotent: a second
#run compared the already-encoded 0/1 values against "Red" and silently turned
#every label into 0.
encoded_fight_data = fight_data.assign(
    Winner=(fight_data["Winner"] == "Red") * 1,    #1 when the value is "Red", 0 otherwise
    title_bout=fight_data["title_bout"] * 1,       #True/False -> 1/0
)


#########
# TODO:
# Re-evaluate to see if there is value in any non-numerical value.  Eg. weight class
# 
#########


#Drop any columns that are still stored with object dtype (e.g. City, Location)
drop_index = encoded_fight_data.dtypes.loc[encoded_fight_data.dtypes == object].index
cleaned_fight_data = encoded_fight_data.drop(drop_index, axis = 1)

#Replace NaN values with 0 so every remaining cell can be used for feature selection
cleaned_fight_data = cleaned_fight_data.fillna(value=0)


cleaned_fight_data.head()

Unnamed: 0,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,B_avg_CLINCH_att,B_avg_CLINCH_landed,...,weight_class_Women's Strawweight,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Southpaw,R_Stance_Switch
0,1,1,5,0.0,4.0,0.0,9.2,6.0,0.2,0.0,...,0,0,1,0,0,0,0,1,0,0
1,1,1,5,0.0,3.0,0.0,14.6,9.1,11.8,7.3,...,0,0,1,0,0,0,0,0,1,0
2,1,0,3,0.0,3.0,0.0,15.354839,11.322581,6.741935,4.387097,...,0,0,1,0,0,0,0,1,0,0
3,0,0,3,0.0,4.0,0.0,17.0,14.0,13.75,11.0,...,0,0,0,0,0,1,0,1,0,0
4,0,0,3,0.0,1.0,0.0,17.0,14.5,2.5,2.0,...,0,0,0,0,1,0,0,0,1,0


In [4]:
#Split the cleaned frame into the target vector (y) and the feature matrix (X)
target_column = "Winner"
y = cleaned_fight_data[target_column]
X = cleaned_fight_data.drop(columns = [target_column])

#Report both shapes so a mismatch in row counts is visible immediately
shape_report = "X Shape: {}\ny Shape: {}".format(X.shape, y.shape)
print(shape_report)


X Shape: (3592, 159)
y Shape: (3592,)


In [7]:
#Baseline the project by running logistic regression on the entire (un-reduced) dataset
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
original_data_scores = []

#A dedicated, seeded generator drives both the test-size draw and the split itself,
#so the baseline average is identical on every Restart & Run All and re-running this
#cell neither depends on nor disturbs the global random state.
baseline_rng = random.Random(42)

#Score logistic regression on the original dataset over 10 random splits, with the
#test size drawn from .21 to .40 in steps of .01
for _ in range(10):
    size_change = .2 + (baseline_rng.randint(1, 20) / 100)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size_change, random_state=baseline_rng.randrange(2**32)
    )
    logreg.fit(X_train, y_train)
    original_data_scores.append(logreg.score(X_test, y_test))

# warning given solution options: 
# https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati
# Normalize the data using scikit-learn's StandardScaler

print("Average of logistic regression run on the original dataset: {}".format(sum(original_data_scores)/len(original_data_scores))) 




Average of logistic regression run on the original dataset: 0.6772493774021172




In [9]:
#Sweep the number of leading principal components fed to logistic regression and
#record the mean test accuracy obtained for each component count.
PCA_data_scores = []

#Seeded generator so the sweep is reproducible and independent of the global random state
pca_rng = random.Random(42)

#The projection does not depend on how many components are kept afterwards, so PCA is
#fitted once with every component and the leading columns are sliced inside the loop
#(previously the identical full PCA was refitted on each of the 29 iterations).
#NOTE(review): PCA is fitted on all rows before the train/test split and on unscaled
#features; consider fitting a StandardScaler + PCA on the training fold only.
pca = PCA(n_components = X.shape[1])
PCA_X_full = pd.DataFrame(pca.fit_transform(X))

for i in range(1, 30):

    #Keep only the first i principal components
    PCA_X = PCA_X_full.iloc[:, :i]
    
    print("Trying {} number of components".format(i))

    scores_per_component = []
    
    #30 random splits per component count, test size drawn from .21 to .40 in steps of .01
    for _ in range(30):
        size_change = .2 + (pca_rng.randint(1, 20) / 100)

        PCA_X_train, PCA_X_test, PCA_y_train, PCA_y_test = train_test_split(
            PCA_X, y, test_size=size_change, random_state=pca_rng.randrange(2**32)
        )
        logreg.fit(PCA_X_train, PCA_y_train)
        scores_per_component.append(logreg.score(PCA_X_test, PCA_y_test))

    PCA_data_scores.append(sum(scores_per_component) / len(scores_per_component))


#Report the averaged score next to its (1-based) component count
for n_components, average_score in enumerate(PCA_data_scores, start=1):
    print("Number of components: {}   Average Score: {}".format(n_components, average_score))
    
    

Trying 1 number of components
Trying 2 number of components
Trying 3 number of components
Trying 4 number of components
Trying 5 number of components
Trying 6 number of components
Trying 7 number of components
Trying 8 number of components
Trying 9 number of components
Trying 10 number of components
Trying 11 number of components
Trying 12 number of components
Trying 13 number of components
Trying 14 number of components
Trying 15 number of components
Trying 16 number of components
Trying 17 number of components
Trying 18 number of components
Trying 19 number of components
Trying 20 number of components
Trying 21 number of components
Trying 22 number of components
Trying 23 number of components
Trying 24 number of components
Trying 25 number of components
Trying 26 number of components
Trying 27 number of components
Trying 28 number of components
Trying 29 number of components
Number of components: 1   Average Score: 0.6563533539496798
Number of components: 2   Average Score: 0.6610937

In [None]:
#Plot the cumulative explained variance to decide how many components should be included.
#Relies on `pca` having been fitted with every component in the PCA sweep cell.
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots()
#x starts at 1 so the axis reads as "number of components kept"
ax.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')
ax.set_title('Cumulative explained variance by number of principal components');

In [None]:
#Predict and score on the last baseline split of the original (non-PCA) features.
#By this point `logreg` was last fitted inside the PCA sweep on a handful of principal
#components, so calling it on X_test (all original columns) raises a feature-count
#ValueError. A dedicated estimator is fitted on the matching baseline training split
#instead, which also leaves `logreg` untouched for the PCA prediction cell.
baseline_logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
baseline_logreg.fit(X_train, y_train)

baseline_predictions = baseline_logreg.predict(X_test)

baseline_logreg.score(X_test, y_test)

In [None]:
#Predict winners for the most recent PCA test split; assumes logreg is still the model
#fitted on the matching PCA training split at the end of the component sweep
pca_test_predictions = logreg.predict(PCA_X_test)
pca_test_predictions



In [None]:
#next steps: automate and compare different levels of components in PCA and how it affects the outcome
#try comparing 