# Logistic Regression with Principal Component Analysis

In [1]:
#Import libraries
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the source CSV into a DataFrame.
RAW_DATA_CSV = "data_1.csv"
data = pd.read_csv(RAW_DATA_CSV)

# Preview the first five rows as the cell output.
data.head(5)

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,Henry Cejudo,Marlon Moraes,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,2.0,0.0,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0
1,Valentina Shevchenko,Jessica Eye,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,2.0,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0
2,Tony Ferguson,Donald Cerrone,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,3.0,6.0,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0
3,Jimmie Rivera,Petr Yan,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,1.0,0.0,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0
4,Tai Tuivasa,Blagoy Ivanov,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,2.0,0.0,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0


In [3]:
# Encode fighter names as integer IDs. Both corner columns are stacked into a
# single series first, so a given name gets the same ID whether it appears in
# the red or the blue corner.
fighters = pd.concat([data["R_fighter"], data["B_fighter"]], ignore_index=True)
codes, uniques = pd.factorize(fighters)

# The stacked series is the red-corner column followed by the blue-corner
# column, so the two halves of `codes` map straight back onto those columns.
half = len(codes) // 2
red_codes, blue_codes = codes[:half], codes[half:]
data["R_fighter"] = red_codes
data["B_fighter"] = blue_codes

data.head()

Unnamed: 0,R_fighter,B_fighter,Referee,date,location,Winner,title_bout,weight_class,no_of_rounds,B_current_lose_streak,...,R_win_by_KO/TKO,R_win_by_Submission,R_win_by_TKO_Doctor_Stoppage,R_wins,R_Stance,R_Height_cms,R_Reach_cms,R_Weight_lbs,B_age,R_age
0,0,460,Marc Goddard,2019-06-08,"Chicago, Illinois, USA",Red,True,Bantamweight,5,0.0,...,2.0,0.0,0.0,8.0,Orthodox,162.56,162.56,135.0,31.0,32.0
1,1,605,Robert Madrigal,2019-06-08,"Chicago, Illinois, USA",Red,True,Women's Flyweight,5,0.0,...,0.0,2.0,0.0,5.0,Southpaw,165.1,167.64,125.0,32.0,31.0
2,2,272,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Red,False,Lightweight,3,0.0,...,3.0,6.0,1.0,14.0,Orthodox,180.34,193.04,155.0,36.0,35.0
3,3,310,Kevin MacDonald,2019-06-08,"Chicago, Illinois, USA",Blue,False,Bantamweight,3,0.0,...,1.0,0.0,0.0,6.0,Orthodox,162.56,172.72,135.0,26.0,29.0
4,4,137,Dan Miragliotta,2019-06-08,"Chicago, Illinois, USA",Blue,False,Heavyweight,3,0.0,...,2.0,0.0,0.0,3.0,Southpaw,187.96,190.5,264.0,32.0,26.0


In [5]:
# Remove the columns that are not used further (this drops columns, not rows).
data = data.drop(columns=["Referee", "date", "location"])

# One-hot encode the categorical columns that are kept. The weight-class
# indicators use the bare class names as column labels; the stance indicators
# get a corner prefix so the blue and red columns stay distinguishable.
one_hot = pd.get_dummies(data["weight_class"])
one_hot1 = pd.get_dummies(data["B_Stance"], prefix="B_Stance")
one_hot2 = pd.get_dummies(data["R_Stance"], prefix="R_Stance")

# Append the indicator columns, remove the categorical columns they replace,
# and then discard every row that still contains a missing value.
data = (
    pd.concat([data, one_hot, one_hot1, one_hot2], axis=1, sort=False)
    .drop(columns=["weight_class", "B_Stance", "R_Stance"])
    .dropna()
)

# Non-null count per column; after dropna() every count equals the row count.
print(data.count())

data.head()

R_fighter               3355
B_fighter               3355
Winner                  3355
title_bout              3355
no_of_rounds            3355
                        ... 
R_Stance_Open Stance    3355
R_Stance_Orthodox       3355
R_Stance_Sideways       3355
R_Stance_Southpaw       3355
R_Stance_Switch         3355
Length: 163, dtype: int64


Unnamed: 0,R_fighter,B_fighter,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,...,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,0,460,Red,True,5,0.0,4.0,0.0,9.2,6.0,...,0,1,0,0,0,0,1,0,0,0
1,1,605,Red,True,5,0.0,3.0,0.0,14.6,9.1,...,0,1,0,0,0,0,0,0,1,0
2,2,272,Red,False,3,0.0,3.0,0.0,15.354839,11.322581,...,0,1,0,0,0,0,1,0,0,0
3,3,310,Blue,False,3,0.0,4.0,0.0,17.0,14.0,...,0,0,0,0,1,0,1,0,0,0
4,4,137,Blue,False,3,0.0,1.0,0.0,17.0,14.5,...,0,0,0,1,0,0,0,0,1,0


In [13]:
# Distribution of the one-hot encoded categories: for every indicator column,
# print how many rows have it set. The groups are printed in the order
# weight class (one_hot), blue-corner stance (one_hot1), red-corner stance
# (one_hot2), separated by a blank line.
# Note: the counts are taken from the indicator frames themselves, not from
# `data`, so they are unaffected by any row filtering applied to `data`.
for position, indicators in enumerate((one_hot, one_hot1, one_hot2)):
    if position:
        print()
    for label, flags in indicators.items():
        print(label, ": ", (flags != 0).sum())

Bantamweight :  379
Catch Weight :  38
Featherweight :  442
Flyweight :  187
Heavyweight :  507
Light Heavyweight :  502
Lightweight :  989
Middleweight :  725
Open Weight :  92
Welterweight :  969
Women's Bantamweight :  111
Women's Featherweight :  10
Women's Flyweight :  50
Women's Strawweight :  143

B_Stance_Open Stance :  9
B_Stance_Orthodox :  3829
B_Stance_Sideways :  4
B_Stance_Southpaw :  975
B_Stance_Switch :  168

R_Stance_Open Stance :  15
R_Stance_Orthodox :  3807
R_Stance_Sideways :  2
R_Stance_Southpaw :  1036
R_Stance_Switch :  150


In [14]:
# Convert the boolean-like columns to 0/1 integers.
# Winner becomes 1 when the red corner won and 0 otherwise, so draws are
# counted together with blue wins. Per the original author's note, this makes
# the target "did the favorite win".
#
# The dtype guard makes the cell safe to re-run: once Winner is numeric,
# comparing it to "Red" again would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(data["Winner"]):
    data["Winner"] = (data["Winner"] == "Red") * 1
data["title_bout"] = data["title_bout"] * 1

data.head()

Unnamed: 0,R_fighter,B_fighter,Winner,title_bout,no_of_rounds,B_current_lose_streak,B_current_win_streak,B_draw,B_avg_BODY_att,B_avg_BODY_landed,...,B_Stance_Open Stance,B_Stance_Orthodox,B_Stance_Sideways,B_Stance_Southpaw,B_Stance_Switch,R_Stance_Open Stance,R_Stance_Orthodox,R_Stance_Sideways,R_Stance_Southpaw,R_Stance_Switch
0,0,460,1,1,5,0.0,4.0,0.0,9.2,6.0,...,0,1,0,0,0,0,1,0,0,0
1,1,605,1,1,5,0.0,3.0,0.0,14.6,9.1,...,0,1,0,0,0,0,0,0,1,0
2,2,272,1,0,3,0.0,3.0,0.0,15.354839,11.322581,...,0,1,0,0,0,0,1,0,0,0
3,3,310,0,0,3,0.0,4.0,0.0,17.0,14.0,...,0,0,0,0,1,0,1,0,0,0
4,4,137,0,0,3,0.0,1.0,0.0,17.0,14.5,...,0,0,0,1,0,0,0,0,1,0


In [19]:
# List every column with its dtype, one "name: dtype" line per column.
for name, dtype in data.dtypes.items():
    print("{}: {}".format(name, dtype))






R_fighter: int64
B_fighter: int64
Winner: int64
title_bout: int64
no_of_rounds: int64
B_current_lose_streak: float64
B_current_win_streak: float64
B_draw: float64
B_avg_BODY_att: float64
B_avg_BODY_landed: float64
B_avg_CLINCH_att: float64
B_avg_CLINCH_landed: float64
B_avg_DISTANCE_att: float64
B_avg_DISTANCE_landed: float64
B_avg_GROUND_att: float64
B_avg_GROUND_landed: float64
B_avg_HEAD_att: float64
B_avg_HEAD_landed: float64
B_avg_KD: float64
B_avg_LEG_att: float64
B_avg_LEG_landed: float64
B_avg_PASS: float64
B_avg_REV: float64
B_avg_SIG_STR_att: float64
B_avg_SIG_STR_landed: float64
B_avg_SIG_STR_pct: float64
B_avg_SUB_ATT: float64
B_avg_TD_att: float64
B_avg_TD_landed: float64
B_avg_TD_pct: float64
B_avg_TOTAL_STR_att: float64
B_avg_TOTAL_STR_landed: float64
B_longest_win_streak: float64
B_losses: float64
B_avg_opp_BODY_att: float64
B_avg_opp_BODY_landed: float64
B_avg_opp_CLINCH_att: float64
B_avg_opp_CLINCH_landed: float64
B_avg_opp_DISTANCE_att: float64
B_avg_opp_DISTANCE_la

In [None]:
# Load the dataset used by the modelling cells.
PREPROCESSED_CSV = "preprocessed_data.csv"
fight_data = pd.read_csv(PREPROCESSED_CSV)

# Preview the first five rows as the cell output.
fight_data.head(5)

In [None]:
# Build a table without object-dtype columns and without NaN for logistic regression.

# Encode the target and the title-bout flag as 0/1 integers (Winner == 1 means
# the red corner won). The dtype guard keeps this safe when Winner is already
# numeric -- e.g. when the cell is re-run without reloading the CSV -- because
# comparing a numeric column to "Red" would silently turn every label into 0.
if not pd.api.types.is_numeric_dtype(fight_data["Winner"]):
    fight_data["Winner"] = (fight_data["Winner"] == "Red") * 1
fight_data["title_bout"] = fight_data["title_bout"] * 1

#########
# TODO:
# Re-evaluate whether any non-numeric column (e.g. weight class) carries
# value and should be encoded instead of dropped.
#########

# Drop every object-dtype column (e.g. city, location), then replace the
# remaining missing values with 0 so the downstream estimators receive no NaN.
drop_index = fight_data.dtypes.loc[fight_data.dtypes == object].index
cleaned_fight_data = fight_data.drop(drop_index, axis=1).fillna(value=0)

cleaned_fight_data.head()

In [None]:
# Separate the cleaned table into the feature matrix X (every column except
# the target) and the target vector y (the Winner column).
target_column = "Winner"
y = cleaned_fight_data[target_column]
X = cleaned_fight_data.drop(target_column, axis=1)

print("X Shape: {}\ny Shape: {}".format(X.shape, y.shape))


In [None]:
# Baseline: logistic regression on the full (non-PCA) feature set.
logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
original_data_scores = []

# Dedicated, seeded generator so the sampled test sizes and the splits are the
# same on every run, without touching the global `random` state.
baseline_rng = random.Random(42)

# Score the model on 10 random train/test splits. The test fraction of each
# split is drawn from 0.21, 0.22, ..., 0.40.
for _ in range(10):
    size_change = .2 + (baseline_rng.randint(1, 20) / 100)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=size_change, random_state=baseline_rng.randrange(2**32)
    )
    logreg.fit(X_train, y_train)
    original_data_scores.append(logreg.score(X_test, y_test))

# If the solver emits a ConvergenceWarning, the options are discussed at
# https://stackoverflow.com/questions/52670012/convergencewarning-liblinear-failed-to-converge-increase-the-number-of-iterati
# One of them: normalize the data using scikit-learn's StandardScaler.

print("Average of logistic regression run on the original dataset: {}".format(sum(original_data_scores)/len(original_data_scores)))


In [None]:
# Logistic regression on the leading principal components: how does held-out
# accuracy change with the number of components kept (1 to 29)?
PCA_data_scores = []

# Dedicated, seeded generator so the sampled test sizes and the splits are the
# same on every run, without touching the global `random` state.
pca_rng = random.Random(42)

for i in range(1, 30):

    print("Trying {} number of components".format(i))

    scores_per_component = []

    # Average over 30 random splits; the test fraction of each split is drawn
    # from 0.21, 0.22, ..., 0.40.
    for _ in range(30):
        size_change = .2 + (pca_rng.randint(1, 20) / 100)

        raw_train, raw_test, PCA_y_train, PCA_y_test = train_test_split(
            X, y, test_size=size_change, random_state=pca_rng.randrange(2**32)
        )

        # Fit the projection on the training rows only and merely apply it to
        # the test rows. Fitting PCA on all of X before splitting would let
        # the held-out rows influence the components (data leakage).
        # svd_solver="full" keeps the decomposition exact and deterministic.
        pca = PCA(n_components=i, svd_solver="full")
        PCA_X_train = pca.fit_transform(raw_train)
        PCA_X_test = pca.transform(raw_test)

        logreg.fit(PCA_X_train, PCA_y_train)
        scores_per_component.append(logreg.score(PCA_X_test, PCA_y_test))

    PCA_data_scores.append(sum(scores_per_component) / len(scores_per_component))


for n_components, average_score in enumerate(PCA_data_scores, start=1):
    print("Number of components: {}   Average Score: {}".format(n_components, average_score))
    
    

In [None]:
#Plot the cumulative variance to decide how many features should be included
# plt.plot(np.cumsum(pca.explained_variance_ratio_))
# plt.xlabel('number of components')
# plt.ylabel('cumulative explained variance')

In [None]:
# Inspect the baseline (non-PCA) model on the last baseline train/test split.
# `logreg` cannot be used for this: the PCA experiment refits it on principal
# components, so it no longer accepts the original feature columns of X_test
# and predict()/score() would raise. Fit a separate estimator with the same
# settings on the original training features instead.
baseline_logreg = LogisticRegression(solver='lbfgs', max_iter=1000)
baseline_logreg.fit(X_train, y_train)

baseline_predictions = baseline_logreg.predict(X_test)

# Mean accuracy on the held-out rows; shown as the cell output.
baseline_logreg.score(X_test, y_test)

In [None]:
# Predicted labels for the PCA-transformed test split left over from the PCA
# experiment; shown as the cell output.
PCA_predictions = logreg.predict(PCA_X_test)
PCA_predictions



In [None]:
#next steps: automate and compare different levels of components in PCA and how it affects the outcome
#try comparing 