In [1]:
from cleanser import *

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [3]:
import tensorflow as tf
#tf.debugging.set_log_device_placement(True)

In [4]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 14043170608732785865
, name: "/device:GPU:0"
device_type: "GPU"
memory_limit: 303014378
locality {
  bus_id: 1
  links {
  }
}
incarnation: 10847689528824014132
physical_device_desc: "device: 0, name: GeForce GTX 1650, pci bus id: 0000:01:00.0, compute capability: 7.5"
]


In [5]:
#from keras import backend as K
#K.tensorflow_backend._get_available_gpus()

In [6]:
physical_devices = tf.config.list_physical_devices('GPU')

In [7]:
physical_devices

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [8]:
#gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9)

#tf.Session(config=tf.ConfigProto(gpu_options=gpu_options,allow_soft_placement=True))

In [9]:
#import os
#os.environ["CUDA_VISIBLE_DEVICES"] = "-1" 
#os.environ["CUDA_VISIBLE_DEVICES"]

In [10]:
cleanser = RawFightsCleanser()

In [11]:
fights = cleanser.load_and_cleanse('raw_total_fight_data.csv', sep=';')

In [12]:
# check and see if the expanded cleaning gives us the same as before; even do logistic regression and see if we get the same thing

In [13]:
fights.head()

Unnamed: 0,r_fighter,b_fighter,r_kd,b_kd,r_sig_str,b_sig_str,r_sig_str_pct,b_sig_str_pct,r_total_str,b_total_str,...,b_ground_att,r_ground_suc,b_ground_suc,r_ground_ratio,b_ground_ratio,r_ground_pct,b_ground_pct,ground_diff,loser,r_b_winner
0,Henry Cejudo,Marlon Moraes,0,0,90 of 171,57 of 119,52,47,99 of 182,59 of 121,...,1,26,1,0.866667,1.0,86,100,25,Marlon Moraes,r
1,Valentina Shevchenko,Jessica Eye,1,0,8 of 11,2 of 12,72,16,37 of 40,42 of 52,...,0,1,0,1.0,0.0,100,0,1,Jessica Eye,r
2,Tony Ferguson,Donald Cerrone,0,0,104 of 200,68 of 185,52,36,104 of 200,68 of 185,...,0,0,0,0.0,0.0,0,0,0,Donald Cerrone,r
3,Jimmie Rivera,Petr Yan,0,2,73 of 192,56 of 189,38,29,76 of 195,58 of 192,...,10,4,4,1.0,0.4,100,40,0,Jimmie Rivera,b
4,Tai Tuivasa,Blagoy Ivanov,0,1,64 of 144,73 of 123,44,59,66 of 146,81 of 131,...,6,0,6,0.0,1.0,0,100,-6,Tai Tuivasa,b


In [14]:
fights.columns

Index(['r_fighter', 'b_fighter', 'r_kd', 'b_kd', 'r_sig_str', 'b_sig_str',
       'r_sig_str_pct', 'b_sig_str_pct', 'r_total_str', 'b_total_str',
       ...
       'b_ground_att', 'r_ground_suc', 'b_ground_suc', 'r_ground_ratio',
       'b_ground_ratio', 'r_ground_pct', 'b_ground_pct', 'ground_diff',
       'loser', 'r_b_winner'],
      dtype='object', length=120)

In [15]:
fights.head()[['r_fighter', 'b_fighter', 'winner', 'r_b_winner']]

Unnamed: 0,r_fighter,b_fighter,winner,r_b_winner
0,Henry Cejudo,Marlon Moraes,Henry Cejudo,r
1,Valentina Shevchenko,Jessica Eye,Valentina Shevchenko,r
2,Tony Ferguson,Donald Cerrone,Tony Ferguson,r
3,Jimmie Rivera,Petr Yan,Petr Yan,b
4,Tai Tuivasa,Blagoy Ivanov,Blagoy Ivanov,b


In [16]:
diff_columns = [c for c in fights.columns if '_diff' in c]
diff_columns

['sig_str_diff',
 'total_str_diff',
 'td_diff',
 'head_diff',
 'body_diff',
 'leg_diff',
 'distance_diff',
 'clinch_diff',
 'ground_diff']

In [17]:
diffs = fights[diff_columns + ['r_b_winner']]
diffs.head(10)

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_b_winner
0,33,40,1,38,6,-11,-9,17,25,r
1,6,-5,2,4,4,-2,3,2,1,r
2,36,36,-1,22,10,4,35,1,0,r
3,17,18,-1,2,2,13,18,-1,0,b
4,-9,-15,-2,-26,-1,18,-12,9,-6,b
5,27,41,4,-2,7,22,3,3,21,r
6,69,67,0,79,7,-17,70,-1,0,r
7,-58,-71,0,-37,-6,-15,-44,-14,0,b
8,-10,-10,0,-9,-7,6,-4,-2,-4,b
9,23,27,0,20,-10,13,27,-5,1,r


In [18]:
scaled_diffs = diffs.copy()

In [19]:
for column in diff_columns:
    #col = scaled_diffs[column]
    mean = scaled_diffs[column].mean()
    sd = scaled_diffs[column].std()
    scaled_diffs[column] = scaled_diffs[column].apply(lambda x: (x - mean) / sd)

In [20]:
scaled_diffs['r_won'] = scaled_diffs.r_b_winner.apply(lambda x: 1 if x == 'r' else 0)

In [21]:
x_train, x_test, y_train, y_test = train_test_split(scaled_diffs[['sig_str_diff', 'total_str_diff', 'td_diff', 'head_diff', 'body_diff',
       'leg_diff', 'distance_diff', 'clinch_diff', 'ground_diff']], scaled_diffs['r_won'], test_size=.33, random_state=1)

In [22]:
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((3446, 9), (1698, 9), (3446,), (1698,))

In [23]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(x_train, y_train)

LogisticRegression()

In [24]:
lg_predictions = lr.predict(x_test)

In [25]:
confusion_matrix(y_test, lg_predictions)

array([[ 346,  197],
       [  77, 1078]], dtype=int64)

In [26]:
accuracy_score(y_test, lg_predictions)

0.8386336866902238

In [27]:
print(classification_report(y_test, lg_predictions))

              precision    recall  f1-score   support

           0       0.82      0.64      0.72       543
           1       0.85      0.93      0.89      1155

    accuracy                           0.84      1698
   macro avg       0.83      0.79      0.80      1698
weighted avg       0.84      0.84      0.83      1698



In [28]:
# looks the same as in the other notebook.
# => Go on to DNN
# just remembered that I'm pretty sure sklearn does not have a dnn feature, so need to either skip it or use
# something different
# Use Keras, as demonstrated in the NLP course I took.

In [29]:
from keras.utils import to_categorical

Using TensorFlow backend.


In [30]:
from keras.models import Sequential
from keras.layers import Dense

In [31]:
# Not sure how to make the model, so first model it after Portilla's NLP lecture
'''
model = Sequential()
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(8, input_dim=4, activation='relu'))
model.add(Dense(3, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
'''
# I guess I will need to change the 8 to the number of variables I will be using

"\nmodel = Sequential()\nmodel.add(Dense(8, input_dim=4, activation='relu'))\nmodel.add(Dense(8, input_dim=4, activation='relu'))\nmodel.add(Dense(3, activation='softmax'))\nmodel.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n"

In [32]:
to_categorical([0, 9, 1, 10, 2, 10])

array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)

In [33]:
to_categorical([0, 1])

array([[1., 0.],
       [0., 1.]], dtype=float32)

In [34]:
from sklearn.model_selection import train_test_split
# use shift-tab to show the docs

In [35]:
scaled_diffs.columns

Index(['sig_str_diff', 'total_str_diff', 'td_diff', 'head_diff', 'body_diff',
       'leg_diff', 'distance_diff', 'clinch_diff', 'ground_diff', 'r_b_winner',
       'r_won'],
      dtype='object')

In [36]:
scaled_diffs.head()

Unnamed: 0,sig_str_diff,total_str_diff,td_diff,head_diff,body_diff,leg_diff,distance_diff,clinch_diff,ground_diff,r_b_winner,r_won
0,1.114802,0.664146,0.259534,1.739103,0.64127,-1.377032,-0.660538,2.201119,1.717053,r,1
1,0.03307,-0.319321,0.658531,0.013585,0.392208,-0.307885,0.069362,0.186119,-0.130932,r,1
2,1.234994,0.576727,-0.53846,0.927095,1.139392,0.404879,2.015763,0.051785,-0.207931,r,1
3,0.473776,0.18334,-0.53846,-0.087916,0.143147,1.474026,0.981738,-0.216882,-0.207931,b,0
4,-0.567892,-0.537869,-0.937457,-1.50893,-0.230445,2.067996,-0.843013,1.126452,-0.669927,b,0


In [37]:
y_train.head()

4375    1
398     1
4908    1
797     0
1877    0
Name: r_won, dtype: int64

In [38]:
# might need to change the scaling on this from # of standard deviations to something like min max scaling

In [39]:
# The below is not correct.  You must convert the output to a 2 column array (see note below) 
# and change the last layer to 2, not 1.
model = Sequential()
model.add(Dense(9, input_dim=9, activation='relu'))
model.add(Dense(9, input_dim=9, activation='relu'))
model.add(Dense(1, activation='softmax'))
#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [40]:
#model.fit(x_train, y_train, epochs=10, verbose=2)

In [41]:
# I think I need to make the target a matrix of two values.  See the Portilla lecture where he used converted it into a three column matrix.
# Even though it it binary, so technically it is either/or, not like one out of three in the case of the Portilla NLP example, 
# meaning that one column is enough information, Keras still seems to want it in a two column format.

In [42]:
from keras.utils import to_categorical
y_binary = to_categorical(y_train)

In [43]:
y_binary[:5]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [44]:
sum(y_binary == y_train)

0

In [45]:
y_binary.shape

(3446, 2)

In [46]:
x_train.shape

(3446, 9)

In [47]:
model = Sequential()
model.add(Dense(9, input_dim=9, activation='relu'))
model.add(Dense(9, input_dim=9, activation='relu'))
model.add(Dense(2, activation='softmax'))
#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [48]:
model.fit(x_train, y_binary, epochs=40, verbose=2)

Epoch 1/40
 - 2s - loss: 0.5737 - accuracy: 0.7136
Epoch 2/40
 - 1s - loss: 0.4692 - accuracy: 0.8151
Epoch 3/40
 - 1s - loss: 0.4326 - accuracy: 0.8273
Epoch 4/40
 - 1s - loss: 0.4174 - accuracy: 0.8273
Epoch 5/40
 - 1s - loss: 0.4107 - accuracy: 0.8247
Epoch 6/40
 - 1s - loss: 0.4059 - accuracy: 0.8268
Epoch 7/40
 - 1s - loss: 0.4041 - accuracy: 0.8265
Epoch 8/40
 - 1s - loss: 0.4013 - accuracy: 0.8268
Epoch 9/40
 - 1s - loss: 0.3991 - accuracy: 0.8265
Epoch 10/40
 - 1s - loss: 0.3977 - accuracy: 0.8256
Epoch 11/40
 - 1s - loss: 0.3958 - accuracy: 0.8297
Epoch 12/40
 - 1s - loss: 0.3949 - accuracy: 0.8282
Epoch 13/40
 - 1s - loss: 0.3930 - accuracy: 0.8294
Epoch 14/40
 - 1s - loss: 0.3925 - accuracy: 0.8270
Epoch 15/40
 - 1s - loss: 0.3909 - accuracy: 0.8294
Epoch 16/40
 - 1s - loss: 0.3907 - accuracy: 0.8305
Epoch 17/40
 - 1s - loss: 0.3905 - accuracy: 0.8291
Epoch 18/40
 - 1s - loss: 0.3888 - accuracy: 0.8314
Epoch 19/40
 - 0s - loss: 0.3881 - accuracy: 0.8326
Epoch 20/40
 - 0s - l

<keras.callbacks.callbacks.History at 0x27b21980608>

In [49]:
model.predict(x_test)

array([[0.10577872, 0.8942213 ],
       [0.08197752, 0.9180225 ],
       [0.17018673, 0.82981324],
       ...,
       [0.63636786, 0.36363214],
       [0.35824755, 0.6417524 ],
       [0.03509319, 0.96490675]], dtype=float32)

In [50]:
model.predict(x_test).shape

(1698, 2)

In [51]:
dnn_predictions = model.predict(x_test)

In [52]:
dnn_predictions.shape

(1698, 2)

In [53]:
dnn_predictions

array([[0.10577872, 0.8942213 ],
       [0.08197752, 0.9180225 ],
       [0.17018673, 0.82981324],
       ...,
       [0.63636786, 0.36363214],
       [0.35824755, 0.6417524 ],
       [0.03509319, 0.96490675]], dtype=float32)

In [54]:
y_test

4227    1
3712    1
5112    1
4250    1
457     0
       ..
2857    0
4896    1
1128    0
4004    1
1105    1
Name: r_won, Length: 1698, dtype: int64

In [55]:
y_test_binary = to_categorical(y_test)

In [56]:
y_test_binary[:10]

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [1., 0.],
       [1., 0.],
       [0., 1.],
       [0., 1.],
       [0., 1.],
       [0., 1.]], dtype=float32)

In [57]:
y_test[0:10]

4227    1
3712    1
5112    1
4250    1
457     0
1392    0
28      1
4661    1
2773    1
3077    1
Name: r_won, dtype: int64

In [58]:
pred = np.apply_along_axis(lambda r: 1 if r[1] > r[0] else 0, 1, dnn_predictions)

In [59]:
pred

array([1, 1, 1, ..., 0, 1, 1])

In [60]:
pred[0:10]

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1])

In [61]:
confusion_matrix(y_test, pred)

array([[ 380,  163],
       [ 101, 1054]], dtype=int64)

In [62]:
accuracy_score(y_test, pred)

0.8445229681978799

In [63]:
pd.Series(pred).value_counts() / len(pred)

1    0.716726
0    0.283274
dtype: float64

In [64]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.79      0.70      0.74       543
           1       0.87      0.91      0.89      1155

    accuracy                           0.84      1698
   macro avg       0.83      0.81      0.82      1698
weighted avg       0.84      0.84      0.84      1698



In [65]:
# So it is slightly more accurate than logistic regression

In [66]:
# Try a bigger net

In [67]:
model2 = Sequential()
model2.add(Dense(18, input_dim=9, activation='relu'))
model2.add(Dense(18, input_dim=9, activation='relu'))
model2.add(Dense(2, activation='softmax'))
#model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model2.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [68]:
model2.fit(x_train, y_binary, epochs=100, verbose=2)

Epoch 1/100
 - 1s - loss: 0.4772 - accuracy: 0.8067
Epoch 2/100
 - 1s - loss: 0.4266 - accuracy: 0.8247
Epoch 3/100
 - 1s - loss: 0.4143 - accuracy: 0.8282
Epoch 4/100
 - 1s - loss: 0.4077 - accuracy: 0.8282
Epoch 5/100
 - 1s - loss: 0.4041 - accuracy: 0.8314
Epoch 6/100
 - 1s - loss: 0.3996 - accuracy: 0.8288
Epoch 7/100
 - 1s - loss: 0.3973 - accuracy: 0.8279
Epoch 8/100
 - 0s - loss: 0.3949 - accuracy: 0.8314
Epoch 9/100
 - 0s - loss: 0.3934 - accuracy: 0.8302
Epoch 10/100
 - 0s - loss: 0.3903 - accuracy: 0.8314
Epoch 11/100
 - 1s - loss: 0.3892 - accuracy: 0.8311
Epoch 12/100
 - 1s - loss: 0.3871 - accuracy: 0.8323
Epoch 13/100
 - 1s - loss: 0.3859 - accuracy: 0.8314
Epoch 14/100
 - 1s - loss: 0.3837 - accuracy: 0.8349
Epoch 15/100
 - 1s - loss: 0.3827 - accuracy: 0.8326
Epoch 16/100
 - 1s - loss: 0.3815 - accuracy: 0.8352
Epoch 17/100
 - 1s - loss: 0.3802 - accuracy: 0.8334
Epoch 18/100
 - 1s - loss: 0.3789 - accuracy: 0.8340
Epoch 19/100
 - 1s - loss: 0.3780 - accuracy: 0.8366
Ep

<keras.callbacks.callbacks.History at 0x27b36861588>

In [69]:
dnn_predictions2 = model2.predict(x_test)

In [70]:
pred2 = np.apply_along_axis(lambda r: 1 if r[1] > r[0] else 0, 1, dnn_predictions2)

In [72]:
accuracy_score(y_test, pred2)

0.8380447585394581

In [73]:
pd.Series(pred2).value_counts() / len(pred2)

1    0.714959
0    0.285041
dtype: float64

In [74]:
# not much of a difference
# for more experimentation, I think I'll try in PyCharm where I can use functions, not a bunch of copy/paste.
# Or write functions here.

In [75]:
from DNN_analyzer import DNNAnalyzer

In [81]:
dnn = DNNAnalyzer()
dnn.analyze_1(57)

Epoch 1/57
 - 1s - loss: 0.5882 - accuracy: 0.7008
Epoch 2/57
 - 1s - loss: 0.4711 - accuracy: 0.8056
Epoch 3/57
 - 1s - loss: 0.4355 - accuracy: 0.8221
Epoch 4/57
 - 1s - loss: 0.4211 - accuracy: 0.8207
Epoch 5/57
 - 1s - loss: 0.4137 - accuracy: 0.8198
Epoch 6/57
 - 1s - loss: 0.4080 - accuracy: 0.8221
Epoch 7/57
 - 1s - loss: 0.4037 - accuracy: 0.8233
Epoch 8/57
 - 1s - loss: 0.3997 - accuracy: 0.8256
Epoch 9/57
 - 1s - loss: 0.3968 - accuracy: 0.8276
Epoch 10/57
 - 1s - loss: 0.3950 - accuracy: 0.8250
Epoch 11/57
 - 1s - loss: 0.3928 - accuracy: 0.8262
Epoch 12/57
 - 1s - loss: 0.3913 - accuracy: 0.8273
Epoch 13/57
 - 1s - loss: 0.3900 - accuracy: 0.8262
Epoch 14/57
 - 1s - loss: 0.3888 - accuracy: 0.8273
Epoch 15/57
 - 1s - loss: 0.3876 - accuracy: 0.8279
Epoch 16/57
 - 1s - loss: 0.3874 - accuracy: 0.8279
Epoch 17/57
 - 1s - loss: 0.3862 - accuracy: 0.8282
Epoch 18/57
 - 1s - loss: 0.3852 - accuracy: 0.8305
Epoch 19/57
 - 1s - loss: 0.3845 - accuracy: 0.8273
Epoch 20/57
 - 1s - l

In [77]:
import importlib

In [78]:
from importlib import reload

In [79]:
importlib.reload(DNN_analyzer)

NameError: name 'DNN_analyzer' is not defined

In [None]:
import DNN_analyzer

In [None]:
dnn = DNN_analyzer.DNNAnalyzer()
dnn.analyze_1(57)

In [80]:
dnn.analyze_2(57)

Epoch 1/57
 - 1s - loss: 0.5772 - accuracy: 0.7261
Epoch 2/57
 - 1s - loss: 0.4298 - accuracy: 0.8239
Epoch 3/57
 - 1s - loss: 0.4109 - accuracy: 0.8236
Epoch 4/57
 - 1s - loss: 0.4026 - accuracy: 0.8294
Epoch 5/57
 - 1s - loss: 0.3990 - accuracy: 0.8305
Epoch 6/57
 - 1s - loss: 0.3945 - accuracy: 0.8285
Epoch 7/57
 - 0s - loss: 0.3922 - accuracy: 0.8276
Epoch 8/57
 - 0s - loss: 0.3898 - accuracy: 0.8311
Epoch 9/57
 - 1s - loss: 0.3877 - accuracy: 0.8311
Epoch 10/57
 - 1s - loss: 0.3864 - accuracy: 0.8326
Epoch 11/57
 - 1s - loss: 0.3848 - accuracy: 0.8308
Epoch 12/57
 - 1s - loss: 0.3835 - accuracy: 0.8340
Epoch 13/57
 - 1s - loss: 0.3819 - accuracy: 0.8343
Epoch 14/57
 - 1s - loss: 0.3813 - accuracy: 0.8317
Epoch 15/57
 - 1s - loss: 0.3807 - accuracy: 0.8363
Epoch 16/57
 - 1s - loss: 0.3787 - accuracy: 0.8334
Epoch 17/57
 - 1s - loss: 0.3776 - accuracy: 0.8343
Epoch 18/57
 - 0s - loss: 0.3761 - accuracy: 0.8352
Epoch 19/57
 - 0s - loss: 0.3757 - accuracy: 0.8381
Epoch 20/57
 - 1s - l

In [None]:
dnn.analyze_3(57)

In [None]:
dnn.analyze_4(57)

In [43]:
# Experiment with sigmoid output instead of softmax

In [None]:
model = Sequential()
model.add(Dense(18, input_dim=9, activation='relu'))
model.add(Dense(18, input_dim=18, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])