In [1]:
# install missing packages
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): none of these installs pins a version, so a fresh environment may
# resolve different releases than the ones this notebook was last run with.
# NOTE(review): the next cell imports psutil, which is not installed here; it is
# presumably already present in the base environment - confirm.
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader
%pip install nltk

Note: you may need to restart the kernel to use updated packages.
Collecting py-cpuinfo
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo
Successfully installed py-cpuinfo-9.0.0
Note: you may need to restart the kernel to use updated packages.
Collecting pandas-datareader
  Using cached pandas_datareader-0.10.0-py3-none-any.whl (109 kB)
Collecting lxml (from pandas-datareader)
  Obtaining dependency information for lxml from https://files.pythonhosted.org/packages/44/1b/0771c38e65ad23e25368b5e07c920054774b8d12477a4fad116bf500de73/lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl.metadata
  Using cached lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl.metadata (3.8 kB)
Using cached lxml-4.9.3-cp38-cp38-manylinux_2_28_x86_64.whl (8.0 MB)
Installing collected packages: lxml, pandas-datareader
Successfully installed lxml-4.9.3 pandas-datareader-0.10.0
Note: you may need to restart the kernel to use updated packages.
Collecting nltk
  Using cached

In [2]:
# check system details: report RAM, CPU and disk capacity of the host
import os
import psutil
import cpuinfo

BYTES_PER_GB = 1024 ** 3  # same as dividing by 1024 three times


def _as_gb(n_bytes):
    """Convert a byte count into gigabytes (1024-based)."""
    return n_bytes / BYTES_PER_GB


# memory
ram_info = psutil.virtual_memory()
print(f"Total RAM: {_as_gb(ram_info.total):.2f} GB")
print(f"Available RAM: {_as_gb(ram_info.available):.2f} GB")
print(f"Used RAM: {_as_gb(ram_info.used):.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")

# processor
print(f"CPU Cores: {os.cpu_count()}")
print(f"CPU Speed: {cpuinfo.get_cpu_info()['hz_actual_friendly']}")

# disk that holds the current working directory
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {_as_gb(disk_info.total):.2f} GB")
print(f"Available Disk: {_as_gb(disk_info.free):.2f} GB")
print(f"Used Disk: {_as_gb(disk_info.used):.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 13.99 GB
Used RAM: 1.16 GB
Percentage Usage Of RAM: 9.6%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 15.22 GB
Used Disk: 9.77 GB
Percentage Usage Of Disk: 39.1%


In [3]:
# import requirements
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from nnet_classifier import Classification

[nltk_data] Downloading package vader_lexicon to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
2023-10-28 17:06:21.117421: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
# load the raw boxing data from the CSV file in the working directory
boxing = pd.read_csv(filepath_or_buffer="boxing.csv")

In [5]:
# prepare the data for machine learning
# NOTE: this cell reassigns `boxing`, so running it a second time shuffles the
# already-shuffled frame and adds another 0.01 to age_A.
boxing = (
    boxing
    .sample(frac=1, random_state=0)  # shuffle every row with a fixed seed
    .reset_index(drop=True)  # discard the pre-shuffle index
    .ffill()  # fill gaps from the row above (in the shuffled order)
    .bfill()  # then fill any remaining leading gaps from the row below
    # nudge age_A by a small amount so it isn't treated as a categorical variable
    .assign(age_A=lambda frame: frame["age_A"] + 0.01)
)

In [6]:
# split off the target column and hold out the last 20% of rows for testing
y = boxing.loc[:, ["result"]]  # single-column frame holding the target
X = boxing.drop(columns=["result"])  # every other column is a feature
n_test = int(0.2 * len(boxing))  # X and y have the same number of rows as boxing
testX = X.tail(n_test).reset_index(drop=True)
testy = y.tail(n_test).reset_index(drop=True)

In [7]:
# build the model: reuse a saved pipeline when possible, otherwise build one
print("\n---- Boxing Classification Analysis ----\n")
# Every keyword below is passed straight to nnet_classifier.Classification;
# the meaning of each flag is defined there.
model = Classification(
    name="Tensorflow Without Feature Engineering", 
    path=None,
    rename=False, 
    time=False, 
    text=False,
    binary=True, 
    imputation=False, 
    variance=True,
    scale=True,
    atwood=False,
    binning=False,
    reciprocal=False, 
    interaction=False, 
    selection=False,
    tune=False,
    plots=True,
)
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as error:
    # Catch Exception rather than using a bare `except:` so that KeyboardInterrupt
    # and SystemExit still propagate: interrupting the kernel while loading or
    # predicting must stop the cell, not start the full rebuild below.
    print(f"Could not use a saved pipeline ({type(error).__name__}: {error}); building a new one.")
    model.explore(boxing)
    # NOTE(review): X and y still contain the rows held out as testX/testy; whether
    # validate() keeps them out of training is decided inside Classification - confirm.
    model.validate(X, y)  # build the machine learning pipeline
    predictions = model.predict(testX)
    print("\nModel Performance:")
    print(f"Accuracy: {model.accuracy}")
    print(f"F1: {model.f1}")
    print(f"In Control: {model.in_control}")
    print("Confusion Matrix:")
    pd.set_option("display.width", 1000)  # wide enough to avoid wrapping the table
    # divide each cell by the grand total so the matrix shows proportions
    print(model.confusion / model.confusion.sum().sum())


---- Boxing Classification Analysis ----

Visualizing The Data:
> Plotting Correlations
> won_A vs. kos_A
> won_B vs. kos_B
> judge1_A vs. judge1_B
> judge1_A vs. judge2_A
> judge1_A vs. judge2_B
> judge1_A vs. judge3_A
> judge1_A vs. judge3_B
> judge1_B vs. judge2_A
> judge1_B vs. judge2_B
> judge1_B vs. judge3_A
> judge1_B vs. judge3_B
> judge2_A vs. judge2_B
> judge2_A vs. judge3_A
> judge2_A vs. judge3_B
> judge2_B vs. judge3_A
> judge2_B vs. judge3_B
> judge3_A vs. judge3_B
> Plotting age_A
> Plotting age_B
> Plotting height_A
> Plotting height_B
> Plotting reach_A
> Plotting reach_B
> Plotting weight_A
> Plotting weight_B
> Plotting won_A
> Plotting won_B
> Plotting lost_A
> Plotting lost_B
> Plotting drawn_A
> Plotting drawn_B
> Plotting kos_A
> Plotting kos_B
> Plotting judge1_A
> Plotting judge1_B
> Plotting judge2_A
> Plotting judge2_B
> Plotting judge3_A
> Plotting judge3_B
> Plotting stance_A
> Plotting stance_B
> Plotting result
> Plotting decision
> stance_A vs. stance_B

2023-10-28 17:08:46.131743: W tensorflow/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 89263008 exceeds 10% of free system memory.


KeyboardInterrupt: 

In [None]:
# model diagnostics: list the first ten indicators and the drifting features
print("Model Indicators:")
top_indicators = model.indicators["Indicator"][:10].tolist()
for rank, indicator in enumerate(top_indicators, start=1):
    print(f"{rank}. {indicator}")
print(" ")
print("Feature Drift:")
# features whose drift p-value is below 0.05
drifted = model.drift.loc[model.drift["pvalue"] < 0.05, "Feature"]
for rank, feature in enumerate(drifted[:10].tolist(), start=1):
    print(f"{rank}. {feature}")
if drifted.shape[0] == 0:
    print("None")

In [None]:
# score the predictions against the held-out labels
actual = testy.iloc[:, 0].to_numpy()  # first (only) column of testy as an array
accuracy = accuracy_score(y_true=actual, y_pred=predictions)
f1 = f1_score(y_true=actual, y_pred=predictions, average="macro")

print(f"Accuracy: {accuracy}")
print(f"F1: {f1}")

In [None]:
# show the confusion matrix as proportions of all test rows
ytest = testy.iloc[:, 0].to_numpy()
# every label that appears in either the predictions or the true values
labels = np.unique(np.concatenate([predictions, ytest]))
confusion = pd.DataFrame(
    confusion_matrix(y_true=ytest, y_pred=predictions, labels=labels),
    index=labels,  # rows: true labels
    columns=labels,  # columns: predicted labels
)
pd.set_option("display.width", 1000)
print("Confusion Matrix:")
grand_total = confusion.sum().sum()
print(confusion / grand_total)

In [None]:
# save the machine learning pipeline
# NOTE(review): presumably writes to the location that model.load() reads from in
# the model-building cell - confirm against nnet_classifier.Classification.
model.dump()

In [None]:
# refit the model to include the test data
# testX / testy are the last 20% of the shuffled rows held out earlier.
# NOTE(review): this runs after model.dump(), so the refit model is presumably not
# what was saved - confirm whether a second dump() is intended.
model.refit(testX, testy)