In [181]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import classification_report, log_loss
import numpy as np
from scipy.optimize import fmin_l_bfgs_b
import unittest
import pandas as pd
from sklearn.utils import resample
from solution import MultinomialLogReg, OrdinalLogReg
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder

## Preprocess the data

In [222]:
np.random.seed(42)
# Read the data
df = pd.read_csv("dataset.csv", sep=";")
df_og =df.copy()
# Encode the data
encoder_labs = LabelEncoder()
df["ShotType"] = encoder_labs.fit_transform(df["ShotType"])
encoder = OneHotEncoder(sparse_output=False)
for col in ["Competition", "PlayerType", "Movement"]:
    # Fit and transform the column (convert to dense matrix)
    encoded = encoder.fit_transform(df[[col]])
    # Create new column names for the encoded columns
    encoded_columns = encoder.get_feature_names_out([col])
    # Convert the encoded array to a DataFrame with the new column names
    encoded_df = pd.DataFrame(encoded, columns=encoded_columns)
    # Drop the original column and concatenate the new encoded columns
    df = pd.concat([df.drop(columns=[col]), encoded_df], axis=1)


# Testing functions on the dataset

In [None]:
#################################
# Testing if the regressions work
#################################
# Split into train and test

train, test = train_test_split(df, test_size=0.3, stratify=df["ShotType"])
# Get the features and target
X_train,y_train  = train.drop(columns=["ShotType"]), train["ShotType"]
X_test, y_test = test.drop(columns=["ShotType"]), test["ShotType"]
# Scale the data
scaler = StandardScaler()
scaler.fit(X_train[["Angle", "Distance"]])
X_train.loc[:, ["Angle", "Distance"]] = scaler.transform(X_train[["Angle", "Distance"]])
X_test.loc[:, ["Angle", "Distance"]] = scaler.transform(X_test[["Angle", "Distance"]])
# build the model
model = MultinomialLogReg()
model_pred = model.build(X_train, y_train)
pred = model_pred.predict(X_test)
pred_classes = np.argmax(pred, axis=1)
print(classification_report(y_test, pred_classes))
# Use the other one
model = OrdinalLogReg()
model_pred = model.build(X_train, y_train)
pred = model_pred.predict(X_test)
pred_classes = np.argmax(pred, axis=1)
print(classification_report(y_test, pred_classes))

              precision    recall  f1-score   support

           0       0.83      0.87      0.85       917
           1       0.44      0.13      0.21        30
           2       0.35      0.28      0.31       119
           3       0.60      0.77      0.67       292
           4       0.59      0.24      0.34       132
           5       0.00      0.00      0.00        18

    accuracy                           0.73      1508
   macro avg       0.47      0.38      0.40      1508
weighted avg       0.71      0.73      0.71      1508

              precision    recall  f1-score   support

           0       0.75      0.93      0.83       917
           1       0.00      0.00      0.00        30
           2       0.00      0.00      0.00       119
           3       0.54      0.64      0.58       292
           4       0.17      0.03      0.05       132
           5       0.00      0.00      0.00        18

    accuracy                           0.69      1508
   macro avg       0.24

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [214]:
df_og["ShotType"].value_counts(sort=False)

ShotType
above head    3055
layup          973
other          439
hook shot      397
dunk            99
tip-in          61
Name: count, dtype: int64

# Part 2.1

In [None]:
m = 100
X = df.drop(columns = "ShotType")
y = df["ShotType"]
# Scaling the data
scaler = StandardScaler()
X.loc[:,["Angle", "Distance"]] = scaler.fit_transform(X.loc[:,["Angle", "Distance"]])
betas = []
model = MultinomialLogReg()
for i in tqdm(range(m)):
    X_bs, y_bs = resample(X,y)
    model_pred = model.build(X_bs, y_bs)
    betas.append(model_pred.beta)
betas_means = np.mean(betas, axis=0)
betas_stds = np.std(betas, axis=0)

# Save the results
np.save("betas_means.npy", betas_means)
np.save("betas_stds.npy", betas_stds)

100%|██████████| 100/100 [15:25<00:00,  9.25s/it]


In [252]:
betas_means = np.load("betas_means.npy")
betas_stds = np.load("betas_stds.npy")
betas_means = np.vstack([betas_means ,np.zeros_like(betas_means[0]) ]) 
betas_stds = np.vstack([betas_stds ,np.zeros_like(betas_means[0]) ]) 

In [None]:
# Define the column names, adding 'intercept' at the beginning
columns_with_intercept = ["intercept"] + list(df.drop(columns="ShotType").columns)

# Create the DataFrame with the intercept column first
df_betas_means = pd.DataFrame(columns=columns_with_intercept, data=betas_means)


labs = [i for i in range(df["ShotType"].nunique())]
shotTypes_col = encoder_labs.inverse_transform(labs)
df_betas_means["ShotType"] = shotTypes_col
df_betas_means

# REPORT: FMAKE A TABLE (OR GRAPH) FOR EACH OF THE ORIGINAL FEATUERES (BEFORE HOT-ENCODED), AND ONE TABLE (OR GRAPH) FOR ANGLE AND DISTANCE
# INTERCEPT INTERPRETATION

  The registered formatting callable for the type.
  The registered formatting callable for the type.


Unnamed: 0,intercept,Transition,TwoLegged,Angle,Distance,Competition_EURO,Competition_NBA,Competition_SLO1,Competition_U14,Competition_U16,PlayerType_C,PlayerType_F,PlayerType_G,Movement_dribble or cut,Movement_drive,Movement_no,ShotType
0,7.382262,-0.660275,-5.005171,0.350645,10.000197,0.16025,0.837179,0.118137,5.941237,2.422601,2.567161,3.044555,2.734697,6.905456,-5.203708,6.674041,above head
1,-4.289706,0.341244,6.681422,0.229364,2.782821,0.712452,2.012092,0.511923,-5.799122,0.24681,-0.420819,-1.513962,-1.418082,-3.308217,-1.123822,1.139434,dunk
2,3.170065,-1.038811,-4.778926,0.102721,7.828946,0.001577,-0.044005,-0.076293,4.114354,1.203898,1.520636,1.609225,1.064215,-4.143024,-1.92649,10.195302,hook shot
3,5.116806,-0.204165,10.412763,0.665415,5.608077,-0.715668,0.320177,-0.228658,5.394756,2.286932,1.787289,2.183931,2.098561,7.553559,8.188147,-9.66323,layup
4,9.485233,-0.526539,-6.354777,0.382064,9.737039,0.242291,1.091367,-0.175907,6.979993,3.116562,2.942272,3.762883,3.654169,3.604194,4.62336,2.185996,other
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,tip-in


## Part 2.2

In [166]:
def multinomial_bad_ordinal_good(num_classes=5, num_data_points=500):
    # Generate some random features, from the uniform distribution
    data = np.random.uniform(0,num_classes,(num_data_points, num_classes))
    cols = [f"{i}" for i in range(num_classes)]
    df = pd.DataFrame(data, columns=cols)
    # Assign the target based on the sum of the featues 
    feature_mean = df.mean(axis=1) + np.random.randn(len(df)) * 0.3 # Add a bit of gaussian noise
    # Assign the class based on the mean quantiles
    df["target"] = pd.qcut(feature_mean, q=num_classes, labels=False, duplicates="drop")
    return df

In [190]:
# Test the function, it can't be a too large dataset, otherwise the work the same, report accuracy and F1
df = multinomial_bad_ordinal_good(5,200)
train, test = train_test_split(df, test_size=0.3, stratify=df["target"])
X_train,y_train  = train.drop(columns=["target"]), train["target"]
X_test, y_test = test.drop(columns=["target"]), test["target"]
model = MultinomialLogReg()
model_pred = model.build(X_train, y_train)
pred = model_pred.predict(X_test)
print(log_loss(y_test, pred))
pred_classes = np.argmax(pred, axis=1)
print(classification_report(y_test, pred_classes))
# Use the other one
model = OrdinalLogReg()
model_pred = model.build(X_train, y_train)
pred = model_pred.predict(X_test)
print(log_loss(y_test, pred))
pred_classes = np.argmax(pred, axis=1)
print(classification_report(y_test, pred_classes))

1.5168593302513578
              precision    recall  f1-score   support

           0       0.71      0.83      0.77        12
           1       0.14      0.08      0.11        12
           2       0.24      0.33      0.28        12
           3       0.27      0.33      0.30        12
           4       0.86      0.50      0.63        12

    accuracy                           0.42        60
   macro avg       0.44      0.42      0.42        60
weighted avg       0.44      0.42      0.42        60

1.1376939325798328
              precision    recall  f1-score   support

           0       0.71      0.83      0.77        12
           1       0.25      0.17      0.20        12
           2       0.28      0.42      0.33        12
           3       0.36      0.33      0.35        12
           4       0.89      0.67      0.76        12

    accuracy                           0.48        60
   macro avg       0.50      0.48      0.48        60
weighted avg       0.50      0.48      