In [28]:
import pandas as pd
import numpy as np
import seaborn as sns
import xgboost as xgb

import os
import warnings

from pathlib import Path

#Configurations
warnings.filterwarnings('ignore')
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 50)

## Directories and Filenames
Contains paths and filenames to access test and training data.

In [29]:
ROOT_DIR = Path.cwd().parent
SRC_DIR = os.path.join(ROOT_DIR, 'src')
DATA_DIR = os.path.join(ROOT_DIR, 'data')

processed_training_data_path = os.path.join(DATA_DIR, 'train_processed.csv')
processed_test_data_path = os.path.join(DATA_DIR, 'test_processed.csv')

In [30]:
df_train = pd.read_csv(processed_training_data_path)
df_unseen_test = pd.read_csv(processed_test_data_path)

## Train, Validation and Test Splits & Modeling

In [31]:
df_test = df_train.sample(frac=0.1, random_state=42)
df_train = df_train.drop(df_test.index)

print(f"Train shape: {df_train.shape}")
print(f"Test shape: {df_test.shape}")
print(f"Unseen test shape: {df_unseen_test.shape}")

df_train.head()

Train shape: (888, 18)
Test shape: (99, 18)
Unseen test shape: (10000, 17)


Unnamed: 0,x7_Hambo,x7_Polka,x7_Polskor,x7_Schottis,x7_Slängpolskor,x12_True,x1,x2,x3,x4,x5,x6,x8,x9,x10,x11,x13,y
0,0.0,0.0,0.0,0.0,1.0,1.0,-0.21761,1.198357,-1.17269,0.041297,0.168716,0.987879,1.243974,0.323734,-0.275316,-0.462594,-0.64026,2
1,1.0,0.0,0.0,0.0,0.0,1.0,-1.723374,0.037146,0.63959,0.505788,-0.394709,0.23503,-0.343594,-0.40181,-0.042394,0.763768,-1.584218,0
2,0.0,0.0,0.0,1.0,0.0,1.0,0.815268,-0.401341,1.466614,0.476071,-0.055452,-0.279269,0.88845,0.078986,-1.363136,-0.845952,1.370782,1
3,0.0,0.0,0.0,0.0,1.0,1.0,-0.456902,1.890258,0.383763,0.279145,-0.166733,1.651127,-0.9358,-0.617531,0.041439,-0.285398,-0.344786,2
4,0.0,0.0,0.0,1.0,0.0,1.0,0.878887,-1.381426,-0.539423,0.952232,-1.010925,-1.574866,-1.471067,1.137805,0.408868,-0.852787,0.733035,1


In [32]:
#train validation split of training data
from sklearn.model_selection import train_test_split
from sklearn.multiclass import OneVsRestClassifier
x_train, x_val, y_train, y_val = train_test_split(df_train.drop('y', axis=1), df_train['y'], test_size=0.2, random_state=42)

print(f"Train shape: {x_train.shape}")
print(f"Validation shape: {x_val.shape}")
print(f"Test shape: {df_test.shape}")

#Modeling
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

def print_results(results):
    print(f"Best params: {results.best_params_}")
    means = results.cv_results_['mean_test_score']
    stds = results.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, results.cv_results_['params']):
        print(f"{round(np.sqrt(-mean), 4)} (+/-{round(np.sqrt(std), 4)}) for {params}")

#XGBoost
xgb_model = xgb.XGBClassifier(objective='multi:softmax', random_state=42, n_jobs=-1, verbosity=2)
parameters = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 10],
    'learning_rate': [0.01, 0.2]
}

cv = GridSearchCV(xgb_model, parameters, cv=3, n_jobs=-1, verbose=2)
cv.fit(x_train, y_train)    

print_results(cv)

#Generate multilclass confusion matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix

confusion_matrix(y_val, cv.predict(x_val))










Train shape: (710, 17)
Validation shape: (178, 17)
Test shape: (99, 18)
Fitting 3 folds for each of 12 candidates, totalling 36 fits
[16:39:45] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:39:45] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 12 extra nodes, 0 pruned nodes, max_depth=3
[16:39:45] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:39:45] INFO: /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/tree/updater_prune.cc:98: tree pruning end, 14 extra nodes, 0 pruned nodes, max_depth=3
[16:39:45] INFO: /Users/runner/work/xgb

ImportError: cannot import name 'plot_confusion_matrix' from 'sklearn.metrics' (/Users/michaelfuest/.pyenv/versions/3.11.0/envs/multilabel/lib/python3.11/site-packages/sklearn/metrics/__init__.py)