In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb
from sklearn.utils import compute_sample_weight

In [None]:
# Read both CSV files. The column set of the test file is used as the list
# of model features, since those are the columns available at inference time.
test = pd.read_csv('test.csv')
train = pd.read_csv('train.csv')
feat = test.columns

# Keep only the training rows whose target `sii` is present.
train = train.loc[train.sii.notna()]

# Cast every object-dtype column to pandas' 'category' dtype.
# NOTE(review): `feat` is taken verbatim from test.csv; if that file carries a
# row-identifier column it will be used as a feature too -- confirm.
object_columns = train.select_dtypes(include=['object']).columns
dtype_dict = dict.fromkeys(object_columns, 'category')
train = train.astype(dtype_dict)
train.dtypes

In [None]:
# Feature matrix (columns shared with the test file) and the `sii` target.
X, y = train[feat], train['sii']

# Hold out half of the labelled rows for evaluation; the seed is fixed so the
# split is reproducible.
# NOTE(review): the split is not stratified -- confirm every class ends up in
# both halves.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=42,
)

# Per-row weights computed from y_train with the 'balanced' class-weight mode.
sample_weights = compute_sample_weight(class_weight='balanced', y=y_train)

# Quadratic weighted kappa (QWK) evaluation metric. Despite its name this is
# not a training objective: it returns a (name, value) pair, not gradients.
def qwk_objective(y_true, y_pred):
    """Return ``('qwk', -kappa)`` for the given labels and predictions.

    Parameters
    ----------
    y_true, y_pred : array-like
        Each may be a 1-D array of class labels, or a 2-D
        ``(n_samples, n_classes)`` array (one-hot labels / per-class scores),
        in which case the class is the argmax of every row.

        The function may also be invoked the way XGBoost invokes a custom
        metric, ``qwk_objective(preds, dmatrix)``: when the second argument
        exposes ``get_label()`` the first argument is treated as the
        predictions and the true labels are read from the second.

    Returns
    -------
    tuple
        ``('qwk', -kappa)``. Kappa is negated so that lower is better, which
        is what a minimising early-stopping criterion expects.
    """
    def _to_class_labels(values):
        # 2-D input holds one score per class per row; 1-D input is assumed
        # to already contain class labels and is passed through unchanged.
        arr = np.asarray(values)
        return np.argmax(arr, axis=1) if arr.ndim > 1 else arr

    # XGBoost custom-metric calling convention: (predictions, DMatrix).
    if hasattr(y_pred, 'get_label'):
        y_true, y_pred = y_pred.get_label(), y_true

    true_labels = _to_class_labels(y_true)
    pred_labels = _to_class_labels(y_pred)
    kappa = cohen_kappa_score(true_labels, pred_labels, weights='quadratic')
    return 'qwk', -kappa

# Convert data to DMatrix format. The balanced sample weights are attached to
# the training matrix only; the held-out matrix is evaluated unweighted.
dtrain = xgb.DMatrix(X_train, label=y_train, weight=sample_weights, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Set XGBoost parameters
params = {
    'max_depth': 4,
    'eta': 0.1,
    'objective': 'multi:softmax',      # predict() returns class labels directly
    'num_class': len(np.unique(y)),    # number of classes
    'eval_metric': 'mlogloss',         # early stopping is driven by this, not by QWK
    'tree_method': 'hist',
}

# Train the model. Early stopping monitors the last entry of `evals`
# (the held-out set) and stops after 10 rounds without improvement.
num_rounds = 100
watchlist = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=num_rounds,
    evals=watchlist,
    early_stopping_rounds=10,
    verbose_eval=2,
)

# Make predictions. xgb.train returns the model from the LAST round, not the
# best one, so restrict prediction to the trees up to the best iteration to
# get the same result regardless of the installed XGBoost version.
best_range = (0, model.best_iteration + 1)
y_train_pred = model.predict(dtrain, iteration_range=best_range)
y_test_pred = model.predict(dtest, iteration_range=best_range)

# Evaluate final performance with quadratic weighted kappa.
# With 'multi:softmax' the predictions are already class labels, so no argmax
# is needed here.
train_qwk = cohen_kappa_score(y_train, y_train_pred, weights='quadratic')
test_qwk = cohen_kappa_score(y_test, y_test_pred, weights='quadratic')

print(f"Final Train QWK: {train_qwk:.4f}")
print(f"Final Test QWK: {test_qwk:.4f}")

In [None]:
# Boosting round that achieved the best early-stopping score.
# NOTE(review): the original remark here said the best loss iteration "is not
# representative" -- presumably meaning the round selected by mlogloss need not
# be the best round for QWK; confirm.
model.best_iteration

In [None]:
# Per-feature importance scores reported by the trained booster.
model.get_fscore()

# Check outputs: the following cells inspect the labels and predictions.

In [None]:
# Show the label vector stored in the training DMatrix.
dtrain.get_label()

In [None]:
# Analysis frame for the held-out split: a copy of the features plus the true
# label, the predicted label and two per-row flags.
# Note: 'tp' is True where the prediction equals the label and 'fp' is True
# where it differs (i.e. correct / incorrect, not per-class TP / FP).
X_testAnal = X_test.assign(
    sii=y_test,
    pred=y_test_pred,
    tp=lambda frame: frame['sii'] == frame['pred'],
    fp=lambda frame: frame['sii'] != frame['pred'],
)

In [None]:
# Count held-out rows whose prediction matches the label (True) vs not (False).
X_testAnal['tp'].value_counts()

In [None]:
# Histogram of the true `sii` label, drawn once per group of the 'tp' flag
# (matching vs non-matching predictions) on the held-out split.
X_testAnal.groupby('tp')['sii'].hist()

In [None]:
# Same analysis frame as for the held-out split, built for the training split:
# features, true label, predicted label and match / mismatch flags.
X_trainAnal = X_train.assign(
    sii=y_train,
    pred=y_train_pred,
    tp=lambda frame: frame['sii'] == frame['pred'],
    fp=lambda frame: frame['sii'] != frame['pred'],
)

# Histogram of the true `sii` label per group of the 'tp' flag.
X_trainAnal.groupby('tp')['sii'].hist()