In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
from sklearn.utils import compute_sample_weight
import xgboost as xgb

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer

# data handling

In [None]:
# Load the labelled training file and the test file, and compare their columns.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
testF = test.columns
trainF = train.columns

# Build both lists in train-column order: iterating a set of strings gives an
# order that can change from one interpreter run to the next.
_test_cols = set(testF)
bothF = [c for c in trainF if c in _test_cols]          # columns present in both files
missinF = [c for c in trainF if c not in _test_cols]    # columns present only in train

# More code follows in this cell, so a bare expression here would not be
# rendered; display the counts explicitly.
display((len(bothF), len(testF), len(trainF)))

def findNumericalColumns(df, min_unique=30):
    """Select the float columns with many distinct values and name their encodings.

    A ``float64`` column is selected when it holds more than ``min_unique``
    distinct values. NaN counts as one distinct value.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are inspected; it is not modified.
    min_unique : int, default 30
        A column must have strictly more distinct values than this to be selected.

    Returns
    -------
    tuple of (list, list, list)
        ``numerical_columns`` (selected names, in column order),
        ``normalDistF`` (each name suffixed with ``'normalD'``) and
        ``unifDistF`` (each name suffixed with ``'unifD'``).
    """
    float_columns = df.select_dtypes(np.float64).columns
    numerical_columns = [
        f for f in float_columns
        if df[f].nunique(dropna=False) > min_unique
    ]

    normalDistF = [f + 'normalD' for f in numerical_columns]
    unifDistF = [f + 'unifD' for f in numerical_columns]
    return numerical_columns, normalDistF, unifDistF
def addEncodings(df,numerical_columns, normalDistF, unifDistF):
    """Fit two quantile transformers on ``df`` and append their outputs as columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to fit on; the encoded columns are added to it in place.
    numerical_columns : list of str
        Columns fed to both transformers.
    normalDistF : list of str
        Names of the columns that receive the normal-distribution encoding.
    unifDistF : list of str
        Names of the columns that receive the uniform-distribution encoding.

    Returns
    -------
    tuple
        ``(df, qtUnif, qtNorm)``: the same frame plus the two fitted
        transformers, so the identical mapping can be applied to other data.
    """
    qtUnif = QuantileTransformer(n_quantiles=10, output_distribution='uniform', random_state=42)
    qtNorm = QuantileTransformer(n_quantiles=10, output_distribution='normal', random_state=42)
    df[unifDistF] = qtUnif.fit_transform(df[numerical_columns])
    # The normal encoding goes into its own columns. It used to be written into
    # unifDistF, which overwrote the uniform encoding and never created normalDistF.
    df[normalDistF] = qtNorm.fit_transform(df[numerical_columns])
    return df, qtUnif, qtNorm

def transformTestEncoding(df,qtUnif, qtNorm,numerical_columns, normalDistF, unifDistF):
    """Apply already-fitted quantile transformers to ``df`` and append the outputs.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode; the encoded columns are added to it in place.
    qtUnif, qtNorm : fitted transformers
        Objects exposing ``transform``; ``qtUnif`` fills ``unifDistF`` and
        ``qtNorm`` fills ``normalDistF``.
    numerical_columns : list of str
        Columns fed to both transformers.
    normalDistF, unifDistF : list of str or None
        Output column names. When ``None`` they are derived from
        ``numerical_columns`` with the ``'normalD'`` / ``'unifD'`` suffixes.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the encoded columns added.
    """
    # Honour the names the caller passes; only derive them when none are given.
    if normalDistF is None:
        normalDistF = [f + 'normalD' for f in numerical_columns]
    if unifDistF is None:
        unifDistF = [f + 'unifD' for f in numerical_columns]

    df[unifDistF] = qtUnif.transform(df[numerical_columns])
    # The normal encoding used to be written into unifDistF, which overwrote
    # the uniform encoding and never created the normalDistF columns.
    df[normalDistF] = qtNorm.transform(df[numerical_columns])
    return df

def discretize(df,numerical_columns):
    """Add a quantile-bin code column for every column in ``numerical_columns``.

    Each column ``f`` is cut into up to 10 quantile bins (duplicate edges are
    dropped, so fewer bins are possible) and the integer bin codes are stored
    in ``f + 'disc'`` on ``df`` itself.

    Returns
    -------
    tuple
        ``(df, edges)`` where ``edges[i]`` holds the bin edges found for
        ``numerical_columns[i]``.
    """
    edges = []
    for column in numerical_columns:
        bin_codes, cut_points = pd.qcut(
            df[column], q=10, labels=False, duplicates='drop', retbins=True
        )
        df[column + 'disc'] = bin_codes
        edges.append(cut_points)
    return df, edges

def discretizeTest(df,numerical_columns, edges):
    """Bin ``numerical_columns`` of ``df`` using previously computed bin edges.

    For each column ``f`` the integer bin code is stored in ``f + 'disc'`` on
    ``df`` itself. The outermost edges are opened to -inf / +inf, so values
    below the first edge land in the first bin and values above the last edge
    land in the last bin instead of turning into NaN. Values inside the
    original range get the same code as with the unmodified edges. Missing
    values stay missing.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to discretize; modified in place.
    numerical_columns : list of str
        Columns to bin.
    edges : sequence of array-like
        ``edges[i]`` holds the bin edges for ``numerical_columns[i]``; the
        arrays are not modified.

    Returns
    -------
    pandas.DataFrame
        The same frame, with the ``'disc'`` columns added.
    """
    for f, bin_edges in zip(numerical_columns, edges):
        open_edges = np.array(bin_edges, dtype=float)  # copy; never touch the caller's edges
        if open_edges.size < 2:
            # Fewer than two edges define no interval, so nothing can be binned.
            df[f + 'disc'] = np.nan
            continue
        open_edges[0] = -np.inf
        open_edges[-1] = np.inf
        df[f + 'disc'] = pd.cut(df[f], bins=open_edges, labels=False, include_lowest=True)
    return df



def _pickByAge(df, older_col, younger_col):
    """Merge two alternative columns into one array, row by row.

    When exactly one of the two values is present, that value is used.
    Otherwise rows with ``Basic_Demos-Age`` > 13 take ``older_col`` and all
    other rows take ``younger_col``.
    """
    older = df[older_col]
    younger = df[younger_col]
    by_age = np.where(df['Basic_Demos-Age'] > 13, older, younger)
    return np.where(older.isna() & younger.notna(), younger,
                    np.where(older.notna() & younger.isna(), older, by_age))


def transform(df):
    """Filter to labelled rows and add the engineered features.

    Steps:
      * drop rows whose ``sii`` is missing;
      * cast object columns to ``category``;
      * for every column whose name contains ``'Season'``, add
        ``'isSameSeason' + name`` (int8), set to 1 where the value equals
        ``Basic_Demos-Enroll_Season``; missing values never match;
      * add ``activityScore`` / ``activityScoreSeason`` by merging the
        ``PAQ_A`` and ``PAQ_C`` columns (see ``_pickByAge``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sii``, ``Basic_Demos-Enroll_Season``,
        ``Basic_Demos-Age`` and the four ``PAQ_A`` / ``PAQ_C`` columns used below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input is not modified.
    """
    # Has impact on how well the algorithm fits: categories are 0-3, or 0-4 with NaN inside.
    df = df.loc[df['sii'].notna()]

    object_columns = df.select_dtypes(include=['object']).columns
    df = df.astype({col: 'category' for col in object_columns})

    # Skip columns produced by this function, so calling it again on its own
    # output does not compare derived columns with the enrolment season.
    seasons = [
        s for s in df.columns
        if 'Season' in s and not s.startswith('isSameSeason') and s != 'activityScoreSeason'
    ]
    # Compare as plain objects: pandas refuses to compare two categoricals
    # whose category sets differ, which happens as soon as one season column
    # lacks a value that another one has.
    enroll_season = df['Basic_Demos-Enroll_Season'].astype(object)
    for s in seasons:
        df['isSameSeason' + s] = (enroll_season == df[s].astype(object)).astype(np.int8)

    # Experiments that hurt the score versus the baseline and were left out:
    #   df = df.loc[df['Physical-Weight'] > 0]             (probably omits a lot of data)
    #   df['BIA-BIA_Fat'] = df['BIA-BIA_Fat'].clip(0,100)
    #   df = df.drop('Physical-Waist_Circumference',axis=1)

    df['activityScore'] = _pickByAge(df, 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total')
    df['activityScoreSeason'] = _pickByAge(df, 'PAQ_A-Season', 'PAQ_C-Season')
    return df


train = transform(train)

# Use the columns that also exist in the test file, minus 'id'.
# 'id' is presumably a row identifier: as a categorical feature it would let
# the trees memorise individual rows instead of learning from the measurements.
feature_columns = [c for c in testF if c != 'id']
X = train[feature_columns]
#X=X.drop('Physical-Waist_Circumference',axis=1)
y = train['sii']
numerical_columns, normalDistF, unifDistF = findNumericalColumns(X)

# Split the data
# NOTE(review): test_size=0.6 holds out 60% of the rows - confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.6, random_state=42)
# The helpers below add columns in place, so work on explicit copies of the splits.
X_train = X_train.copy()
X_test = X_test.copy()

USE_QUANTILE_ENCODINGS = False  # no difference in performance at all
USE_DISCRETIZATION = True       # slightly better

if USE_QUANTILE_ENCODINGS:
    X_train, qtUnif, qtNorm = addEncodings(X_train,numerical_columns, normalDistF, unifDistF)
    X_test = transformTestEncoding(X_test, qtUnif, qtNorm,numerical_columns, normalDistF, unifDistF)
if USE_DISCRETIZATION:
    # Bin edges are learned on the training split only and reused on the hold-out split.
    X_train,edges = discretize(X_train, numerical_columns)
    X_test = discretizeTest(X_test, numerical_columns, edges)

# Per-row weights that balance the classes during training.
sample_weights = compute_sample_weight('balanced', y_train)

# model
- Adding the encodings or leaving them out makes no difference: the performance is exactly the same.

In [None]:
# Negated quadratic weighted kappa, reported under the name 'qwk'
def qwk_objective(y_true, y_pred):
    """Return ``('qwk', -kappa)`` for the given labels and per-class scores.

    ``y_pred`` is reshaped to one row per sample and the highest-scoring
    column of each row is taken as the predicted label. The quadratic-weighted
    Cohen's kappa between ``y_true`` and those labels is then negated, so that
    a smaller value means better agreement.

    NOTE(review): despite its name this returns a (name, value) pair, which is
    the shape of an evaluation metric, not gradients/hessians - confirm how it
    is meant to be plugged into XGBoost before enabling it.
    """
    score_rows = y_pred.reshape((len(y_pred), -1))
    predicted_labels = score_rows.argmax(axis=1)
    agreement = cohen_kappa_score(y_true, predicted_labels, weights='quadratic')
    return 'qwk', -agreement

# Quadratic weighted kappa as a named evaluation value
def qwk_metric(y_true, y_pred):
    """Return ``('qwk', kappa)`` for the given labels and per-class scores.

    ``y_pred`` is reshaped to one row per sample, the highest-scoring column
    of each row becomes the predicted label, and the quadratic-weighted
    Cohen's kappa between ``y_true`` and those labels is returned.
    """
    score_rows = y_pred.reshape((len(y_pred), -1))
    predicted_labels = score_rows.argmax(axis=1)
    agreement = cohen_kappa_score(y_true, predicted_labels, weights='quadratic')
    return 'qwk', agreement

# Convert data to DMatrix format; only the training rows carry the class-balancing weights
dtrain = xgb.DMatrix(X_train, label=y_train,enable_categorical=True,weight=sample_weights)
dtest = xgb.DMatrix(X_test, label=y_test,enable_categorical=True)

# Set XGBoost parameters
params = {
    'max_depth': 3,
    'eta': 0.1,
    'subsample':0.2,

    'objective': 'multi:softprob',#qwk_objective,
    'num_class': len(np.unique(y)),  # number of classes
    'eval_metric': 'mlogloss'#qwk_metric
}

# Train the model
# NOTE(review): early stopping is driven by the last watchlist entry ('test'),
# the same split that is scored below, so the reported test QWK is optimistic.
num_rounds = 800
watchlist = [(dtrain, 'train'), (dtest, 'test')]
model = xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds=100, verbose_eval=10)

# Make predictions
# xgb.train returns the booster from the last round, and Booster.predict uses
# every tree by default. Restrict prediction to the best round found by early
# stopping, so the rounds trained after it do not affect the scores.
best_rounds = (0, model.best_iteration + 1)
y_train_pred = model.predict(dtrain, iteration_range=best_rounds)
y_test_pred = model.predict(dtest, iteration_range=best_rounds)

# Convert predictions to class labels
y_train_pred = np.argmax(y_train_pred, axis=1)
y_test_pred = np.argmax(y_test_pred, axis=1)

# Evaluate final performance
train_qwk = cohen_kappa_score(y_train, y_train_pred, weights='quadratic')
test_qwk = cohen_kappa_score(y_test, y_test_pred, weights='quadratic')

print(f"Final Train QWK: {train_qwk:.4f}")
print(f"Final Test QWK: {test_qwk:.4f}")

# 0.972 / 0.306, no matter if only with unif encoder, both or overwritten

In [None]:
[125]	train-mlogloss:0.57127	test-mlogloss:1.08316
Final Train QWK: 0.7206
Final Test QWK: 0.3879

# eval

In [None]:
# Hold-out rows with the true label, the predicted label and hit/miss flags:
# 'tp' marks rows where the prediction equals the label, 'fp' rows where it does not.
X_testAnal = (
    X_test
    .assign(sii=y_test, pred=y_test_pred)
    .assign(
        tp=lambda d: d['sii'] == d['pred'],
        fp=lambda d: d['sii'] != d['pred'],
    )
)
# Label histograms split by hit/miss, and hit/miss counts per label.
(
    X_testAnal.groupby('tp')['sii'].hist(),
    X_testAnal.groupby('sii')['tp'].value_counts(),
)

In [None]:
# Training rows with the true label, the predicted label and hit/miss flags:
# 'tp' marks rows where the prediction equals the label, 'fp' rows where it does not.
X_trainAnal = (
    X_train
    .assign(sii=y_train, pred=y_train_pred)
    .assign(
        tp=lambda d: d['sii'] == d['pred'],
        fp=lambda d: d['sii'] != d['pred'],
    )
)
# Label histograms split by hit/miss, and hit/miss counts per label.
(
    X_trainAnal.groupby('tp')['sii'].hist(),
    X_trainAnal.groupby('sii')['tp'].value_counts(),
)