In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import cohen_kappa_score
import xgboost as xgb

from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import QuantileTransformer

# data handling

In [None]:
# Read the three input tables from the working directory.
train = pd.read_csv('train.csv')
dd = pd.read_csv('data_dictionary.csv')
test = pd.read_csv('test.csv')

# Training frame: every object-typed column is re-typed as a pandas categorical.
object_columns = train.select_dtypes(include=['object']).columns
dtype_dict = dict.fromkeys(object_columns, 'category')
train = train.astype(dtype_dict)

# Test frame: same conversion, driven by the test frame's own object columns.
object_columns = test.select_dtypes(include=['object']).columns
dtype_dict = dict.fromkeys(object_columns, 'category')
test = test.astype(dtype_dict)

# Column bookkeeping:
#   bothF   - columns present in both frames
#   missinF - columns that exist in train but not in test
testF = test.columns
trainF = train.columns
bothF = list(set(testF) & set(trainF))
missinF = list(set(trainF).difference(testF))
len(bothF), len(testF), len(trainF)

def findNumericalColumns(train):
    """Select the float64 columns with many distinct values and name their encodings.

    A ``float64`` column is selected when it holds more than 30 distinct
    values; a missing value counts as one distinct value.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame whose columns are inspected. It is not modified.

    Returns
    -------
    tuple of (list, list, list)
        The selected column names, those names with ``'normalD'`` appended,
        and those names with ``'unifD'`` appended, all in the same order.
    """
    float_names = train.select_dtypes(np.float64).columns
    numerical_columns = [name for name in float_names if train[name].unique().size > 30]

    normalDistF, unifDistF = [], []
    for name in numerical_columns:
        normalDistF.append(name + 'normalD')
        unifDistF.append(name + 'unifD')
    return numerical_columns, normalDistF, unifDistF
def addEncodings(train,numerical_columns, normalDistF, unifDistF):
    """Fit two quantile transformers on ``train`` and append their outputs as columns.

    Parameters
    ----------
    train : pandas.DataFrame
        Frame to fit on. It is modified in place (new columns are added) and
        also returned.
    numerical_columns : list
        Source columns fed to both transformers.
    normalDistF : list
        Names of the columns that receive the normal-distribution encoding,
        one per entry of ``numerical_columns``.
    unifDistF : list
        Names of the columns that receive the uniform-distribution encoding,
        one per entry of ``numerical_columns``.

    Returns
    -------
    tuple
        ``(train, qtUnif, qtNorm)``: the augmented frame and the two
        transformers. When ``numerical_columns`` is empty nothing is fitted
        and the transformers are returned unfitted.
    """
    # Asking for more quantiles than there are rows only triggers a warning
    # before scikit-learn clamps the value itself, so clamp it up front.
    n_quantiles = max(1, min(1000, len(train)))
    qtUnif = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='uniform', random_state=42)
    qtNorm = QuantileTransformer(n_quantiles=n_quantiles, output_distribution='normal', random_state=42)

    # Nothing to encode: fitting on a zero-column frame would raise.
    if len(numerical_columns) == 0:
        return train, qtUnif, qtNorm

    train[unifDistF] = qtUnif.fit_transform(train[numerical_columns])
    # The normal encoding goes to its own columns; it previously overwrote
    # the uniform columns and the 'normalD' columns were never created.
    train[normalDistF] = qtNorm.fit_transform(train[numerical_columns])

    return train, qtUnif, qtNorm

def transformTestEncoding(test,qtUnif, qtNorm,numerical_columns, normalDistF, unifDistF):
    """Apply already-fitted quantile transformers to ``test`` and append the results.

    Parameters
    ----------
    test : pandas.DataFrame
        Frame to encode. It is modified in place (new columns are added) and
        also returned.
    qtUnif, qtNorm : objects exposing ``transform``
        Fitted transformers producing the uniform and the normal encoding.
    numerical_columns : list
        Source columns fed to both transformers.
    normalDistF, unifDistF : list or None
        Destination column names for the normal / uniform encoding. When
        ``None``, the names are derived from ``numerical_columns`` by
        appending ``'normalD'`` / ``'unifD'``.

    Returns
    -------
    pandas.DataFrame
        ``test`` with the encoded columns added.
    """
    # Nothing to encode: transforming a zero-column frame would raise.
    if len(numerical_columns) == 0:
        return test

    # Honour the names supplied by the caller; they used to be silently
    # replaced by locally rebuilt lists.
    if normalDistF is None:
        normalDistF = [f + 'normalD' for f in numerical_columns]
    if unifDistF is None:
        unifDistF = [f + 'unifD' for f in numerical_columns]

    source = test[numerical_columns]
    test[unifDistF] = qtUnif.transform(source)
    # The normal encoding goes to its own columns; it previously overwrote
    # the uniform columns and the 'normalD' columns were never created.
    test[normalDistF] = qtNorm.transform(source)

    return test

def _pick_paq_by_age(df, adolescent_col, child_col):
    """Merge a PAQ_A column and a PAQ_C column into one array.

    If exactly one of the two values is present, that value is used.
    Otherwise the PAQ_A value is taken for rows with ``Basic_Demos-Age`` > 13
    and the PAQ_C value for all other rows.
    """
    adolescent = df[adolescent_col]
    child = df[child_col]
    only_child = adolescent.isna() & child.notna()
    only_adolescent = adolescent.notna() & child.isna()
    by_age = np.where(df['Basic_Demos-Age'] > 13, adolescent, child)
    return np.where(only_child, child, np.where(only_adolescent, adolescent, by_age))


def transform(df):
    """Filter the rows of ``df`` and add engineered features.

    Steps:
      * keep only rows with a non-missing ``sii`` and ``Physical-Weight`` > 0
        (rows with a missing weight fail the comparison and are dropped);
      * for every column whose name contains ``'Season'``, add an int8 flag
        ``'isSameSeason' + column`` that is 1 where the value equals
        ``Basic_Demos-Enroll_Season`` (missing values never match);
      * clip ``BIA-BIA_Fat`` to the range [0, 100];
      * add ``activityScore`` and ``activityScoreSeason``, merged from the
        PAQ_A / PAQ_C totals and seasons.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows. The frame passed in is left untouched.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the surviving rows plus the added columns.
    """
    keep = df['sii'].notna() & (df['Physical-Weight'] > 0)
    # Copy explicitly so the column assignments below act on an independent
    # frame instead of on a slice of the caller's data.
    df = df.loc[keep].copy()

    # Skip the columns this function itself creates, so that running it again
    # on its own output does not pile up 'isSameSeasonisSameSeason...' columns.
    seasons = [
        s for s in df.columns
        if 'Season' in s and not s.startswith('isSameSeason') and s != 'activityScoreSeason'
    ]
    # Compare as plain objects: two categorical columns with different
    # category sets cannot be compared with == directly (pandas raises).
    enroll_season = df['Basic_Demos-Enroll_Season'].astype(object)
    for s in seasons:
        # NOTE: the enrolment column is itself in `seasons`, so its own flag
        # is 1 wherever the enrolment season is present.
        df['isSameSeason'+s] = (enroll_season == df[s].astype(object)).astype(np.int8)

    df['BIA-BIA_Fat'] = df['BIA-BIA_Fat'].clip(0,100)

    df['activityScore'] = _pick_paq_by_age(df, 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total')
    # NOTE(review): np.where yields an object array here, so this column is
    # not categorical like the other season columns; cast before modelling.
    df['activityScoreSeason'] = _pick_paq_by_age(df, 'PAQ_A-Season', 'PAQ_C-Season')
    #df=df.drop('Physical-Waist_Circumference',axis=1)
    return df

# Filter the training rows and add the engineered columns.
train = transform(train)

# NOTE(review): testF was captured from test.columns before transform() ran,
# so selecting train[testF] leaves out every column transform() added
# (isSameSeason*, activityScore, activityScoreSeason). Confirm whether those
# features were meant to reach the model; if so, test needs the same columns.
X = train[testF]
y = train['sii']
numerical_columns, normalDistF, unifDistF = findNumericalColumns(X)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The encoders below add columns to the frames they receive. Give them
# independent copies so the writes do not land on slices of X
# (avoids pandas' SettingWithCopyWarning and any aliasing with X).
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the quantile encoders on the training part only, then reuse the fitted
# encoders on the held-out part.
X_train, qtUnif, qtNorm = addEncodings(X_train,numerical_columns, normalDistF, unifDistF)
X_test = transformTestEncoding(X_test, qtUnif, qtNorm,numerical_columns, normalDistF, unifDistF)

# model

In [None]:
# Define quadratic weighted kappa objective function
def qwk_objective(y_true, y_pred):
    """Return ``('qwk', -kappa)`` for quadratic-weighted Cohen's kappa.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Either per-class scores of shape (n_samples, n_classes), the same
        scores flattened to one dimension, or a 1-D vector of n_samples
        class labels (assumed to be labels when the length equals
        ``len(y_true)``; confirm for your caller).

    Returns
    -------
    tuple of (str, float)
        The name ``'qwk'`` and the negated kappa, so that smaller is better.

    Notes
    -----
    Despite its name this returns a (name, value) pair, which is the shape of
    an evaluation metric. NOTE(review): XGBoost's custom-objective hook is
    documented as expecting gradient and hessian arrays, so this function
    cannot be passed as ``objective`` as is; verify before wiring it in.
    """
    scores = np.asarray(y_pred)
    n_samples = len(y_true)
    if scores.ndim == 1:
        if scores.size == n_samples:
            # One value per sample: treat as class labels already.
            labels = scores
        else:
            # Flattened score matrix: restore one row per sample. Reshaping by
            # len(y_pred) would give a single column whose argmax is always 0.
            labels = np.argmax(scores.reshape(n_samples, -1), axis=1)
    else:
        labels = np.argmax(scores.reshape(len(scores), -1), axis=1)
    kappa = cohen_kappa_score(y_true, labels, weights='quadratic')
    return 'qwk', -kappa  # Negated so that minimising this value maximises kappa

# Define evaluation metric
def qwk_metric(y_true, y_pred):
    """Return ``('qwk', kappa)`` for quadratic-weighted Cohen's kappa.

    Parameters
    ----------
    y_true : array-like of shape (n_samples,)
        True class labels.
    y_pred : array-like
        Either per-class scores of shape (n_samples, n_classes), the same
        scores flattened to one dimension, or a 1-D vector of n_samples
        class labels (assumed to be labels when the length equals
        ``len(y_true)``; confirm for your caller).

    Returns
    -------
    tuple of (str, float)
        The name ``'qwk'`` and the kappa score (larger is better).

    Notes
    -----
    NOTE(review): the argument order here is (labels, predictions). XGBoost's
    native custom-metric hook is documented as passing (predictions, DMatrix),
    so an adapter is needed before using this with ``xgb.train``; verify
    against the installed version.
    """
    scores = np.asarray(y_pred)
    n_samples = len(y_true)
    if scores.ndim == 1:
        if scores.size == n_samples:
            # One value per sample: treat as class labels already.
            labels = scores
        else:
            # Flattened score matrix: restore one row per sample. Reshaping by
            # len(y_pred) would give a single column whose argmax is always 0.
            labels = np.argmax(scores.reshape(n_samples, -1), axis=1)
    else:
        labels = np.argmax(scores.reshape(len(scores), -1), axis=1)
    return 'qwk', cohen_kappa_score(y_true, labels, weights='quadratic')

# Convert data to DMatrix format (categorical columns are passed through as such)
dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
dtest = xgb.DMatrix(X_test, label=y_test, enable_categorical=True)

# Set XGBoost parameters
params = {
    'max_depth': 6,
    'eta': 0.3,
    # Built-in multi-class objective and metric. qwk_objective / qwk_metric
    # are not plugged in here.
    'objective': 'multi:softprob',
    # Labels must lie in [0, num_class); deriving the count from the largest
    # label stays valid even if an intermediate class has no rows.
    'num_class': int(y.max()) + 1,
    'eval_metric': 'mlogloss',
    # Categorical features need a histogram-based tree method; pin it rather
    # than relying on the library default.
    'tree_method': 'hist',
}

# Train the model
num_rounds = 100
watchlist = [(dtrain, 'train'), (dtest, 'test')]
# Early stopping monitors the last entry of the watchlist (the 'test' split).
# NOTE(review): that same split is scored below, so the reported test QWK is
# an optimistic estimate; a separate hold-out would be needed for an unbiased one.
model = xgb.train(params, dtrain, num_rounds, watchlist, early_stopping_rounds=10, verbose_eval=10)

# Predict with the trees up to the best iteration found by early stopping,
# not with every tree that was trained. (0, 0) means "use all trees".
best_iteration = getattr(model, 'best_iteration', None)
iteration_range = (0, best_iteration + 1) if best_iteration is not None else (0, 0)

# Make predictions (class probabilities, one column per class)
y_train_proba = model.predict(dtrain, iteration_range=iteration_range)
y_test_proba = model.predict(dtest, iteration_range=iteration_range)

# Convert predictions to class labels
y_train_pred = np.argmax(y_train_proba, axis=1)
y_test_pred = np.argmax(y_test_proba, axis=1)

# Evaluate final performance
train_qwk = cohen_kappa_score(y_train, y_train_pred, weights='quadratic')
test_qwk = cohen_kappa_score(y_test, y_test_pred, weights='quadratic')

print(f"Final Train QWK: {train_qwk:.4f}")
print(f"Final Test QWK: {test_qwk:.4f}")