In [1]:
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from sklearn.impute import KNNImputer
from sklearn.model_selection import StratifiedKFold

from sklearn.metrics import log_loss
import numpy as np

import xgboost as xgb

In [9]:
#Importing data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')
greeks = pd.read_csv('./data/greeks.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

df = pd.merge(train, greeks, how='left', on='Id')

In [10]:
# Data Cleaning
def encode(dataframe):
    le = LabelEncoder()
    obj = list(dataframe.loc[:, dataframe.dtypes == 'object'].columns)
    for i in obj:
        if i not in ['id', 'Epsilon']:
            dataframe[i] = le.fit_transform(dataframe[i])
    return dataframe

df = encode(df)  
test = encode(test)

In [11]:
features = ['AB', 'AF', 'AH', 'AM', 'AR', 'AX', 'AY', 'AZ', 'BC', 'BD ', 'BN',
       'BP', 'BQ', 'BR', 'BZ', 'CB', 'CC', 'CD ', 'CF', 'CH', 'CL', 'CR', 'CS',
       'CU', 'CW ', 'DA', 'DE', 'DF', 'DH', 'DI', 'DL', 'DN', 'DU', 'DV', 'DY',
       'EB', 'EE', 'EG', 'EH', 'EJ', 'EL', 'EP', 'EU', 'FC', 'FD ', 'FE', 'FI',
       'FL', 'FR', 'FS', 'GB', 'GE', 'GF', 'GH', 'GI', 'GL']

target = 'Class'

In [12]:
imputer = KNNImputer(n_neighbors=2)

df[features] = imputer.fit_transform(df[features])
test[features] = imputer.fit_transform(test[features])

In [13]:
# XGBoost parameters
params = {
    "objective": "binary:logistic",
    "max_depth": 10,
    "verbosity": 0,
    'eta': 0.09123900972866311, 
    'max_depth': 3, 
    'gamma': 0.19155496758925133, 
    'colsample_bytree': 0.728182625320748, 
    'subsample': 0.8440687156514323, 
    'min_child_weight': 3, 
    "tree_method": "gpu_hist",  
    "gpu_id": 0,  
}

# Modeling
X = df[features]
y = df[target]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
scores = []

for train_idx, val_idx in skf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[val_idx], y.iloc[val_idx]
    
    model = xgb.XGBClassifier(**params)
    
    model.fit(X_train, y_train)

    val_preds = model.predict_proba(X_valid)

    val_score = log_loss(y_valid, val_preds)

    scores.append(val_score)

print(f'Log-loss scores: {scores}')
print(f'Mean Log-loss: {np.mean(scores)}')


Log-loss scores: [0.17990107671367198, 0.18950467967594753, 0.15694548935535965, 0.2653883074983191, 0.18963259583653053]
Mean Log-loss: 0.19627442981596577


In [14]:
prediction = model.predict_proba(test[features])

In [15]:
sample_submission[['class_0', 'class_1']] = prediction

In [16]:
sample_submission

Unnamed: 0,Id,class_0,class_1
0,00eed32682bb,0.771648,0.228352
1,010ebe33f668,0.771648,0.228352
2,02fa521e1838,0.771648,0.228352
3,040e15f562a2,0.771648,0.228352
4,046e85c7cc7f,0.771648,0.228352


In [17]:
sample_submission.to_csv('submission.csv', index=False)