# Imports

In [None]:
import joblib
import numpy as np
import optuna
import pandas as pd
import torch
from sklearn.base import BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, fbeta_score
from sklearn.model_selection import train_test_split, KFold
from sklearn.utils import compute_class_weight
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from tqdm import tqdm
import xgboost as xgb
import lightgbm as lgb
from xgboost.callback import TrainingCallback
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression

In [None]:
# Run torch work on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# Load data

In [None]:
# Directory that the read_csv calls in this notebook build their file paths from.
DATA_DIR = 'data'

In [None]:
# Labels and both feature tables are read from DATA_DIR.
train_y = pd.read_csv(f'{DATA_DIR}/train_labels.csv')
train_x = pd.read_csv(f'{DATA_DIR}/train_values.csv')
test_x = pd.read_csv(f'{DATA_DIR}/test_values.csv')

# Shift damage_grade down by one; the submission step adds the one back.
# (presumably the raw labels are 1-based -- confirm against the data)
train_y['damage_grade'] = train_y['damage_grade'] - 1

In [None]:
# Geographic id columns; these are label-encoded and fed to the autoencoder.
geo_level_columns = [
    'geo_level_1_id',
    'geo_level_2_id',
    'geo_level_3_id',
]

# String-valued feature columns that are one-hot encoded later on.
categorical_columns = [
    'foundation_type',
    'ground_floor_type',
    'land_surface_condition',
    'legal_ownership_status',
    'other_floor_type',
    'plan_configuration',
    'position',
    'roof_type',
]

# Autoencoder

In [None]:
# Stack the train and test geo ids so each encoder is fit on every id present
# in either split.
geo = pd.concat([train_x[geo_level_columns], test_x[geo_level_columns]])

# One LabelEncoder per geo level; they stay separate names because later
# cells call .transform on each of them.
label_encoder_1, label_encoder_2, label_encoder_3 = LabelEncoder(), LabelEncoder(), LabelEncoder()

# Overwrite each geo column of `geo` with its integer codes.
for geo_column, level_encoder in zip(
    ("geo_level_1_id", "geo_level_2_id", "geo_level_3_id"),
    (label_encoder_1, label_encoder_2, label_encoder_3),
):
    geo[geo_column] = level_encoder.fit_transform(geo[geo_column])

In [None]:
class Encoder(torch.nn.Module):
    """Embed three integer id columns and project them to one dense code.

    Args:
        lv1_shape, lv2_shape, lv3_shape: number of distinct ids per level
            (size of each embedding table).
        dim_1, dim_2, dim_3: embedding width per level.
        enc_shape: width of the output code.
    """

    def __init__(self, lv1_shape, lv2_shape, lv3_shape, dim_1, dim_2, dim_3, enc_shape):
        super().__init__()

        # Layers are created in the same order as before so that seeded
        # initialisation and state_dict layout are unchanged.
        self.encode_lv1 = torch.nn.Embedding(num_embeddings=lv1_shape, embedding_dim=dim_1)
        self.encode_lv2 = torch.nn.Embedding(num_embeddings=lv2_shape, embedding_dim=dim_2)
        self.encode_lv3 = torch.nn.Embedding(num_embeddings=lv3_shape, embedding_dim=dim_3)
        self.relu = torch.nn.ReLU()
        self.encoder = torch.nn.Linear(in_features=dim_1 + dim_2 + dim_3, out_features=enc_shape)

    def forward(self, x):
        """Encode ``x``, a 2-D integer tensor whose columns 0, 1, 2 hold the level ids.

        Returns the ReLU-activated code with ``enc_shape`` features per row.
        """
        embedders = (self.encode_lv1, self.encode_lv2, self.encode_lv3)
        embedded = [embed(x[:, column]) for column, embed in enumerate(embedders)]

        # Concatenate along the feature axis, then ReLU -> Linear -> ReLU.
        hidden = self.relu(torch.cat(embedded, dim=1))
        return self.relu(self.encoder(hidden))


class Decoder(torch.nn.Module):
    """Map a dense code back to one logit vector per geo level.

    Args:
        lv1_shape, lv2_shape, lv3_shape: number of output logits per level.
        enc_shape: width of the incoming code.
    """

    def __init__(self, lv1_shape, lv2_shape, lv3_shape, enc_shape):
        super().__init__()

        # Three independent linear heads reading the same code.
        self.decode_lv1 = torch.nn.Linear(in_features=enc_shape, out_features=lv1_shape)
        self.decode_lv2 = torch.nn.Linear(in_features=enc_shape, out_features=lv2_shape)
        self.decode_lv3 = torch.nn.Linear(in_features=enc_shape, out_features=lv3_shape)

    def forward(self, x):
        """Return the tuple ``(level-1 logits, level-2 logits, level-3 logits)`` for code ``x``."""
        return self.decode_lv1(x), self.decode_lv2(x), self.decode_lv3(x)


class Autoencoder(torch.nn.Module):
    """Encoder followed by Decoder over the three geo-level id columns.

    Args:
        lv1_shape, lv2_shape, lv3_shape: number of distinct ids per level.
        enc_shape: width of the bottleneck code.
        dim_1, dim_2, dim_3: embedding width per level inside the encoder.
    """

    def __init__(self, lv1_shape, lv2_shape, lv3_shape, enc_shape, dim_1, dim_2, dim_3):
        super().__init__()

        # Arguments shared by both halves.
        shared_kwargs = {
            'lv1_shape': lv1_shape,
            'lv2_shape': lv2_shape,
            'lv3_shape': lv3_shape,
            'enc_shape': enc_shape,
        }

        self.encoder = Encoder(dim_1=dim_1, dim_2=dim_2, dim_3=dim_3, **shared_kwargs)
        self.decoder = Decoder(**shared_kwargs)

    def forward(self, x):
        """Encode ``x`` and return the decoder's three logit tensors."""
        code = self.encoder(x)
        return self.decoder(code)

## Training

### Prepare dataset

In [None]:
# Autoencoder training pairs: the label-encoded geo ids are both the input
# and the reconstruction target.
dataset = TensorDataset(
    torch.as_tensor(np.array(geo), dtype=torch.long),  # model input
    torch.as_tensor(np.array(geo), dtype=torch.long),  # reconstruction target
)

# Fixed-size batches, iterated in dataset order.
dataloader = DataLoader(dataset, batch_size=128)

### Define criteria

In [None]:
# Balanced class weights for the three reconstruction losses.
#
# compute_class_weight returns one weight per entry of `classes`, in that
# order, while CrossEntropyLoss treats weight[i] as the weight of class
# index i. `classes` must therefore be sorted (np.unique) so that position i
# really is label-encoded id i; Series.unique() would give order of first
# appearance and silently attach weights to the wrong classes.
geo_lv1_weights = compute_class_weight('balanced', classes=np.unique(geo['geo_level_1_id'].values),
                                       y=geo['geo_level_1_id'].values)
geo_lv2_weights = compute_class_weight('balanced', classes=np.unique(geo['geo_level_2_id'].values),
                                       y=geo['geo_level_2_id'].values)
geo_lv3_weights = compute_class_weight('balanced', classes=np.unique(geo['geo_level_3_id'].values),
                                       y=geo['geo_level_3_id'].values)

# One weighted cross-entropy per geo level; weights live on DEVICE as float32.
criterion_1 = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(geo_lv1_weights).type(torch.float).to(DEVICE))
criterion_2 = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(geo_lv2_weights).type(torch.float).to(DEVICE))
criterion_3 = torch.nn.CrossEntropyLoss(weight=torch.from_numpy(geo_lv3_weights).type(torch.float).to(DEVICE))

### Train model

In [None]:
# Build the autoencoder; table sizes come from the label-encoded geo frame.
model = Autoencoder(
    lv1_shape=geo['geo_level_1_id'].nunique(),
    lv2_shape=geo['geo_level_2_id'].nunique(),
    lv3_shape=geo['geo_level_3_id'].nunique(),
    enc_shape=16,
    dim_1=8,
    dim_2=16,
    dim_3=32
).to(DEVICE)

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# Halve the learning rate after 10 epochs without improvement of the epoch loss.
scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=10)

model.train()

best_loss = float('inf')
# Copy of the parameters at the best epoch. Keeping a reference to `model`
# itself would not work: the optimizer keeps mutating that same object, so
# the "best" model would always be the last one.
best_state = None

num_epochs = 10000
patience = 50  # early-stopping patience, in epochs
epochs_no_improvement = 0

for epoch in range(num_epochs):
    print(f'EPOCH: {epoch + 1}')
    training_loss = 0.0
    for x, y in tqdm(dataloader, desc="training"):
        x = x.to(DEVICE)
        y = y.to(DEVICE)

        optimizer.zero_grad()
        x1, x2, x3 = model(x)

        # Sum of the three per-level reconstruction losses.
        loss = criterion_1(x1, y[:, 0]) + criterion_2(x2, y[:, 1]) + criterion_3(x3, y[:, 2])

        loss.backward()
        optimizer.step()

        training_loss += loss.item()

    # Each loss.item() is already a per-batch mean, so average over the number
    # of batches (not the number of samples). This only rescales the reported
    # value; the comparisons below and the scheduler's default relative
    # threshold are unaffected by the scale.
    training_loss /= len(dataloader)
    print(f'{training_loss = }')

    scheduler.step(training_loss)

    if training_loss < best_loss:
        best_loss = training_loss
        # Snapshot the weights (cloned tensors, detached from the live model).
        best_state = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
        epochs_no_improvement = 0
    else:
        epochs_no_improvement += 1
        if epochs_no_improvement >= patience:
            print(f'{patience} epochs without improvement, stopping training.')
            break

# Roll the model back to its best epoch before exposing it as `best_model`.
if best_state is not None:
    model.load_state_dict(best_state)
best_model = model

print(f'Training complete. Best loss: {best_loss}')

## Save model

In [None]:
# Persist only the encoder half of the autoencoder. The whole module object is
# pickled (not just a state_dict), so loading it needs the Encoder class defined.
joblib.dump(best_model.encoder, 'encoder_weights_81632.pkl')

# Prepare data

In [None]:
# Reload the pickled encoder module and place it on DEVICE
# (Module.to returns the module itself).
encoder = joblib.load('encoder_weights_81632.pkl').to(DEVICE)

# Inference mode; the trailing semicolon suppresses the notebook's repr output.
encoder.eval();

In [None]:
class RemoveLowCountCategories(BaseEstimator, TransformerMixin, ClassNamePrefixFeaturesOutMixin):
    """Replace rarely occurring category values with a placeholder.

    ``fit`` records, for every column of the fitted frame, the values that
    occur strictly more than ``threshold`` times. ``transform`` overwrites
    every other value in those columns with ``replace_value``.

    When both ``geo_level_2_id`` and ``geo_level_3_id`` are present, rows
    whose ``geo_level_2_id`` ended up as ``replace_value`` also get their
    ``geo_level_3_id`` set to ``replace_value``.

    Args:
        threshold: a value is kept only if its count is greater than this.
        replace_value: placeholder written in place of dropped values.
    """

    def __init__(self, threshold, replace_value):
        super().__init__()
        self.threshold = threshold
        self.replace_value = replace_value
        # Filled by fit(): column name -> Index of values to keep.
        self.keep_dict = None

    def fit(self, X, y=None):
        """Record the sufficiently frequent values of each column of ``X``.

        Raises:
            TypeError: if ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(f'X must be a pandas DataFrame, got {type(X).__name__}')

        keep_dict = {}
        for column in X.columns:
            # Count once per column and reuse the result for the mask.
            counts = X[column].value_counts()
            keep_dict[column] = counts.index[counts > self.threshold]
        self.keep_dict = keep_dict

        return self

    def transform(self, X):
        """Return a copy of ``X`` with rare values replaced by ``replace_value``.

        Raises:
            TypeError: if ``X`` is not a pandas DataFrame.
            RuntimeError: if called before ``fit``.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError(f'X must be a pandas DataFrame, got {type(X).__name__}')
        if self.keep_dict is None:
            raise RuntimeError('RemoveLowCountCategories must be fitted before calling transform')

        out = X.copy()

        for column, keep_categories in self.keep_dict.items():
            # Assign the column back explicitly. An in-place replace on
            # out[column] is a chained assignment that pandas may apply to a
            # temporary copy, leaving `out` unchanged.
            keep_mask = out[column].isin(keep_categories)
            out[column] = out[column].where(keep_mask, self.replace_value)

        # A level-3 id is only meaningful under a known level-2 id: blank it
        # whenever the level-2 id was replaced. Vectorised mask instead of a
        # row-wise apply; skipped when the geo columns are not in the frame.
        if {'geo_level_2_id', 'geo_level_3_id'}.issubset(out.columns):
            level_2_removed = out['geo_level_2_id'] == self.replace_value
            out.loc[level_2_removed, 'geo_level_3_id'] = self.replace_value

        return out

In [None]:
# Get geo level columns (explicit copy so the assignments below neither touch
# train_x nor trigger SettingWithCopyWarning)
geo_train = train_x[geo_level_columns].copy()

# Label encoding with the encoders fitted on train+test geo ids
geo_train["geo_level_1_id"] = label_encoder_1.transform(geo_train["geo_level_1_id"])
geo_train["geo_level_2_id"] = label_encoder_2.transform(geo_train["geo_level_2_id"])
geo_train["geo_level_3_id"] = label_encoder_3.transform(geo_train["geo_level_3_id"])

# Auto encoding: pure inference, so no autograd graph is needed
geo_train_tensor = torch.from_numpy(np.array(geo_train)).type(torch.long).to(DEVICE)
with torch.no_grad():
    geo_train = pd.DataFrame(encoder(geo_train_tensor).detach().cpu().numpy())
train_x_modified = pd.concat([train_x, geo_train], axis=1)

# Fix columns type (the encoder outputs are named 0..N-1 as integers)
train_x_modified.columns = train_x_modified.columns.astype(str)

# Remove low count geo id.
# The filter is applied to the RAW geo ids in train_x_modified, so it must be
# fitted on raw ids too. `geo` holds label-encoded codes at this point; counts
# learned on those codes only match raw ids if the encoding is the identity.
remove_low_count = RemoveLowCountCategories(3, -1)
remove_low_count.fit(pd.concat([train_x[geo_level_columns], test_x[geo_level_columns]]))
train_x_modified.loc[:, geo_level_columns] = remove_low_count.transform(train_x_modified[geo_level_columns])

# One-hot encoding (fitted here on the training data only)
one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', min_frequency=1, sparse_output=False)
encoded_categorical = one_hot_encoder.fit_transform(train_x_modified[categorical_columns])
encoded_categorical = pd.DataFrame(
    encoded_categorical,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns)
)
train_x_modified.drop(categorical_columns, axis=1, inplace=True)
train_x_modified = pd.concat([train_x_modified, encoded_categorical], axis=1)

# Drop building id from both features and labels
train_x_modified.drop('building_id', axis=1, inplace=True)
train_y_modified = train_y.drop('building_id', axis=1)

# Hyperparameter search

In [None]:
# Split train test: hold out 30% of the training data for scoring Optuna trials.
# The split is seeded so the hold-out set, and therefore the trial scores, are
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    train_x_modified, train_y_modified, test_size=0.3, random_state=37
)


def objective(trial):
    """Optuna objective: fit an XGBoost classifier and score it on the hold-out set.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Micro-averaged F1 of the predictions on ``X_test`` / ``y_test``.
    """
    params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'eval_metric': 'auc',
        'booster': 'gbtree',
        # Follow the notebook-wide DEVICE switch instead of hard-coding the GPU
        # builder, which fails on machines without CUDA.
        'tree_method': 'gpu_hist' if DEVICE == 'cuda' else 'hist',
        'max_depth': trial.suggest_int('max_depth', 10, 13),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'n_estimators': trial.suggest_int('n_estimators', 800, 1000),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 0.9),
        'eta': trial.suggest_float('eta', 0.01, 0.1),
        'gamma': trial.suggest_float('gamma', 0.7, 1.0),
        'min_child_weight': trial.suggest_float('min_child_weight', 5, 7),
        'random_state': 37
    }

    xgb_model = xgb.XGBClassifier(**params)
    xgb_model.fit(X_train, y_train)

    y_pred = xgb_model.predict(X_test)
    return f1_score(y_test, y_pred, average='micro')


# Maximise the objective's hold-out micro-F1 over 100 trials, running up to
# 3 trials concurrently.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100, n_jobs=3)

# Report the best score and the hyperparameters that produced it.
print(f"Best trial: {study.best_trial.value}")
print(f"Best parameters: {study.best_trial.params}")

# Train model

In [None]:
# Final XGBoost configuration. The tuned values all lie inside the ranges
# searched by `objective` (presumably copied from the best Optuna trial -- confirm).
params = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'auc',
    'booster': 'gbtree',
    # Follow the notebook-wide DEVICE switch instead of hard-coding the GPU
    # builder, which fails on machines without CUDA.
    'tree_method': 'gpu_hist' if DEVICE == 'cuda' else 'hist',
    'max_depth': 12,
    'subsample': 0.7134570758579321,
    'n_estimators': 958,
    'colsample_bytree': 0.7145579796503638,
    'eta': 0.024867991827546986,
    'gamma': 0.8898009647421944,
    'min_child_weight': 6.13488658574256
}


class TqdmCallback(TrainingCallback):
    """XGBoost training callback that drives a tqdm progress bar.

    Args:
        total: number of iterations the bar should expect.
    """

    def __init__(self, total):
        super().__init__()
        # The bar is created (and shown) as soon as the callback is constructed.
        self.pbar = tqdm(desc="Training", total=total)

    def after_iteration(self, model, epoch, evals_log):
        """Advance the bar by one step; returning False lets training continue."""
        self.pbar.update()
        return False

    def after_training(self, model):
        """Close the bar and hand the model back unchanged."""
        self.pbar.close()
        return model


# Fit the final classifier on the full prepared training set; the callback's
# bar is sized to n_estimators. Note: this rebinds `model`, which previously
# referred to the autoencoder.
model = xgb.XGBClassifier(**params, callbacks=[TqdmCallback(params['n_estimators'])])
model.fit(train_x_modified, train_y_modified)

# Submission

## Load data

In [None]:
# Fresh read of the test features (the same file was loaded into test_x earlier).
test_values = pd.read_csv(f'{DATA_DIR}/test_values.csv')

## Prepare data

In [None]:
# Get geo level columns (explicit copy so the assignments below neither touch
# test_values nor trigger SettingWithCopyWarning)
geo_test = test_values[geo_level_columns].copy()

# Label encoding with the encoders fitted on train+test geo ids
geo_test["geo_level_1_id"] = label_encoder_1.transform(geo_test["geo_level_1_id"])
geo_test["geo_level_2_id"] = label_encoder_2.transform(geo_test["geo_level_2_id"])
geo_test["geo_level_3_id"] = label_encoder_3.transform(geo_test["geo_level_3_id"])

# Auto encoding: pure inference, so no autograd graph is needed
geo_submission_tensor = torch.from_numpy(np.array(geo_test)).type(torch.long).to(DEVICE)
with torch.no_grad():
    geo_test = pd.DataFrame(encoder(geo_submission_tensor).detach().cpu().numpy())
test_values_modified = pd.concat([test_values, geo_test], axis=1)

# Remove low count geo id (reuses the filter fitted during training prep)
test_values_modified.loc[:, geo_level_columns] = remove_low_count.transform(
    test_values_modified[geo_level_columns])

# One-hot encoding.
# Only transform here: refitting on the test set would rebuild the category
# list from test data, so the produced columns could differ from the ones the
# model was trained on.
test_encoded_categorical = one_hot_encoder.transform(
    test_values_modified[categorical_columns])
test_encoded_categorical = pd.DataFrame(
    test_encoded_categorical,
    columns=one_hot_encoder.get_feature_names_out(categorical_columns)
)
test_values_modified.drop(categorical_columns, axis=1, inplace=True)
test_values_modified = pd.concat([test_values_modified, test_encoded_categorical], axis=1)

# Drop building id
test_values_modified.drop('building_id', axis=1, inplace=True)

# Fix columns type (the encoder outputs are named 0..N-1 as integers)
test_values_modified.columns = test_values_modified.columns.astype(str)

## Make prediction

In [None]:
# Predict on the prepared test features and wrap the result in a
# one-column frame (the column is named 0).
sub_pred = pd.DataFrame(model.predict(test_values_modified))

# Fill the submission template, undoing the "-1" applied to the training
# labels, and write it without the index column.
submission = pd.read_csv(f'{DATA_DIR}/submission_format.csv')
submission['damage_grade'] = sub_pred[0] + 1
submission.to_csv('submission.csv', index=False)