In [1]:
# Install packages
import sys
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy
!{sys.executable} -m pip install catboost

You should consider upgrading via the '/Users/magnus/repos/TDT05-classification-project/venv/bin/python -m pip install --upgrade pip' command.
You should consider upgrading via the '/Users/magnus/repos/TDT05-classification-project/venv/bin/python -m pip install --upgrade pip' command.


In [None]:
# Imports
from pathlib import Path

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, metrics

# Fixed seed; passed as `random_seed` to CatBoostClassifier in the training cell.
RANDOM_SEED = 42  # For reproducibility

In [None]:
# Loading the data
# Both challenge CSVs are read with their first column used as the row index.
train_csv_path = '../input/tdt05-2021-challenge-2/challenge2_train.csv'
test_csv_path = '../input/tdt05-2021-challenge-2/challenge2_test.csv'

training_data, X_test = (
    pd.read_csv(csv_path, index_col=0)
    for csv_path in (train_csv_path, test_csv_path)
)
training_data

In [None]:
# Outlier removal by interquartile range (IQR) method
# Only the upper fence (Q3 + 1.5 * IQR) is applied: a row is dropped when any of
# the interval features exceeds its fence. Values below Q1 are never removed.
interval_features = ['f11', 'f17', 'f24', 'f28']
Q1 = training_data[interval_features].quantile(0.25)
Q3 = training_data[interval_features].quantile(0.75)
IQR = Q3 - Q1
upper_fence = Q3 + 1.5 * IQR

initial_row_count = training_data.shape[0]

# Compare just the interval columns with their own fences. `upper_fence` is a
# Series indexed by `interval_features`, so comparing the *whole* frame against
# it relies on implicit label alignment, which newer pandas releases reject for
# DataFrame-vs-Series comparisons with differing labels. Restricting the frame
# to the same columns keeps the labels identical and leaves every other column
# out of the comparison. Missing values compare as False, so rows with NaN in
# these columns are kept.
is_outlier = (training_data[interval_features] > upper_fence).any(axis=1)
training_data = training_data[~is_outlier]

rows_removed = initial_row_count - training_data.shape[0]
print(f'Rows removed: {rows_removed}')

In [None]:
# Imputing
# Every missing value in the training and test frames is replaced by the same
# sentinel value.
fill_value = -999

training_data, X_test = (
    frame.fillna(value=fill_value)
    for frame in (training_data, X_test)
)

In [None]:
# Convert numerical features to categorical
# The listed columns are cast to strings in both frames; all other columns keep
# their dtype.
categorical_numerical_features = ['f0', 'f6', 'f20']
string_dtypes = dict.fromkeys(categorical_numerical_features, str)

training_data = training_data.astype(string_dtypes)
X_test = X_test.astype(string_dtypes)

In [None]:
# Feature dropping
# Columns named in `dropped_features` are removed from both frames. The list is
# currently empty, so this step removes nothing.
dropped_features = []
initial_column_count = len(training_data.columns)

training_data = training_data.drop(columns=dropped_features)
X_test = X_test.drop(columns=dropped_features)

columns_removed = initial_column_count - len(training_data.columns)
print(f'Columns removed: {columns_removed}')

In [None]:
# Partition the data
# X holds every column except the label; y is a one-column frame with the label.
target_column = 'target'
X = training_data.drop(target_column, axis=1).copy()
y = training_data.loc[:, [target_column]].copy()

In [None]:
# Training
# Positional indices (into the columns of X) of the features handed to CatBoost
# as categorical. NOTE(review): the list is hard-coded, so it has to be kept in
# step with the column layout produced by the feature-dropping cell.
categorical_feature_indices_after_dropping = [
    0, 1, 2, 4, 6, 8, 9, 10, 12,
    13, 14, 15, 18, 20, 22, 23, 25, 26,
]

model_params = dict(
    custom_loss=[metrics.Accuracy()],
    random_seed=RANDOM_SEED,
    logging_level='Silent',
)
catboost_model = CatBoostClassifier(**model_params)

print('Training started...')
catboost_model.fit(X, y, cat_features=categorical_feature_indices_after_dropping)
print('Training completed.')

In [None]:
# Prediction
# `y_pred` holds one probability column per class. Column index 1 is written out
# as the 'target' score (presumably the positive class; confirm against
# catboost_model.classes_).
y_pred = catboost_model.predict(X_test, prediction_type='Probability')
predictions = pd.DataFrame({'id': X_test.index, 'target': y_pred[:, 1]})

path = '../output/catboost.txt'
# DataFrame.to_csv does not create missing directories, so make sure the output
# folder exists before writing; this is a no-op when it is already there.
Path(path).parent.mkdir(parents=True, exist_ok=True)
predictions.to_csv(path, index=False)
print(f'Saved results to file with path {path}.')

In [None]:
# SHAP analysis of the trained model on the test set.
# shap is imported here rather than with the other imports so that a missing
# shap installation cannot prevent training and prediction from running.
import shap

# NOTE(review): despite the name, nothing in this cell sorts these values, and
# the variable is not used by the plot below.
sorted_feature_importance = catboost_model.get_feature_importance()
explainer = shap.TreeExplainer(catboost_model)
shap_values = explainer.shap_values(X_test)
# Label the plot with the column (feature) names. X_test.index holds the row
# ids, which would mislabel every feature in the summary plot.
shap.summary_plot(
    shap_values,
    X_test,
    feature_names=X_test.columns.tolist(),
    max_display=X_test.shape[1],
)