In [None]:
from itertools import combinations
from pathlib import Path

import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from matplotlib import pyplot as plt
from scipy.stats import fisher_exact, chi2_contingency, skew, kurtosis
from sklearn.ensemble import IsolationForest
from sklearn.feature_selection import SelectPercentile, mutual_info_classif, \
    mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, \
    f1_score, roc_auc_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder, \
    OrdinalEncoder, PowerTransformer, QuantileTransformer
from xgboost import XGBClassifier

In [None]:
# Data set that contains the target column; all statistics below are computed
# on it and the classifier is fitted on it.
df = pd.read_csv('./data/DontGetKicked/training.csv')
# NOTE: despite its name, df_train is loaded from test.csv. It is pre-processed
# in lockstep with df and later passed to clf.predict to build the submission.
df_train = pd.read_csv('./data/DontGetKicked/test.csv')

In [None]:
# Classify columns into categorical and numerical
categorical_columns = [
    'Auction', 'Make', 'Model', 'Trim', 'SubModel', 'Color', 
    'VehYear', 'Transmission', 'WheelType', 'Nationality', 'Size', 
    'TopThreeAmericanName', 'PRIMEUNIT', 'AUCGUART', 'VNST',
    'WheelTypeID', 'VNZIP1', 'IsOnlineSale', 'PurchDate'
]
target_column = 'IsBadBuy'

# Everything that is neither categorical nor the target is treated as numerical.
# The list is built in df's own column order: a set difference would yield an
# arbitrary order that can change from one kernel start to the next (string
# hashing is randomised), making printed output and any order-dependent step
# non-reproducible.
numerical_columns = [
    column for column in df.columns
    if column not in categorical_columns and column != target_column
]

# Apply the same dtypes to both frames so later steps can treat them alike.
df[categorical_columns] = df[categorical_columns].astype('category')
df_train[categorical_columns] = df_train[categorical_columns].astype('category')
df[numerical_columns] = df[numerical_columns].astype('float')
df_train[numerical_columns] = df_train[numerical_columns].astype('float')


# Define helper functions
def drop_columns(df, columns):
    """Return ``df`` without ``columns`` and unregister them from the column lists.

    Side effect: for every name in ``columns`` the first matching entry is
    removed from the module-level ``categorical_columns`` and
    ``numerical_columns`` lists, keeping the bookkeeping in sync with the frame.
    Names that are not registered are silently skipped, so calling this once
    per frame with the same names is safe.

    Args:
        df: Frame to drop from. It is not modified; a new frame is returned.
        columns: Column names to drop. ``DataFrame.drop`` raises ``KeyError``
            if one of them is missing from ``df``.

    Returns:
        A new DataFrame without the given columns.
    """
    remaining = df.drop(columns=columns)
    for name in columns:
        for registry in (categorical_columns, numerical_columns):
            if name in registry:
                registry.remove(name)
    return remaining


def add_categorical_column(df, column, values):
    """Add (or overwrite) a categorical column and register it.

    The frame is modified in place and also returned.

    Args:
        df: Frame that receives the column.
        column: Name of the new column.
        values: Values to assign; they are cast to the ``category`` dtype.

    Returns:
        The same ``df`` object, for call-site symmetry with ``drop_columns``.
    """
    df[column] = values
    df[column] = df[column].astype('category')
    # The helper is called once per frame with the same column name; register
    # the name only once so loops over categorical_columns do not visit it twice.
    if column not in categorical_columns:
        categorical_columns.append(column)
    return df


def add_numerical_column(df, column, values):
    """Add (or overwrite) a float column and register it.

    The frame is modified in place and also returned.

    Args:
        df: Frame that receives the column.
        column: Name of the new column.
        values: Values to assign; they are cast to ``float``.

    Returns:
        The same ``df`` object, for call-site symmetry with ``drop_columns``.
    """
    df[column] = values
    df[column] = df[column].astype('float')
    # Register the name only once, even when the helper is called for several
    # frames, so numerical_columns never contains duplicates.
    if column not in numerical_columns:
        numerical_columns.append(column)
    return df


def plot_distributions(df, columns):
    """Plot a histogram with KDE overlay for each of ``columns``, two per row.

    Args:
        df: Frame holding the data.
        columns: Names of the columns to plot. If empty, nothing is drawn
            (a grid with zero rows cannot be created).

    Returns:
        None. The figure is created on the current matplotlib backend.
    """
    columns = list(columns)
    if not columns:
        return

    columns_per_row = 2
    # Ceiling division: an odd number of columns still gets a full last row.
    rows = (len(columns) + columns_per_row - 1) // columns_per_row
    # squeeze=False always yields a 2-D array of axes, whatever the grid shape.
    fig, axes = plt.subplots(rows, columns_per_row, figsize=(12, rows * 4), squeeze=False)
    axes = axes.flatten()

    for ax, column in zip(axes, columns):
        sns.histplot(df[column], kde=True, ax=ax)
        ax.set_title(f'Distribution of {column}')

    # Remove the unused trailing axes of the last row.
    for unused_ax in axes[len(columns):]:
        fig.delaxes(unused_ax)
    plt.tight_layout()


def nmad(df):
    """Median absolute deviation normalised by the magnitude of the median.

    Computes ``median(|x - median(x)|) / |median(x)|`` over the non-missing
    values. The notebook applies it column-wise, so ``df`` is a pandas Series
    despite the parameter name.

    Args:
        df: Numeric pandas Series; missing values are ignored.

    Returns:
        The normalised MAD as a float, or NaN when there are no non-missing
        values. A median of exactly zero is not special-cased: the division
        then follows NumPy semantics (inf or NaN).
    """
    values = df.dropna()
    if len(values) == 0:
        return np.nan
    median = np.median(values)
    # Divide by |median|: with a signed median, negative-centred data would get
    # a negative score and always look like "little variation".
    return np.median(np.abs(values - median)) / np.abs(median)


def cov(df):
    """Coefficient of variation of the non-missing values.

    Args:
        df: Numeric pandas object supporting ``dropna``.

    Returns:
        ``np.std`` of the remaining values divided by their ``np.mean``.
    """
    observed = df.dropna()
    spread = np.std(observed)
    centre = np.mean(observed)
    return spread / centre

In [None]:
# Drop unnecessary columns
unnecessary_columns = ['RefId', 'WheelTypeID', 'BYRNO',
             'VNZIP1', 'SubModel', 'Trim', 'Model',
             'VehYear'
             ]
df = drop_columns(df=df, columns=unnecessary_columns)
# df_train keeps RefId: it is written next to the predictions in the
# submission. A derived list avoids temporarily mutating unnecessary_columns.
df_train = drop_columns(
    df=df_train,
    columns=[column for column in unnecessary_columns if column != 'RefId'],
)

# Extract month from purchase date string (the text before the first '/')
purchase_month = df['PurchDate'].str.split('/').str[0]
df = add_categorical_column(df=df, column='PurchMonth', values=purchase_month)
df = drop_columns(df=df, columns=['PurchDate'])

purchase_month = df_train['PurchDate'].str.split('/').str[0]
df_train = add_categorical_column(df=df_train, column='PurchMonth', values=purchase_month)
df_train = drop_columns(df=df_train, columns=['PurchDate'])

# Convert wheel type to upper case. The vectorised string method leaves missing
# values untouched, whereas calling x.upper() on a NaN would raise; the result
# is cast back to 'category' because .str returns an object column.
df['WheelType'] = df['WheelType'].str.upper().astype('category')
df_train['WheelType'] = df_train['WheelType'].str.upper().astype('category')

# Show remaining columns (info() prints its report itself and returns None,
# so wrapping it in print() only adds a stray "None" line)
df.info()

In [None]:
# Define sensible value ranges and mark out-of-range-values as missing
numerical_column_ranges = {
    'VehicleAge': (0, 30),
    'VehOdo': (0, 120000),
    'VehBCost': (1000, 46000),
    'WarrantyCost': (400, 8000),
    'MMRAcquisitionAuctionAveragePrice': (800, 46000),
    'MMRAcquisitionAuctionCleanPrice': (1000, 46000),
    'MMRAcquisitionRetailAveragePrice': (1000, 46000),
    'MMRAcquisitonRetailCleanPrice': (1000, 46000),
    'MMRCurrentAuctionAveragePrice': (300, 46000),
    'MMRCurrentAuctionCleanPrice': (400, 46000),
    'MMRCurrentRetailAveragePrice': (800, 46000),
    'MMRCurrentRetailCleanPrice': (1000, 46000)
}
# Vectorised equivalent of "x if min <= x <= max else None": between() is
# inclusive on both ends and False for NaN, and where() turns every False
# position into NaN while keeping the float dtype.
for frame in (df, df_train):
    for column, (min_value, max_value) in numerical_column_ranges.items():
        in_range = frame[column].between(min_value, max_value)
        frame[column] = frame[column].where(in_range)

# Drop columns with little variation (normalised MAD below 0.1, measured on df
# only; the same columns are then removed from both frames)
mask = df[numerical_columns].apply(nmad) < 0.1
columns = mask[mask].index.to_list()
df = drop_columns(df=df, columns=columns)
df_train = drop_columns(df=df_train, columns=columns)

# Show column stats
for column in numerical_columns:
    print(df[column].describe())

In [None]:
# Replace non-sensible values (the same corrections are applied to both frames)
value_corrections = {
    'Make': {'TOYOTA SCION': 'SCION'},
    'Transmission': {'Manual': 'MANUAL'},
    'Color': {'NOT AVAIL': None},
}
for frame in (df, df_train):
    for column, correction in value_corrections.items():
        # Round-trip through 'object' so replace() may introduce values that
        # are not among the existing categories.
        frame[column] = frame[column].astype('object').replace(correction).astype('category')

# Replace rare values. Frequencies are measured on df only; the resulting
# mapping is applied to both frames so they stay consistent.
# dict.fromkeys gives an order-preserving, duplicate-free snapshot of the list.
for column in list(dict.fromkeys(categorical_columns)):
    # Categorize values with a share of at most 1% as 'OTHER'
    value_counts = df[column].value_counts()
    proba = value_counts / value_counts.sum()
    mask = proba <= 0.01
    replace_dict = {key: 'OTHER' for key in mask[mask].index.to_list()}
    df[column] = df[column].astype('object').replace(replace_dict).astype('category')
    df_train[column] = df_train[column].astype('object').replace(replace_dict).astype('category')
    # If frequency of 'OTHER' is too low, mark as missing
    num_other = (df[column] == 'OTHER').sum()
    if num_other / len(df[column]) < 0.01:
        df[column] = df[column].astype('object').replace({'OTHER': None}).astype('category')
        df_train[column] = df_train[column].astype('object').replace({'OTHER': None}).astype('category')

# Drop columns with little variation.
# Iterate over a snapshot: drop_columns() removes entries from
# categorical_columns, and mutating a list while looping over it would skip
# the column that follows each dropped one.
for column in list(dict.fromkeys(categorical_columns)):
    value_counts = df[column].value_counts()
    proba = value_counts / value_counts.sum()
    if (proba > 0.99).any():
        df = drop_columns(df=df, columns=[column])
        df_train = drop_columns(df=df_train, columns=[column])

# Drop columns with too many missing values, if not correlated with target
remaining_categoricals = list(dict.fromkeys(categorical_columns))
prob_na = df[remaining_categoricals].isna().sum() / len(df)
mask = prob_na > 0.2
na_columns = mask[mask].index.to_list()
for column in na_columns:
    # p_value = chi2_contingency(pd.crosstab(df[column], df[target_column])).pvalue
    # NOTE(review): crosstab ignores rows where the column is missing, and
    # fisher_exact is classically defined for 2x2 tables - confirm the
    # installed SciPy accepts the table shape if a column has more than two levels.
    p_value = fisher_exact(pd.crosstab(df[column], df[target_column])).pvalue
    if p_value > 0.01:
        df = drop_columns(df=df, columns=[column])
        df_train = drop_columns(df=df_train, columns=[column])

# Show column stats
for column in list(dict.fromkeys(categorical_columns)):
    print(df[column].value_counts())

In [None]:
# # Check for outliers
# # Copy data frame
# df_iso = df.drop(columns=[target_column]).copy()

# # Replace missing values
# for column in categorical_columns:
#     df_iso[column] = df_iso[column].fillna(df_iso[column].mode().iloc[0])
# for column in numerical_columns:
#     df_iso[column] = df_iso[column].fillna(df_iso[column].median())

# # Encode categorical values
# one_hot_encoder = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)
# one_hot_encoded = one_hot_encoder.fit_transform(df_iso[categorical_columns])
# df_iso = pd.concat([pd.DataFrame(one_hot_encoded, columns=one_hot_encoder.get_feature_names_out()), df_iso[numerical_columns].reset_index(drop=True)], axis=1) 

# # Scale values
# scaler = StandardScaler()
# X = scaler.fit_transform(df_iso)

# # Check for outliers using isolation forest
# clf = IsolationForest()
# outliers = clf.fit_predict(X)

# # Show ratio of outliers
# print((outliers == -1).sum() / outliers.shape[0] * 100)

# # Drop outliers
# df = df.drop(df[outliers == -1].index)

In [None]:
# # Show ratio of missing values
# print(df.isna().any(axis=1).sum() / len(df) * 100)

# # Impute missing values
# categorical_imputer = SimpleImputer(strategy='most_frequent')
# df[categorical_columns] = categorical_imputer.fit_transform(df[categorical_columns])
# numerical_imputer = SimpleImputer(strategy='median')
# df[numerical_columns] = numerical_imputer.fit_transform(df[numerical_columns])

# # Convert to categorical and float
# df[categorical_columns] = df[categorical_columns].astype('category')
# df[numerical_columns] = df[numerical_columns].astype('float')

In [None]:
# # Compute mutual information for all combinations of numerical columns
# column_combinations = list(combinations(numerical_columns, 2))
# mutual_info_scores = [mutual_info_regression(df[[column_1]].values, df[[column_2]].values.squeeze()) for column_1, column_2 in column_combinations]
# # Sort list descending
# mutual_info_columns = sorted([(list(columns), float(score[0])) for columns, score in zip(column_combinations, mutual_info_scores)], key=lambda x: x[1], reverse=True)
# # Iterate over 20% of the most correlated columns and choose column with the lowest variance to drop
# column_ratio = int(len(mutual_info_columns) * 0.2)
# columns_drop = []
# for columns, value in mutual_info_columns[:column_ratio]:
#     index = df[columns].var().argmin()
#     columns_drop.append(columns[index])
# columns_drop = list(set(columns_drop))
# df = drop_columns(df=df, columns=columns_drop)

# # Show remaining columns
# print(df.info())

In [None]:
# # Drop columns that have low correlation with the target
# # Numerical columns
# selector = SelectPercentile(score_func=mutual_info_classif, percentile=30)
# mask = selector.fit(df[numerical_columns], df[target_column]).get_support()
# unselected_columns = [column for column, drop in zip(numerical_columns, mask) if not drop]
# df = drop_columns(df=df, columns=unselected_columns)

# # Categorical columns
# encoder = OrdinalEncoder()
# selector = SelectPercentile(score_func=lambda X, y: mutual_info_classif(X, y, discrete_features=True), percentile=30)
# mask = selector.fit(encoder.fit_transform(df[categorical_columns]), df[target_column]).get_support()
# unselected_columns = [column for column, drop in zip(categorical_columns, mask) if not drop]
# df = drop_columns(df=df, columns=unselected_columns)

# # Show remaining columns
# df.info()

In [None]:
# # Transform non-normal distribution
# # Plot distributions before transformation
# plot_distributions(df, numerical_columns)

# # Print Skew and Kurtosis
# for column in df[numerical_columns]:
#     print(f'Column: {column}, Skew: {skew(df[column])}, Kurtosis: {kurtosis(df[column])}')

# # transformer = PowerTransformer()
# transformer = QuantileTransformer(output_distribution='normal')
# df[numerical_columns] = transformer.fit_transform(df[numerical_columns])

# # Plot distributions after transformation
# plot_distributions(df, numerical_columns)

In [None]:
# Split dataset in train and test data.
# Evaluating on the rows the model was fitted on reports memorisation rather
# than generalisation, so a stratified hold-out split is used for the metrics.
# The fixed random_state keeps the split reproducible across runs.
features = df.drop(columns=[target_column])
labels = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.2, stratify=labels, random_state=42
)

# Train an evaluation classifier and predict the held-out data
eval_clf = XGBClassifier(enable_categorical=True)
eval_clf.fit(X_train, y_train)
y_pred = eval_clf.predict(X_test)
# ROC AUC is a ranking metric: it needs continuous scores (probability of the
# positive class), not the thresholded 0/1 labels.
y_score = eval_clf.predict_proba(X_test)[:, 1]

# Compute metrics
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
confusion_mat = confusion_matrix(y_test, y_pred)

# Print metrics
print(f'Accuracy:{accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1: {f1}')
print(f'ROC AUC: {roc_auc}')
print('Confusion Matrix:')
print(f'{confusion_mat}')

# Final model for the submission: refit on all labelled rows, so `clf` is
# trained on the full data set as before.
clf = XGBClassifier(enable_categorical=True)
clf.fit(features, labels)

In [None]:
# Build the feature matrix for the submission (RefId is an identifier, not a feature).
X_submission = df_train.drop(columns=["RefId"])
# df and df_train were converted to 'category' independently, so the same value
# can carry a different integer code in each frame (e.g. when one frame contains
# a level the other lacks). Casting to the training dtype gives both frames the
# same categories; levels unseen in training become missing values.
for column in df.select_dtypes(include='category').columns:
    X_submission[column] = X_submission[column].astype(df[column].dtype)
y_pred = clf.predict(X_submission)

In [None]:
# Assemble the submission table: integer RefId next to the predicted target.
# Both pieces are joined column-wise on their index.
ref_ids = df_train["RefId"].astype(int)
predictions = pd.DataFrame(y_pred, columns=[target_column])
df_train = pd.concat([ref_ids, predictions], axis=1)

In [None]:
# Write the submission file. The output directory is created first so that a
# missing folder does not fail the run at the very last step.
submission_path = Path('../results/DontGetKicked/submission.csv')
submission_path.parent.mkdir(parents=True, exist_ok=True)
df_train.to_csv(submission_path, index=False)