In [None]:
import pandas as pd
from pathlib import Path
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from scipy import stats

In [5]:
# Column-name groups. Each list holds "V<n>" column labels and is looked up
# by name from other cells, so the identifiers (including the spelling of
# CONTINOUS_COLUMNS) must stay as they are.

# Columns treated as continuous numeric values.
CONTINOUS_COLUMNS = [
    'V4',
    'V16',
    'V19',
    'V20',
    'V22',
]

# Columns treated as discrete numeric values.
DISCRETE_COLUMNS = [
    'V5',
    'V6',
]

# Columns treated as categorical (nominal) values.
NOMINAL_COLUMNS = [
    'V1', 'V2',
    'V7', 'V8', 'V9', 'V10',
    'V11', 'V12', 'V13', 'V14', 'V15',
    'V17', 'V18',
    'V21', 'V23',
    'V25', 'V26', 'V27', 'V28',
]

# Split of the continuous columns into two sub-groups; together they list
# every entry of CONTINOUS_COLUMNS exactly once.
# NOTE(review): the names suggest a grouping by distribution shape - confirm.
BELL_CURVE_TYPE_COLUMNS = [
    "V4",
    "V19",
]

NOT_BELL_CURVE_TYPE_COLUMNS = [
    "V16",
    "V20",
    "V22",
]

In [None]:
# --- Load the raw table ------------------------------------------------------
data_path = 'data/exam_results_train.csv'
# header=4 makes the fifth line of the file the header row and skips the lines
# above it; the parsed header names are then overwritten with V1..V28 below.
df = pd.read_csv(data_path, header=4)

# The file marks missing values with a literal '?'; turn them into real NaNs.
df = df.replace('?', np.nan)

# Rename the columns positionally to V1..V28 (raises if the file does not have
# exactly 28 columns), then discard V3.
list_column_names = ["V" + str(i) for i in range(1, 29)]
df.columns = list_column_names
df = df.drop(columns=['V3'])

# --- Coerce numeric columns --------------------------------------------------
# A column that contained '?' is still object dtype after the replace above, so
# it has to be converted before mean()/median() can be taken. The discrete
# columns need this just as much as the continuous ones, otherwise median()
# can be called on strings. Unparseable entries become NaN and are imputed.
for col in CONTINOUS_COLUMNS + DISCRETE_COLUMNS:
    df[col] = pd.to_numeric(df[col], errors='coerce')

# --- Impute missing values ---------------------------------------------------
# Per-column replacement values: mode for nominal, median for discrete, mean
# for continuous columns. The three dicts are kept as separate names so the
# same values can be looked up again later.
fill_values_nominal = {col: df[col].mode()[0] for col in NOMINAL_COLUMNS}
fill_values_discrete = {col: df[col].median() for col in DISCRETE_COLUMNS}
fill_values_continuous = {col: df[col].mean(skipna=True) for col in CONTINOUS_COLUMNS}

# Fill everything in one call on the frame itself. The previous
# `df[col].fillna(..., inplace=True)` form is a chained in-place update: it
# emits a FutureWarning on pandas 2.x and leaves `df` unchanged once
# Copy-on-Write is active.
fill_values = {**fill_values_nominal, **fill_values_discrete, **fill_values_continuous}
df = df.fillna(value=fill_values)

# Outlier handling for the continuous columns: every value whose absolute
# z-score is greater than 3 is overwritten with the column mean.
# outlier_info[col] records the replacement value and the affected row labels.
outlier_info = {}
# Not populated in this cell; left defined so the name stays available.
zscore_info = {}
for col in CONTINOUS_COLUMNS:
    # Make the column float first so that writing the (float) mean back below
    # never needs an implicit dtype upcast of an integer column.
    df[col] = df[col].astype(float)

    # Z-scores are kept in a local array instead of being added to and then
    # dropped from `df` on every iteration.
    col_zscores = np.asarray(stats.zscore(df[col]))

    # Row labels whose |z| exceeds 3 (NaN z-scores compare False and are skipped).
    outlier_indices = df.index[np.abs(col_zscores) > 3]

    # The replacement is the column MEAN, computed while the outliers are
    # still present in the column.
    mean_value = df[col].mean()
    outlier_info[col] = {'outlier_replacement': mean_value, 'outlier_indices': list(outlier_indices)}

    df.loc[outlier_indices, col] = mean_value

# One-hot encode every nominal column for the ML model.
#   onehot_encoders : column name -> the OneHotEncoder fitted on that column
#   new_columns     : names of all generated indicator columns, in order
onehot_encoders = {}
new_columns = []
encoded_frames = []

for col in NOMINAL_COLUMNS:
    # handle_unknown='ignore': categories not seen during fit encode as all zeros.
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')

    new_data = encoder.fit_transform(df[col].to_numpy().reshape(-1, 1))

    feature_names = encoder.get_feature_names_out([col])
    new_columns.extend(feature_names)

    # Reuse df's own index so the indicator rows line up with the source rows
    # even if df's index is not a plain 0..n-1 range.
    new_df = pd.DataFrame(new_data, columns=feature_names, index=df.index)
    encoded_frames.append(new_df)

    onehot_encoders[col] = encoder

# Swap the original nominal columns for their indicator columns in a single
# concat (instead of growing df once per column inside the loop). Column order
# is unchanged: remaining original columns first, then the indicators in
# NOMINAL_COLUMNS order.
df = pd.concat([df.drop(columns=NOMINAL_COLUMNS), *encoded_frames], axis=1)

# Rescale every column currently in df to [0, 1] and remember the fitted
# scaler of each column in min_max_scaler_dict (column name -> MinMaxScaler).
# NOTE(review): the loop runs over all of df.columns, so V24 (read as the
# target right after this step) is scaled too, and the scalers are fitted
# before the train/test split - confirm both are intended.
min_max_scaler_dict = {}
for col in df.columns:
    # A fresh scaler per column. Refitting one shared instance would leave
    # every dict entry pointing at the same object, fitted to the last column.
    min_max_scaler = MinMaxScaler()
    df[col] = min_max_scaler.fit_transform(df[[col]])
    min_max_scaler_dict[col] = min_max_scaler

# Separate the feature matrix from the target column V24.
X = df.drop(columns="V24")
y = df["V24"]

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.10,
    random_state=42,
)

# Fit a small boosted-tree classifier (10 estimators, depth at most 4).
xgb = XGBClassifier(max_depth=4, n_estimators=10)
xgb.fit(X_train, y_train)

# Predict on both partitions and score each against its true labels.
y_train_pred, y_test_pred = xgb.predict(X_train), xgb.predict(X_test)

train_accuracy = accuracy_score(y_train, y_train_pred)
test_accuracy = accuracy_score(y_test, y_test_pred)

for label, score in (("Train Accuracy: ", train_accuracy),
                     ("Test Accuracy: ", test_accuracy)):
    print(label, score)