In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import chi2_contingency
import numpy as np

Load data. Drop duplicate rows

In [None]:
# Read the first training file (semicolon-delimited) and immediately discard rows that
# are exact duplicates across every column, 'id' included (370 such rows were observed).
df1 = pd.read_csv("Training_part1.csv", sep=";").drop_duplicates()
df1

In [None]:
# Read the second training file (semicolon-delimited) and immediately discard rows that
# are exact duplicates across every column, 'id' included (370 such rows were observed).
df2 = pd.read_csv("Training_part2.csv", sep=";").drop_duplicates()
df2

In [None]:
# Combine both parts on the shared 'id' key. An outer join keeps ids that appear in only
# one of the two files (their columns from the other file become NaN).
df_joint = df1.merge(df2, how='outer', on='id')
df_joint

In [None]:
# 'id' was only needed as the join key; remove it from the working frame.
df = df_joint.drop(columns=['id'])
df

Check class imbalance

In [None]:
# Count samples per class so both sides of the imbalance are visible at once
# (roughly 300 negatives vs. 3400 positives were observed).
# dropna=False also surfaces rows whose label is missing, e.g. ids that the outer
# join found in only one of the two input files.
df['Class'].value_counts(dropna=False)

Check missing values

In [None]:
# Per-column count of missing entries.
# Observed: several columns with a few dozen NaN values, RAS with more than 2000.
df.isnull().sum()

Test collinearity of FAN and NUS

In [None]:
# NUS and FAN looked collinear on visual inspection; quantify it with the
# Pearson correlation coefficient.
df.loc[:, "FAN"].corr(other=df.loc[:, "NUS"], method="pearson")

In [None]:
# FAN and NUS are collinear, so NUS is removed to avoid multicollinearity issues.
df = df.drop(columns=["NUS"])

Test collinearity of ERG and GJAH, RAS and XIN

In [None]:
def compute_cramers_v(x, y):
    """Return Cramér's V, a [0, 1] association measure for two categorical Series.

    Parameters
    ----------
    x, y : pandas.Series
        Categorical variables of equal length. Rows where either value is
        missing are not counted (``pd.crosstab`` leaves them out).

    Returns
    -------
    float
        Cramér's V, or NaN when either variable has fewer than two observed
        categories (the statistic is undefined in that case).
    """
    contingency = pd.crosstab(x, y)
    r, k = contingency.shape
    min_dim = min(r - 1, k - 1)
    if min_dim < 1:
        # Degenerate table: avoid a division by zero below.
        return float("nan")
    # Yates' continuity correction (scipy's default for 2x2 tables) shrinks chi2 and
    # therefore biases V downwards; disable it so perfect association yields V = 1.
    chi2 = chi2_contingency(contingency, correction=False)[0]
    n = contingency.sum().sum()
    phi2 = chi2 / n
    return np.sqrt(phi2 / min_dim)


# ERG/GJAH and RAS/XIN both looked collinear on visual inspection; check both pairs.
# Cramér's V depends only on the contingency counts, so the category labels of the
# two variables do not need to be renamed to match each other.
for first, second in (("ERG", "GJAH"), ("RAS", "XIN")):
    print(f"Cramér's V {first} vs {second}: {compute_cramers_v(df[first], df[second]):.3f}")

In [None]:
# GJAH is dropped because it is collinear with ERG.
# RAS is dropped because it is collinear with XIN and is the noisier of the two
# (it has many missing values).
df = df.drop(columns=["GJAH", "RAS"])

Plot class-conditional univariate distributions to check for clear hints of which features are discriminative, which features aren't. (Numerical features)

Note: To make this analysis statistically rigorous I would run ANOVA and Kruskal-Wallis tests for each combination of one feature and the class label, to judge whether there is a statistically significant association between the two variables. Plotting the class-conditional univariate distributions is an inexact but faster way to assess this.

BIB: Class-conditional distributions look considerably different, feature is discriminative. Looks like the numerical feature with most different class-conditional distributions

FAN: Class-conditional distributions look somewhat different, feature has some discriminative power

SIS: Class-conditional distributions look somewhat different, feature has some discriminative power

LUK: class 'y' has very extreme outliers but, apart from those, class-conditional distributions are nearly identical. No significant discriminative power.

UIN: class-conditional distributions are nearly identical. No significant discriminative power.

WET: class-conditional distributions are nearly identical. No significant discriminative power.


In [None]:
# Univariate class-conditional histograms for class 'y', as a quick hint of which
# numerical features might be discriminative. ANOVA / Kruskal-Wallis tests would be
# needed to make this precise.
class_feat = 'Class'
class_value = 'y'
# DataFrame.hist only plots numerical columns; categorical ones are handled separately.
df.loc[df[class_feat] == class_value].hist(bins=50)

In [None]:
# Same numerical histograms, now restricted to class 'n', for side-by-side comparison.
class_value = 'n'
df.loc[df[class_feat] == class_value].hist(bins=50)

Plot class-conditional univariate distributions to check for clear hints of which features are discriminative, which features aren't. (Categorical features)

Note: To make this analysis statistically significant I would run Chi-Squared and Cramer's V tests for each combination of one feature and the class label, to judge whether there's significant statistical association between the two variables. Plotting the class-conditional univariate distributions is an inexact but faster way to assess this.

COD: Class-conditional distributions are identical. No discriminative power.

ERG: Class-conditional distributions are different. Feature has some discriminative power.

MYR: Some categories of MYR have very different sample frequencies per class. Feature has some discriminative power; it would need to be converted to a one-hot encoded variable, or a set of binary variables, to be used in a decision tree classifier.

PKD: same observations as MYR.

TOK: Class-conditional distributions are somewhat different. Feature has some discriminative power.

VOL: Class-conditional distributions are very different. Feature has strong discriminative power.

XIN: Class-conditional distributions are very different. Feature has strong discriminative power.

KAT: Class-conditional distributions are only slightly different. Feature has little discriminative power.




In [None]:
# Bar chart of category frequencies for one categorical feature within class 'y'.
# VOL, XIN and ERG look discriminative; Chi-squared / Cramér's V tests could confirm this.
covariate_name = "KAT"
target = "y"
values_in_class = df.loc[df[class_feat] == target, covariate_name]
# Keep the observed categories (missing values excluded) in sorted order so the
# bars line up between the per-class plots.
bins = np.sort(np.array([v for v in values_in_class.unique() if str(v).lower() != 'nan']))
values_in_class.value_counts().loc[bins].plot.bar()

In [None]:
# Same categorical bar chart for the currently selected feature, now for class 'n'.
target = "n"
values_in_class = df.loc[df[class_feat] == target, covariate_name]
# Observed categories (missing values excluded), sorted for a stable bar order.
bins = np.sort(np.array([v for v in values_in_class.unique() if str(v).lower() != 'nan']))
values_in_class.value_counts().loc[bins].plot.bar()

VOL + XIN + ERG already enable quite good class-separation as seen from plots of class-conditional joint distribution of these three variables

In [None]:
# Build a joint categorical feature from VOL, XIN and ERG to inspect their combined
# class separation. These three look sufficient for quite decent separation; the best
# numerical variable ('BIB') will be added as well.
df['VOL_XIN_ERG'] = df['VOL'] + '_' + df['XIN'] + '_' + df['ERG']
covariate_name = "VOL_XIN_ERG"
target = "y"
values_in_class = df.loc[df[class_feat] == target, covariate_name]
# Observed joint categories (missing values excluded), sorted for a stable bar order.
bins = np.sort(np.array([v for v in values_in_class.unique() if str(v).lower() != 'nan']))
values_in_class.value_counts().loc[bins].plot.bar()

In [None]:
# Joint VOL/XIN/ERG category frequencies for class 'n', to compare against class 'y'.
target = "n"
values_in_class = df.loc[df[class_feat] == target, covariate_name]
# Observed joint categories (missing values excluded), sorted for a stable bar order.
bins = np.sort(np.array([v for v in values_in_class.unique() if str(v).lower() != 'nan']))
values_in_class.value_counts().loc[bins].plot.bar()

Impute missing ERG values

In [None]:
# Re-check missing values per column. Observed: 4 samples with missing ERG, all in the
# 'y' class; for now these get the class-conditional mode imputed.
df.isnull().sum()

In [None]:
# Impute missing ERG values with the most frequent ERG category among class-'y' samples
# (the missing values were all observed in the 'y' class).
# NOTE(review): this uses the label to impute a feature, which is not available at
# prediction time — confirm this is acceptable for the intended use.
erg_y_mode = df.loc[df["Class"] == 'y', "ERG"].mode()[0]
# Assign the result back instead of calling fillna(inplace=True) on df['ERG']:
# the chained in-place form acts on a temporary object, is deprecated in pandas 2.x
# and leaves df unchanged when Copy-on-Write is enabled.
df['ERG'] = df['ERG'].fillna(erg_y_mode)
df.isna().sum()

In [None]:
# Restrict the dataset to the features chosen for the classifier, plus the label.
column_list = ["BIB", "ERG", "VOL", "XIN", "Class"]
df_store = df.loc[:, column_list]
df_store

In [None]:
# Persist the selected columns without the row index.
df_store.to_csv(path_or_buf="processed_data.csv", index=False)

Dataset is tabular, with both numerical and categorical features. Good class separation seems achievable with model including only small subset of features. Based on this, will use a Decision Tree Classifier.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Reload the feature subset written by the preprocessing stage.
df_processed = pd.read_csv(filepath_or_buffer="processed_data.csv")
df_processed

Transform categorical variables

In [None]:
# One-hot encode the multinomial ERG feature (all levels kept) and turn the 't'/'f'
# flags of VOL and XIN into booleans. Values other than 't'/'f' map to NaN.
flag_to_bool = {'t': True, 'f': False}
df_processed = (
    pd.get_dummies(df_processed, columns=["ERG"], drop_first=False)
    .assign(
        VOL=lambda frame: frame['VOL'].map(flag_to_bool),
        XIN=lambda frame: frame['XIN'].map(flag_to_bool),
    )
)
df_processed

In [None]:
# Persist the encoded dataset without the row index.
df_processed.to_csv(path_or_buf="transformed_data.csv", index=False)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelBinarizer
import matplotlib.pyplot as plt
import joblib

In [None]:
# Reload the encoded dataset written by the transformation stage.
df_transformed = pd.read_csv(filepath_or_buffer="transformed_data.csv")
df_transformed

In [None]:
# Separate the predictors from the label column.
df_features = df_transformed.drop(columns=['Class'])
df_target = df_transformed['Class']

In [None]:
# Display the label column ('Class') that was split off from df_transformed.
df_target

In [None]:
# Display the feature frame (df_transformed without the 'Class' column).
df_features

In [None]:
# Hold out 20% of the samples for evaluation. Stratifying on the label ensures the
# class proportions are preserved in both the train and the evaluation sets.
X_train, X_eval, y_train, y_eval = train_test_split(
    df_features,
    df_target,
    test_size=0.2,
    random_state=42,
    stratify=df_target,
)

Fit Decision Tree Classifier

In [None]:
# class_weight='balanced' weights samples inversely to class proportions to compensate
# for the class imbalance.
# max_depth=10 limits overfitting: with few features, several of them binary, a small
# depth should be sufficient.
tree_params = {
    'class_weight': 'balanced',
    'max_depth': 10,
    'random_state': 42,
}
model = DecisionTreeClassifier(**tree_params)
model.fit(X_train, y_train)

Plot accuracy, F1 and AUC. Since there's high class-imbalance, F1 and AUC metrics are more informative than accuracy.

In [None]:
# Evaluate on the held-out set with 'y' as the positive class throughout.
y_pred = model.predict(X_eval)
# Look up the predict_proba column for 'y' from model.classes_ rather than assuming it
# is column 1, so the scores always refer to the same class that F1 treats as positive.
pos_idx = list(model.classes_).index('y')
y_proba = model.predict_proba(X_eval)[:, pos_idx]
metrics = {
    'Accuracy': accuracy_score(y_eval, y_pred),
    'F1': f1_score(y_eval, y_pred, pos_label='y'),
    # Binarise the labels explicitly so AUC does not depend on implicit label ordering.
    'ROC-AUC': roc_auc_score(y_eval == 'y', y_proba),
}

In [None]:
# Bar chart of the three evaluation metrics on a common [0, 1] scale.
fig, ax = plt.subplots()
ax.bar(
    list(metrics.keys()),
    list(metrics.values()),
    color=['skyblue', 'lightgreen', 'salmon'],
)
ax.set_ylim(0, 1)
ax.set_title('Decision Tree Evaluation Metrics')
plt.show()

Plot ROC curve to see how model performance varies with decision threshold

In [None]:
# ROC curve for the positive class 'y', annotated with a handful of decision thresholds.
# The probability column is looked up from model.classes_ instead of assuming column 1.
pos_idx = list(model.classes_).index('y')
y_eval_bin = (y_eval == 'y').astype(int)
y_proba = model.predict_proba(X_eval)[:, pos_idx]
fpr, tpr, thresholds = roc_curve(y_eval_bin, y_proba)
auc = roc_auc_score(y_eval_bin, y_proba)

plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC curve (AUC = {auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--', alpha=0.5)  # chance-level diagonal

num_points = 5
# thresholds[0] is a sentinel (no sample predicted positive), not a real score, so the
# annotations start at index 1. np.unique removes repeated indices that integer
# rounding produces when there are fewer thresholds than num_points.
indices = np.unique(np.linspace(1, len(thresholds) - 1, num_points, dtype=int))
for i in indices:
    plt.text(fpr[i], tpr[i], f'{thresholds[i]:.2f}')

plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve with Thresholds')
plt.legend()
plt.grid(True, linestyle='--', alpha=0.5)
plt.show()


Save model

In [None]:
# Serialise the fitted decision tree to disk for later reuse.
joblib.dump(value=model, filename='decision_tree_classifier.joblib')