# Case Study 2 - Multiple Runs

## 0 - Defining Basic Functionalities

In this notebook, as in **case2.ipynb**, we will explore the **wilt** dataset. As in **Case1.ipynb/Case1_stat.ipynb**, we will run the same experiment multiple times to check whether a preprocessing step improves the model's performance with statistical significance.

In [None]:
import pandas as pd
import numpy as np
import uci_dataset as database
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

import raimitigations.dataprocessing as dp

# Passed as the last positional argument of dp.train_model_fetch_results.
# NOTE(review): presumably toggles a ROC-based decision threshold — confirm.
USE_AUC_TH = False

# Metric keys read from every result dict by result_statistics/add_results_df.
RESULT_KEYS = ["roc", "pr", "rc", "f1"]
# Column names of the long-format results DataFrame built by add_results_df.
COL_METRIC = "Metric"
COL_VALUE = "Value"
COL_TEST = "Test Case"

# Identifiers accepted by feature_selection() through `feat_sel_type`.
SEQ_FWD = 0
SEQ_BWD = 1
CATBOOST = 2

# -----------------------------------
def remove_corr_feat(df, label_col):
	"""Run dp.CorrelatedFeatures over ``df`` and return the transformed data.

	The detector is fitted on ``df`` with ``label_col`` as the label column
	and then applied to the same ``df``. Presumably the result is ``df``
	without the features flagged as correlated — confirm against the
	raimitigations documentation.
	"""
	settings = {
		# Numerical x Numerical correlations
		"method_num_num": ["spearman", "pearson", "kendall"],
		"num_corr_th": 0.9,
		"num_pvalue_th": 0.05,
		# Numerical x Categorical correlations
		"method_num_cat": "model",
		"model_metrics": ["f1", "auc"],
		"metric_th": 0.9,
		# Categorical x Categorical correlations
		"cat_corr_th": 0.9,
		"cat_pvalue_th": 0.01,
		"save_json": False,
		"verbose": False,
	}
	detector = dp.CorrelatedFeatures(**settings)
	detector.fit(df=df, label_col=label_col)
	return detector.transform(df)


# -----------------------------------
def transform_num_data(train_x, test_x, scaler_ref, num_col):
	"""Fit a scaler on the training set and apply it to both sets.

	``scaler_ref`` is called with ``exclude_cols`` and ``verbose=False`` to
	build the scaler. When ``num_col`` is given, every column of ``train_x``
	that is not listed in it is passed as ``exclude_cols``; otherwise
	``exclude_cols`` is ``None``.

	Returns the tuple ``(transformed train_x, transformed test_x)``.
	"""
	excluded = None if num_col is None else [name for name in train_x.columns if name not in num_col]
	scaler = scaler_ref(exclude_cols=excluded, verbose=False)
	# Fit on the training data only, then reuse the fitted scaler for the test set.
	scaler.fit(train_x)
	return scaler.transform(train_x), scaler.transform(test_x)

# -----------------------------------
def feature_selection(train_x, train_y, test_x, feat_sel_type):
	"""Fit a feature selector on the training data and apply it to both sets.

	``feat_sel_type`` picks the selector: ``SEQ_FWD`` and ``SEQ_BWD`` build a
	dp.SeqFeatSelection (forward or backward respectively); any other value,
	including ``CATBOOST``, builds a dp.CatBoostSelection.

	Returns the tuple ``(transformed train_x, transformed test_x)``.
	"""
	if feat_sel_type in (SEQ_FWD, SEQ_BWD):
		feat_sel = dp.SeqFeatSelection(forward=(feat_sel_type == SEQ_FWD), n_jobs=4, verbose=False)
	else:
		feat_sel = dp.CatBoostSelection(verbose=False)
	# The selector is fitted on the training split only.
	feat_sel.fit(X=train_x, y=train_y)
	return feat_sel.transform(train_x), feat_sel.transform(test_x)

# -----------------------------------
def artificial_smote(train_x, train_y, strategy, under_sample):
	"""Rebalance the training data with dp.Rebalance.

	``strategy`` is forwarded as ``strategy_over`` and ``under_sample`` as
	``under_sampler``; over-sampling is always switched on.

	Returns the tuple ``(resampled train_x, resampled train_y)``.
	"""
	options = {
		"X": train_x,
		"y": train_y,
		"strategy_over": strategy,
		"over_sampler": True,
		"under_sampler": under_sample,
		"verbose": False,
	}
	sampler = dp.Rebalance(**options)
	resampled_x, resampled_y = sampler.fit_resample()
	return resampled_x, resampled_y

# -----------------------------------
def artificial_ctgan(train_x, train_y, strategy, savefile):
	"""Add synthetic instances to the training data using dp.Synthesizer.

	A "ctgan" synthesizer is created with 1000 epochs, fitted, and then used
	to transform ``train_x``/``train_y`` with the given ``strategy``.

	NOTE(review): ``load_existing=True`` together with ``save_file`` presumably
	makes later calls reuse a model already stored at ``savefile`` instead of
	retraining — confirm against the raimitigations documentation.

	Returns the tuple ``(train_x, train_y)`` produced by the synthesizer.
	"""
	options = {
		"X": train_x,
		"y": train_y,
		"epochs": 1000,
		"model": "ctgan",
		"load_existing": True,
		"save_file": savefile,
		"verbose": False,
	}
	generator = dp.Synthesizer(**options)
	generator.fit()
	augmented_x, augmented_y = generator.transform(X=train_x, y=train_y, strategy=strategy)
	return augmented_x, augmented_y

# -----------------------------------
# -----------------------------------
# -----------------------------------
def result_statistics(result_list):
	"""Regroup per-run results into one list of values per metric.

	``result_list`` holds one mapping per run; each must contain every key in
	``RESULT_KEYS``. The returned dict maps each of those keys to the list of
	its values, in run order.

	Every key in ``RESULT_KEYS`` is always present in the output, so an empty
	``result_list`` yields empty lists rather than an empty dict (which would
	make add_results_df fail with a KeyError).
	"""
	return {key: [result[key] for result in result_list] for key in RESULT_KEYS}


# -----------------------------------
def add_results_df(result_df, result_stat, test_name):
	"""Append the metrics of one test case to the long-format results frame.

	``result_stat`` maps every key in ``RESULT_KEYS`` to a list of values.
	Each value becomes one row with the columns ``COL_VALUE`` (cast to
	float), ``COL_TEST`` (always ``test_name``) and ``COL_METRIC``.

	If ``result_df`` is None the new rows are returned on their own;
	otherwise they are appended below ``result_df``. The index of the
	combined frame is renumbered so that rows coming from different test
	cases never share an index label.
	"""
	values = []
	metrics = []
	for metric in RESULT_KEYS:
		metric_values = result_stat[metric]
		values.extend(metric_values)
		metrics.extend([metric] * len(metric_values))

	new_df = pd.DataFrame(
		{
			COL_VALUE: values,
			COL_TEST: [test_name] * len(values),
			COL_METRIC: metrics,
		}
	)
	new_df[COL_VALUE] = new_df[COL_VALUE].apply(float)

	if result_df is None:
		return new_df

	return pd.concat([result_df, new_df], axis=0, ignore_index=True)


# -----------------------------------
def test_base(df, label_col, n_exec, model_name):
	"""Train and evaluate ``model_name`` on the unprocessed dataset ``n_exec`` times.

	Each run calls dp.split_data with ``test_size=0.25`` and then
	dp.train_model_fetch_results. The per-run results are regrouped by
	result_statistics and returned.
	"""
	runs = []
	for _ in range(n_exec):
		split = dp.split_data(df, label_col, test_size=0.25)
		train_x, test_x, train_y, test_y = split
		runs.append(dp.train_model_fetch_results(train_x, train_y, test_x, test_y, model_name, USE_AUC_TH))
	return result_statistics(runs)


# -----------------------------------
def test_corr(df, label_col, n_exec, model_name):
	"""Evaluate ``model_name`` ``n_exec`` times after remove_corr_feat.

	remove_corr_feat is applied once to the whole dataset, before any split.
	Each run then calls dp.split_data with ``test_size=0.25`` and
	dp.train_model_fetch_results. The per-run results are regrouped by
	result_statistics and returned.
	"""
	reduced_df = remove_corr_feat(df, label_col)
	runs = []
	for _ in range(n_exec):
		split = dp.split_data(reduced_df, label_col, test_size=0.25)
		train_x, test_x, train_y, test_y = split
		runs.append(dp.train_model_fetch_results(train_x, train_y, test_x, test_y, model_name, USE_AUC_TH))
	return result_statistics(runs)


# -----------------------------------
def test_corr_transf(df, label_col, n_exec, scaler_ref, model_name, num_col=None):
	"""Evaluate ``model_name`` ``n_exec`` times after remove_corr_feat and scaling.

	remove_corr_feat is applied once to the whole dataset. Inside each run
	the data is split with dp.split_data (``test_size=0.25``), both sets are
	passed through transform_num_data with ``scaler_ref`` and ``num_col``,
	and dp.train_model_fetch_results is called. The per-run results are
	regrouped by result_statistics and returned.
	"""
	reduced_df = remove_corr_feat(df, label_col)
	runs = []
	for _ in range(n_exec):
		split = dp.split_data(reduced_df, label_col, test_size=0.25)
		train_x, test_x, train_y, test_y = split
		# The scaler is fitted inside the run, on this run's training split.
		train_x, test_x = transform_num_data(train_x, test_x, scaler_ref, num_col)
		runs.append(dp.train_model_fetch_results(train_x, train_y, test_x, test_y, model_name, USE_AUC_TH))
	return result_statistics(runs)

# -----------------------------------
def test_smote_transf(df, label_col, n_exec, model_name, rcorr=True, scaler_ref=None, num_col=None, feat_sel_type=None, art_str=None, under=False):
	"""Evaluate ``model_name`` ``n_exec`` times with optional preprocessing steps.

	When ``rcorr`` is true, remove_corr_feat is applied once to the whole
	dataset. Each run then splits the data with dp.split_data
	(``test_size=0.25``) and applies, in this order, every step that is
	switched on:

	1. artificial_smote on the training split (``art_str`` is not None;
	   ``under`` is forwarded as its ``under_sample`` argument);
	2. feature_selection (``feat_sel_type`` is not None);
	3. transform_num_data (``scaler_ref`` is not None).

	The per-run results of dp.train_model_fetch_results are regrouped by
	result_statistics and returned.
	"""
	source_df = remove_corr_feat(df, label_col) if rcorr else df
	runs = []
	for _ in range(n_exec):
		split = dp.split_data(source_df, label_col, test_size=0.25)
		train_x, test_x, train_y, test_y = split
		if art_str is not None:
			train_x, train_y = artificial_smote(train_x, train_y, art_str, under)
		if feat_sel_type is not None:
			train_x, test_x = feature_selection(train_x, train_y, test_x, feat_sel_type)
		if scaler_ref is not None:
			train_x, test_x = transform_num_data(train_x, test_x, scaler_ref, num_col)
		runs.append(dp.train_model_fetch_results(train_x, train_y, test_x, test_y, model_name, USE_AUC_TH))
	return result_statistics(runs)



# -----------------------------------
def test_ctgan_first(df, label_col, n_exec, model_name, rcorr=True, scaler_ref=None, num_col=None, feat_sel_type=None, art_str=None, savefile=None):
	"""Evaluate ``model_name`` ``n_exec`` times with optional CTGAN augmentation.

	When ``rcorr`` is true, remove_corr_feat is applied once to the whole
	dataset. Each run then splits the data with dp.split_data
	(``test_size=0.25``) and applies, in this order, every step that is
	switched on:

	1. artificial_ctgan on the training split (``art_str`` is not None;
	   ``savefile`` is forwarded to it);
	2. feature_selection (``feat_sel_type`` is not None);
	3. transform_num_data (``scaler_ref`` is not None).

	The per-run results of dp.train_model_fetch_results are regrouped by
	result_statistics and returned.
	"""
	source_df = remove_corr_feat(df, label_col) if rcorr else df
	runs = []
	for _ in range(n_exec):
		split = dp.split_data(source_df, label_col, test_size=0.25)
		train_x, test_x, train_y, test_y = split
		if art_str is not None:
			train_x, train_y = artificial_ctgan(train_x, train_y, art_str, savefile)
		if feat_sel_type is not None:
			train_x, test_x = feature_selection(train_x, train_y, test_x, feat_sel_type)
		if scaler_ref is not None:
			train_x, test_x = transform_num_data(train_x, test_x, scaler_ref, num_col)
		runs.append(dp.train_model_fetch_results(train_x, train_y, test_x, test_y, model_name, USE_AUC_TH))
	return result_statistics(runs)

# -----------------------------------
# -----------------------------------
# -----------------------------------
def plot_results(res_df, y_lim=(0.7, 1.0)):
	"""Draw a grouped bar plot of the collected results.

	``res_df`` is the long-format frame built by add_results_df: the x axis
	is ``COL_METRIC``, the bar height ``COL_VALUE`` and the hue ``COL_TEST``.
	``y_lim`` is a two-item sequence ``(lower, upper)`` used as the y-axis
	limits. The figure is shown with ``plt.show()``; nothing is returned.
	"""
	# Reset pyplot's implicit figure/axes state before drawing. The order of
	# these calls is kept as originally written.
	plt.figure().clear()
	plt.close()
	plt.cla()
	plt.clf()

	fig = plt.gcf()
	fig.set_size_inches(18, 10)

	sns.set_theme(style="whitegrid")
	plt.ylim(y_lim[0], y_lim[1])
	ax = sns.barplot(x=COL_METRIC, y=COL_VALUE, hue=COL_TEST, data=res_df)
	# Anchor the legend outside the axes, to the right of the plot area.
	plt.legend(bbox_to_anchor=(1.02, 1), loc='upper left', borderaxespad=0, fontsize=20)
	ax.set_xlabel(COL_METRIC, fontsize=30)
	ax.set_ylabel(COL_VALUE, fontsize=30)
	ax.tick_params(labelsize=15)
	plt.show()

## 1 - Understanding the Data

In [None]:
# Load the wilt dataset through the uci_dataset package.
df = database.load_wilt()
label_col = "class"
# Encode the label column as integers: "w" -> 1 and "n" -> 0.
df[label_col] = df[label_col].replace({"w": 1, "n": 0})
# Bare expression so the notebook renders the DataFrame.
df

In [None]:
# Print a summary of the dataset's columns.
df.info()

In [None]:
# Columns passed to the scalers as `num_col`; transform_num_data excludes every other column from scaling.
num_col = ["GLCM_pan", "Mean_Green", "Mean_Red", "Mean_NIR", "SD_pan"]

## 2 - Efficiency of Different Classes from raimitigations.dataprocessing lib

### KNN Model

#### Data Transformations

Here we compare 8 different training pipelines with a KNN model. For each pipeline, 50 runs are performed, and the mean and stdev of the metrics are computed and plotted in the graph below. 
- baseline dataset
- baseline dataset, removal of correlated features
- baseline dataset, removal of correlated features, standard scaler
- baseline dataset, removal of correlated features, minmax scaler
- baseline dataset, removal of correlated features, quantile transformer
- baseline dataset, removal of correlated features, data normalizer
- baseline dataset, removal of correlated features, robust scaler
- baseline dataset, removal of correlated features, power transformer

Note: Some of the experiments below take 10+ min to run. You may change `N_EXEC` to a smaller number to decrease the time it takes to run the notebook.

In [None]:
MODEL_NAME = "knn"
N_EXEC = 50

# Reference pipelines without any scaler.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_cor = test_corr(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(result_df, result_cor, "Corr.")

# One pipeline per scaler class, each run through test_corr_transf.
scaler_cases = [
	(dp.DataStandardScaler, "Std."),
	(dp.DataMinMaxScaler, "Min/Max."),
	(dp.DataQuantileTransformer, "Quantile"),
	(dp.DataNormalizer, "Normalizer"),
	(dp.DataRobustScaler, "Robust"),
	(dp.DataPowerTransformer, "Power"),
]
for scaler_cls, case_name in scaler_cases:
	result_tr = test_corr_transf(df, label_col, N_EXEC, scaler_cls, MODEL_NAME, num_col)
	result_df = add_results_df(result_df, result_tr, case_name)

In [None]:
# Bar plot of the KNN scaler comparison collected in result_df.
plot_results(result_df)

#### Feature Selection

Now we will perform feature selection, where some features are removed from the dataset to improve the model performance. We will compare two different feature selection pipelines with two baseline models.
- Baseline KNN model
- KNN model with robust scaler
- KNN model, robust scaler, sequential feature selection 
- KNN model, robust scaler, CatBoost feature selection

In [None]:
MODEL_NAME = "knn"

# The two reference pipelines.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_tr = test_corr_transf(df, label_col, N_EXEC, dp.DataRobustScaler, MODEL_NAME, num_col)
result_df = add_results_df(result_df, result_tr, "Robust")

# Feature selection followed by the robust scaler, without correlated-feature removal.
selector_cases = [
	(SEQ_BWD, "Seq.Bwd.Robust"),
	(CATBOOST, "CatBoost Robust"),
]
for selector_id, case_name in selector_cases:
	result_fs = test_smote_transf(df, label_col, N_EXEC, MODEL_NAME, rcorr=False, scaler_ref=dp.DataRobustScaler, num_col=num_col, feat_sel_type=selector_id)
	result_df = add_results_df(result_df, result_fs, case_name)

In [None]:
# Bar plot of the KNN feature-selection comparison collected in result_df.
plot_results(result_df)

#### Artificial Instances - SMOTE

Since we have imbalanced classes (most trees are not diseased), we will experiment with creating artificial data.
- baseline KNN model
- baseline KNN model with robust scaler and removal of correlated features
- KNN model, remove correlated features, SMOTE artificial data
- KNN model, remove correlated features, robust scaler, SMOTE artificial data
- KNN model, remove correlated features, SMOTE artificial data, TomekLink under sampling
- KNN model, remove correlated features, robust scaler, SMOTE artificial data, TomekLink under sampling

In [None]:
MODEL_NAME = "knn"

# The two reference pipelines.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_tr = test_corr_transf(df, label_col, N_EXEC, dp.DataRobustScaler, MODEL_NAME, num_col)
result_df = add_results_df(result_df, result_tr, "Robust")

# (label, scaler class, columns to scale, under-sampling flag); every case uses art_str=0.2.
smote_cases = [
	("SM", None, None, False),
	("SM Robust", dp.DataRobustScaler, num_col, False),
	("SM+TK", None, None, True),
	("SM+TK Robust", dp.DataRobustScaler, num_col, True),
]
for case_name, scaler_cls, scaled_cols, use_under in smote_cases:
	result_fs = test_smote_transf(df, label_col, N_EXEC, MODEL_NAME, rcorr=True, scaler_ref=scaler_cls, num_col=scaled_cols, feat_sel_type=None, art_str=0.2, under=use_under)
	result_df = add_results_df(result_df, result_fs, case_name)

In [None]:
# Bar plot of the KNN SMOTE comparison collected in result_df.
plot_results(result_df)

Now we will compare different quantities of artificial instances generated (the float refers to the ratio of minority class to majority class). All of these models contain correlated feature removal and the robust scaler.

In [None]:
MODEL_NAME = "knn"

# The two reference pipelines.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_tr = test_corr_transf(df, label_col, N_EXEC, dp.DataRobustScaler, MODEL_NAME, num_col)
result_df = add_results_df(result_df, result_tr, "Robust")

# Same pipeline for each over-sampling ratio passed as art_str.
ratio_cases = [
	(0.1, "SM Robust 0.1"),
	(0.4, "SM Robust 0.4"),
	(0.6, "SM Robust 0.6"),
]
for ratio, case_name in ratio_cases:
	result_fs = test_smote_transf(df, label_col, N_EXEC, MODEL_NAME, rcorr=True, scaler_ref=dp.DataRobustScaler, num_col=num_col, feat_sel_type=None, art_str=ratio, under=False)
	result_df = add_results_df(result_df, result_fs, case_name)



In [None]:
# Bar plot of the KNN SMOTE-ratio comparison collected in result_df.
plot_results(result_df)

#### Artificial Instances - CTGAN

Instead of oversampling with SMOTE and its variations, we can create artificial instances of the minority class using CTGAN. Here we perform the following experiments:
- KNN baseline model
- baseline KNN model with robust scaler and removal of correlated features
- KNN model, robust scaler, removal of correlated features, over sampling with CTGAN ratio of 0.15
- KNN model, robust scaler, removal of correlated features, over sampling with CTGAN ratio of 0.3


In [None]:
MODEL_NAME = "knn"

# The two reference pipelines.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_tr = test_corr_transf(df, label_col, N_EXEC, dp.DataRobustScaler, MODEL_NAME, num_col)
result_df = add_results_df(result_df, result_tr, "Robust")

# (art_str ratio, synthesizer save file, label); each case has its own save file.
ctgan_cases = [
	(0.15, "2_1.pkl", "CTGAN 0.15 Robust"),
	(0.3, "2_2.pkl", "CTGAN 0.3 Robust"),
]
for ratio, model_file, case_name in ctgan_cases:
	result_fs = test_ctgan_first(df, label_col, N_EXEC, MODEL_NAME, rcorr=True, scaler_ref=dp.DataRobustScaler, num_col=num_col, feat_sel_type=None, art_str=ratio, savefile=model_file)
	result_df = add_results_df(result_df, result_fs, case_name)



In [None]:
# Bar plot of the KNN CTGAN comparison collected in result_df.
plot_results(result_df)

### XGBoost

After experimenting extensively with KNN models, we now repeat the same comparisons using XGBoost models. The following cells demonstrate this.

In [None]:
MODEL_NAME = "xgb"

# Reference pipelines without any scaler.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

result_cor = test_corr(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(result_df, result_cor, "Corr.")

# One pipeline per scaler class, each run through test_corr_transf.
scaler_cases = [
	(dp.DataStandardScaler, "Std."),
	(dp.DataMinMaxScaler, "Min/Max."),
	(dp.DataQuantileTransformer, "Quantile"),
	(dp.DataNormalizer, "Normalizer"),
	(dp.DataRobustScaler, "Robust"),
	(dp.DataPowerTransformer, "Power"),
]
for scaler_cls, case_name in scaler_cases:
	result_tr = test_corr_transf(df, label_col, N_EXEC, scaler_cls, MODEL_NAME, num_col)
	result_df = add_results_df(result_df, result_tr, case_name)

In [None]:
# Bar plot of the XGBoost scaler comparison collected in result_df.
plot_results(result_df)

In [None]:
# MODEL_NAME is not set in this cell; it keeps the value assigned by an earlier cell.
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

# NOTE(review): the labels end in "Qtl." but scaler_ref is None, so no scaler
# is applied in these runs — confirm which of the two was intended.
selector_cases = [
	(SEQ_BWD, "Seq.Bwd.Qtl."),
	(CATBOOST, "CatBoost Qtl."),
]
for selector_id, case_name in selector_cases:
	result_fs = test_smote_transf(df, label_col, N_EXEC, MODEL_NAME, rcorr=True, scaler_ref=None, feat_sel_type=selector_id)
	result_df = add_results_df(result_df, result_fs, case_name)

In [None]:
# Bar plot of the XGBoost feature-selection comparison; the y axis starts at 0.6 instead of the default 0.7.
plot_results(result_df, y_lim=[0.6,1.0])

In [None]:
MODEL_NAME = "xgb"
result_base = test_base(df, label_col, N_EXEC, MODEL_NAME)
result_df = add_results_df(None, result_base, "Baseline")

# (art_str ratio, synthesizer save file, label); no scaler and no feature selection.
ctgan_cases = [
	(0.1, "2_3.pkl", "CTGAN 0.1"),
	(0.15, "2_4.pkl", "CTGAN 0.15"),
]
for ratio, model_file, case_name in ctgan_cases:
	result_fs = test_ctgan_first(df, label_col, N_EXEC, MODEL_NAME, rcorr=True, scaler_ref=None, feat_sel_type=None, art_str=ratio, savefile=model_file)
	result_df = add_results_df(result_df, result_fs, case_name)



In [None]:
# Bar plot of the XGBoost CTGAN comparison; the y axis starts at 0.6 instead of the default 0.7.
plot_results(result_df, y_lim=[0.6,1.0])