## K-Nearest Neighbours Approach



### Setup

In [1]:
import pandas as pd
import numpy as np


from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

from pathlib import Path
import os

In [2]:
# Resolve every project directory relative to the current working directory,
# so the notebook contains no absolute, machine-specific path.
base_dir = Path.cwd() / "implementation"

# Input datasets are read from here ...
data_dir = base_dir / "data/source/"
# ... and scores / predictions are written here.
result_dir = base_dir / "data/results/"

### Load & prepare datasets (permutation-based FE and MDI-based FE)

In [3]:
# Read the three ACHE tables: the full feature set plus the two reduced
# feature sets (permutation-based and MDI-based).
knn_data_raw, knn_data_raw_per, knn_data_raw_mdi = (
    pd.read_csv(data_dir / relative_path)
    for relative_path in (
        "ACHE/ache.csv",
        "ACHE/ache_per.csv",
        "ACHE/ache_mdi.csv",
    )
)

In [4]:
# Numeric encoding of the class labels stored in the last column of each table.
lookup = {"inactive": 0, "active": 1}


def _as_dataset(frame, first_feature_col):
    """Convert a raw table into a dict of arrays and names.

    Parameters
    ----------
    frame : pandas.DataFrame
        Raw table whose last column holds the labels "inactive" / "active".
    first_feature_col : int
        Position of the first feature column; every column from there up to
        (but excluding) the last one is treated as a feature.

    Returns
    -------
    dict
        Keys "data" (feature array), "target" (0/1 label array),
        "feature_names" (column labels of the features) and "target_names".

    Raises
    ------
    KeyError
        If a label other than "inactive" or "active" is encountered.
    """
    feature_cols = slice(first_feature_col, -1)
    encoded_labels = [lookup[label] for label in frame.iloc[:, -1]]
    return {
        "data": np.array(frame.iloc[:, feature_cols]),
        "target": np.array(encoded_labels),
        "feature_names": frame.columns[feature_cols],
        "target_names": ["inactive", "active"],
    }


# The reduced tables skip one leading column, the full table skips two.
knn_data_per = _as_dataset(knn_data_raw_per, 1)
knn_data_mdi = _as_dataset(knn_data_raw_mdi, 1)
knn_data_base = _as_dataset(knn_data_raw, 2)

Split each dataset into a training set and a test set

In [5]:
# Same hold-out fraction (30 %) and the same seed for all three feature sets.
split_options = {"test_size": 0.3, "random_state": 4232}

X_train_base, X_test_base, y_train_base, y_test_base = train_test_split(
    knn_data_base["data"], knn_data_base["target"], **split_options
)

X_train_mdi, X_test_mdi, y_train_mdi, y_test_mdi = train_test_split(
    knn_data_mdi["data"], knn_data_mdi["target"], **split_options
)

X_train_per, X_test_per, y_train_per, y_test_per = train_test_split(
    knn_data_per["data"], knn_data_per["target"], **split_options
)

### Apply K-Nearest Neighbours

In [6]:
# Candidate neighbourhood sizes: the odd values 1, 3, ..., 19.
parameters = {"kneighborsclassifier__n_neighbors": [*range(1, 20, 2)]}

# Normalize the data inside the pipeline, then classify with KNN.
pipe1 = make_pipeline(StandardScaler(), KNeighborsClassifier())

# One identically configured grid search (20-fold CV, all cores) per feature set.
knn_models_base, knn_models_mdi, knn_models_per = (
    GridSearchCV(pipe1, parameters, cv=20, n_jobs=-1) for _ in range(3)
)

Fit one grid-searched model per feature set using the parameter grid defined above

In [7]:
# Run the grid search for each feature set on its own training split.
# The three searches are independent of one another.
knn_models_mdi.fit(X_train_mdi, y_train_mdi)
knn_models_per.fit(X_train_per, y_train_per)
# Kept last so that this fitted search is the cell's displayed value.
knn_models_base.fit(X_train_base, y_train_base)

In [8]:
# Score each tuned pipeline on the held-out test split of its own feature set.
test_score_base = knn_models_base.best_estimator_.score(X_test_base, y_test_base)
test_score_per = knn_models_per.best_estimator_.score(X_test_per, y_test_per)
test_score_mdi = knn_models_mdi.best_estimator_.score(X_test_mdi, y_test_mdi)

print(
    f"""
    All Features \t {test_score_base}
    Feature Permuation \t {test_score_per}
    MDI \t {test_score_mdi}
"""
)


    All Features 	 0.6843853820598007
    Feature Permuation 	 0.7574750830564784
    MDI 	 0.7574750830564784



In [11]:
# Record the best cross-validation score of the permutation-feature KNN in the
# shared validation table.
val_path = result_dir / "ACHE/val/ache_val.csv"
model_name = "fe_rf_per_knn"

df = pd.read_csv(val_path, sep=",", index_col="Index")

# Re-running this cell must not pile up duplicate rows: overwrite the entry for
# this model if it is already present, otherwise append a new row at the end.
existing_rows = df.index[df["Name"] == model_name]
row_label = existing_rows[0] if len(existing_rows) else len(df)
df.loc[row_label] = [model_name, knn_models_per.best_score_]

# `index` is a boolean switch; the header of the index column is set through
# `index_label` so the file can be read back with index_col="Index".
df.to_csv(val_path, sep=",", index=True, index_label="Index")

In [9]:
# Export the permutation-feature test set together with the true labels and
# the predictions of the tuned KNN pipeline.
prediction_per = knn_models_per.best_estimator_.predict(X_test_per)

# Build the whole table in one step instead of growing it row by row with
# `.loc`, which copies the frame on every insertion (quadratic in the number of
# test samples). INDEX is a running row number; it is stacked into the same
# array as the features and therefore shares their promoted dtype, as it did
# when each row was assembled as a single list.
feature_columns = list(knn_data_raw_per.columns[1:-1])
result_df = pd.DataFrame(
    np.column_stack([np.arange(len(X_test_per)), X_test_per]),
    columns=["INDEX", *feature_columns],
)

result_df["LABEL"] = y_test_per
result_df["PRED"] = prediction_per

result_df.to_csv(result_dir / "ACHE/fe_rf_per_knn.csv", encoding="utf-8")

In [12]:
# Record the best cross-validation score of the MDI-feature KNN in the shared
# validation table.
val_path = result_dir / "ACHE/val/ache_val.csv"
model_name = "fe_rf_mdi_knn"

df = pd.read_csv(val_path, sep=",", index_col="Index")

# Re-running this cell must not pile up duplicate rows: overwrite the entry for
# this model if it is already present, otherwise append a new row at the end.
existing_rows = df.index[df["Name"] == model_name]
row_label = existing_rows[0] if len(existing_rows) else len(df)
df.loc[row_label] = [model_name, knn_models_mdi.best_score_]

# `index` is a boolean switch; the header of the index column is set through
# `index_label` so the file can be read back with index_col="Index".
df.to_csv(val_path, sep=",", index=True, index_label="Index")

In [10]:
# Export the MDI-feature test set together with the true labels and the
# predictions of the tuned KNN pipeline.
# The predictions must come from the model that was fitted on the MDI features
# (knn_models_mdi), not from the permutation-feature model.
prediction_mdi = knn_models_mdi.best_estimator_.predict(X_test_mdi)

# Column names are taken from the MDI table so that they describe the MDI
# feature matrix. The table is built in one step instead of being grown row by
# row with `.loc`, which copies the frame on every insertion. INDEX is a
# running row number; it is stacked into the same array as the features and
# therefore shares their promoted dtype, as it did when each row was assembled
# as a single list.
feature_columns = list(knn_data_raw_mdi.columns[1:-1])
result_df = pd.DataFrame(
    np.column_stack([np.arange(len(X_test_mdi)), X_test_mdi]),
    columns=["INDEX", *feature_columns],
)

result_df["LABEL"] = y_test_mdi
result_df["PRED"] = prediction_mdi

result_df.to_csv(result_dir / "ACHE/fe_rf_mdi_knn.csv", encoding="utf-8")