In [None]:
## Load the libraries and scikit-learn packages used for the evaluation. This example performs feature generation by
## combining the outputs of 1) MinMaxScaler, 2) StandardScaler, 3) RobustScaler, 4) QuantileTransformer,
## 5) KBinsDiscretizer, 6) 7 components of a PCA mapping, and 7) 7 components of a TruncatedSVD mapping.
## This extends the original 22 features of the data to 124 new features (5 x 22 + 7 + 7), which can be used with
## three different types of classifiers, such as logistic regression, SVM, and neural network.
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.neural_network import MLPClassifier 
from sklearn.linear_model import LogisticRegression ## only use this if you want to do an extra feature selection from the new map of features
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.manifold import TSNE ## you can try TSNE if you consider
from sklearn.pipeline import FeatureUnion
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.feature_selection import RFE
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import RocCurveDisplay
from matplotlib import pyplot
import matplotlib.pyplot as plt
import numpy as np
import sys
import pandas as pd

In [None]:
## Read the csv file produced by the EDA step; its path is the first command-line argument.
## NOTE(review): inside a live Jupyter kernel sys.argv holds the kernel launcher's own arguments, so this
## presumably expects the notebook to be run as an exported script -- confirm.
dataframeObject = pd.read_csv(str(sys.argv[1]))

## keep the column names in a list; the first column is skipped when building `data`
features = list(dataframeObject.columns.values)

## Collect every remaining csv column as one row of `data`, with missing values replaced by 0.
## `replace` returns a new object, so its result must be kept (the replacement was previously discarded).
data = []
for index_feature in features[1:]:
    column = dataframeObject[index_feature].replace(np.nan, 0)
    data.append(column.to_numpy())

## Stacking 1-D columns gives shape (number of columns - 1, number of csv rows) without needing a squeeze,
## so a csv with a single row or a single kept column keeps its two dimensions.
data = np.array(data)
shape = np.shape(data)

## Convert every row that contains strings into integer codes (dummy features similar to the EDA).
## Each distinct value gets the index of its first appearance in the row: 0, 1, 2, ...
## All values of the row are inspected, not only the first one, because a missing first value is now a 0.
if data.dtype.kind in ('O', 'U'):
    for count in range(shape[0]):
        row = data[count]
        if any(isinstance(value, str) for value in row):
            codes = {}
            # setdefault evaluates len(codes) before inserting, so a new value receives the next free index
            data[count, :] = [codes.setdefault(value, len(codes)) for value in row]

shape = np.shape(data)

In [None]:
## data definition: rows 0..21 of `data` are the features, row 22 holds the labels
N_FEATURES = 22
DATA = np.transpose(data[0:N_FEATURES, :])   # samples in rows, features in columns
labels = data[N_FEATURES, :]

shape = np.shape(DATA)

## Definition of the cross-validation using the KFold object from sklearn.
## The second command-line argument is the number of folds; it is parsed once and reused.
n_splits = int(sys.argv[2])
## KFold defaults are kept (shuffle=False, random_state=None): folds are contiguous slices of DATA.
kf = KFold(n_splits=n_splits)
## one accuracy value per fold
acc = np.zeros([n_splits])

## The third command-line argument enables plotting when it equals 1: switch matplotlib to interactive mode
if int(sys.argv[3]) == 1:
    plt.ion()
    plt.show()

In [None]:
## Cross-validation execution; the folds come from the KFold object `kf`.
## A fresh pipeline is built and fitted for every fold, its accuracy is stored in acc[i], and the
## mean and standard deviation over all folds are printed at the end.

## Seed shared by every stochastic step. The MLP already used 1; the feature generators were unseeded.
## TruncatedSVD uses a randomized solver by default, PCA may select one, and QuantileTransformer may
## subsample, so without a seed the fold accuracies could differ between runs.
RANDOM_STATE = 1

## parse the plotting switch and the integer labels once instead of once per fold
show_roc = int(sys.argv[3]) == 1
labels_int = labels.astype('int')

for i, (train_index, test_index) in enumerate(kf.split(DATA)):
    print(f":Fold {i}:")

    # feature generators whose outputs are concatenated column-wise by the FeatureUnion
    transforms = [
        ('mms', MinMaxScaler()),
        ('ss', StandardScaler()),
        ('rs', RobustScaler()),
        ('qt', QuantileTransformer(n_quantiles=100, output_distribution='normal', random_state=RANDOM_STATE)),
        ('kbd', KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')),
        ('pca', PCA(n_components=7, random_state=RANDOM_STATE)),
        ('svd', TruncatedSVD(n_components=7, random_state=RANDOM_STATE)),
    ]
    # create the feature union
    fu = FeatureUnion(transforms)

    # define the model
    model = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(100, 10),
                          learning_rate='adaptive', random_state=RANDOM_STATE)

    # Optional feature selection, only if you consider using it: insert
    # ('rfe', RFE(estimator=LogisticRegression(solver='liblinear'), n_features_to_select=15))
    # between the 'fu' and 'sc' steps.
    steps = [
        ('fu', fu),
        ('sc', MinMaxScaler()),   # normalize before feeding the model
        ('m', model),
    ]
    pipeline = Pipeline(steps=steps)

    # train the model on the training part of the fold
    DATA_train = DATA[train_index, :]
    DATA_test = DATA[test_index, :]
    pipeline.fit(DATA_train, labels_int[train_index])
    predictions = pipeline.predict(DATA_test)

    # calculate classification accuracy on the held-out part of the fold
    acc[i] = accuracy_score(labels_int[test_index], predictions)
    print(acc[i])

    if show_roc:
        # draw the ROC curve of this fold and wait for the user before moving on
        RocCurveDisplay.from_estimator(pipeline, DATA_test, labels_int[test_index])
        plt.draw()
        plt.pause(0.001)
        input("Press [enter] to continue.")

acc_mean = np.mean(acc)
acc_std = np.std(acc)
print(f":accuracy:{acc_mean} +/- {acc_std}")