In [None]:
import pandas as pd
import numpy as np

from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import Perceptron
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn import tree


from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Number of label columns evaluated; the evaluation loop reads column
# NUM_SYMPTOMS + k of the data array for k in range(NUM_SYNDROMES).
NUM_SYNDROMES = 13
# Number of leading columns of the data array that are used as model features.
NUM_SYMPTOMS = 81
# Number of repeated measurement passes executed by the averaging cell.
NUM_ITERATIONS = 9

In [None]:
# Read the CSV while skipping the "ID" column, then record the table's dimensions.
tcm_data = pd.read_csv("FILE_NAME", usecols=lambda column: column != "ID")
tcm_x = tcm_data.shape[0]  # row count
tcm_y = tcm_data.shape[1]  # column count
tcm_data

In [None]:
# Move the columns listed in shift_cols to the right-hand end of the table
# (in the listed order) and convert the result to a NumPy array, which the
# later cells slice by column position.
shift_cols = ["HTN", "DM", "CHD", "CVD"]

# Build the new column order by selection instead of pop/insert on a frame
# created with pd.DataFrame(tcm_data): that constructor does not copy, so
# in-place edits could leak back into tcm_data on some pandas versions.
leading_cols = [col for col in tcm_data.columns if col not in shift_cols]
df = tcm_data[leading_cols + shift_cols].to_numpy()

In [None]:
class MLModel():
    """Pair an estimator object with running totals of its evaluation metrics.

    The evaluation code adds one measurement to each total per fit, so the
    totals must be divided by the number of measurements to obtain averages.
    """

    def __init__(self, instance):
        """Instantiate the estimator and reset all metric totals.

        Args:
            instance: an estimator *class* (not an object). Its ``__name__``
                selects the construction recipe:
                * ``LogisticRegression`` -> ``instance(random_state=0, C=0.1)``
                * ``SVC`` (``SVM`` is accepted as an alias) -> a 5-fold
                  ``GridSearchCV`` over a polynomial kernel
                * anything else -> ``instance()`` with default arguments
        """
        self.name = str(instance.__name__)
        if self.name == "LogisticRegression":
            self.instance = instance(random_state=0, C=0.1)
        elif self.name in ("SVC", "SVM"):
            # scikit-learn's support-vector classifier class is named "SVC";
            # "SVM" alone would never match it.
            # Grid values must be sequences, and the search needs an estimator
            # object rather than the class itself.
            param_grid = {
                "kernel": ["poly"]
            }
            self.instance = GridSearchCV(estimator=instance(), param_grid=param_grid, cv=5)
        else:
            self.instance = instance()
        # Running totals (not averages); see the class docstring.
        self.accuracy = 0.0   # summed held-out accuracy
        self.mean = 0.0       # summed mean of the cross-validation scores
        self.std = 0.0        # summed standard deviation of the cross-validation scores
        self.sqrt_mean = 0.0  # summed mean squared error (despite the attribute name)

In [None]:
def update_balanced_data(input_data, syndrome_col):
    """Return a class-balanced subset of ``input_data`` for one binary label column.

    Rows whose label is 0 and rows whose label is 1 are separated; the larger
    group is shuffled (global NumPy RNG) and truncated to the size of the
    smaller one. Rows with any other label value are dropped.

    Args:
        input_data: 2-D NumPy array, one sample per row.
        syndrome_col: index of the column holding the 0/1 label.

    Returns:
        2-D array with the label-0 rows first, followed by the label-1 rows.
        ``input_data`` itself is left untouched because boolean-mask indexing
        produces copies.
    """
    labels = input_data[:, syndrome_col]
    negatives = input_data[labels == 0]
    positives = input_data[labels == 1]

    n_neg, _ = negatives.shape
    n_pos, _ = positives.shape

    if n_neg > n_pos:
        # More zeros than ones: keep a random sample of the zeros.
        np.random.shuffle(negatives)
        parts = (negatives[:n_pos, :], positives)
    else:
        # More (or equally many) ones: keep a random sample of the ones.
        np.random.shuffle(positives)
        parts = (negatives, positives[:n_neg, :])

    return np.concatenate(parts, axis=0)

In [None]:
# Estimator classes to evaluate. Further candidates that can be appended:
# DecisionTreeClassifier, LogisticRegression, RandomForestClassifier,
# KNeighborsClassifier, Perceptron.
MODEL_CLASSES = [SVC]
ml_models = [MLModel(instance=model_cls) for model_cls in MODEL_CLASSES]
# One independent MLModel per (syndrome, model) pair. Reusing the objects from
# ml_models in every row would make all syndromes share a single estimator and
# a single set of metric totals.
extended_models = [
    [MLModel(instance=model_cls) for model_cls in MODEL_CLASSES]
    for _ in range(NUM_SYNDROMES)
]
# Held-out accuracy (in percent) of every individual fit, in execution order.
avg_acc = []
def run_each_syndrome_measurement_once():
    """Run one measurement pass over every syndrome and every configured model.

    For each syndrome column the data array ``df`` is class-balanced, split
    70/30 into train and held-out sets, and each model in
    ``extended_models[syndrome]`` is fitted and scored.

    Side effects:
        * adds the held-out accuracy, the mean squared error, and the mean and
          standard deviation of a 5-fold cross-validation to the running
          totals of the corresponding ``MLModel``;
        * appends the held-out accuracy (in percent) to ``avg_acc``;
        * prints one line per syndrome and one line per fitted model.

    The held-out labels are never modified, so every metric of a pass is
    computed against the same, true labels.
    """
    # The splitter has a fixed seed and holds no per-call state, so one
    # instance serves every syndrome.
    kfold = KFold(n_splits=5, shuffle=True, random_state=0)

    for sdr_f in range(NUM_SYNDROMES):
        label_col = NUM_SYMPTOMS + sdr_f
        data = update_balanced_data(df, label_col)
        print(f'R{sdr_f+1} - Balanced data shape: {data.shape}')

        features = data[:, :NUM_SYMPTOMS]
        labels = data[:, label_col]
        X_train, X_test, Y_train, Y_test = train_test_split(features, labels, test_size=0.3)

        for model in range(len(ml_models)):
            entry = extended_models[sdr_f][model]
            clf = entry.instance
            clf.fit(X_train, Y_train)

            # Held-out accuracy and predictions for the error metric.
            model_score = clf.score(X_test, Y_test)
            Y_pred = clf.predict(X_test)
            mean_sqrt_error = mean_squared_error(Y_test, Y_pred)

            # cross_val_score fits clones, so the estimator fitted above is
            # left as it is.
            val_score = cross_val_score(clf, X_train, Y_train, cv=kfold)

            entry.accuracy += model_score
            entry.sqrt_mean += mean_sqrt_error
            entry.std += val_score.std()
            entry.mean += val_score.mean()

            avg_acc.append(model_score * 100)

            print(f"Name: {entry.name}   Accuracy: {model_score * 100:.3f}%   Squared Mean: {mean_sqrt_error:.3f}   Standard Deviation: {val_score.std():.3f}   Mean: {val_score.mean():.3f}\n")

In [None]:
run_each_syndrome_measurement_once()

In [None]:
# Run the additional measurement passes.
for m in range(NUM_ITERATIONS):
    print(f"Starting iteration: {m+1}")
    run_each_syndrome_measurement_once()

# avg_acc receives exactly one entry per (pass, syndrome, model), so it gives
# the real number of passes accumulated so far, including any pass executed
# before this cell.
fits_per_pass = NUM_SYNDROMES * len(ml_models)
total_runs = len(avg_acc) // fits_per_pass if fits_per_pass else 0

for i in range(NUM_SYNDROMES):
    for j in range(len(ml_models)):
        entry = extended_models[i][j]
        # An accumulator that is referenced from several syndrome rows has
        # received one measurement per referencing row in every pass.
        sharing_rows = sum(1 for row in extended_models if row[j] is entry)
        n_measurements = max(1, total_runs * sharing_rows)

        # Averages are computed into locals (true division, no in-place
        # update) so the totals stay intact and this cell can be re-run.
        mean_accuracy = entry.accuracy / n_measurements
        mean_mse = entry.sqrt_mean / n_measurements
        mean_std = entry.std / n_measurements

        print(f"R{i+1}\n")
        print(f"Name: {entry.name}   Accuracy: {mean_accuracy * 100:.3f}%  Mean Squared Error: {mean_mse:.3f}    Standard Dev: {mean_std:.3f}")