In [1]:
import openpyxl
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.metrics import accuracy_score
from numpy.linalg import norm
from sklearn.model_selection import LeaveOneOut
from decimal import *
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
import math
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score

number_runnings = 50

fetal = pd.read_csv("fetal.csv")
fetal["Outcome"].value_counts()

Y = fetal['Outcome']
target_names=["0", "1"]
X = fetal[['accelerations',	'fetal_movement',	'uterine_contractions',	'light_decelerations',	'severe_decelerations',	'prolongued_decelerations',
              'abnormal_short_term_variability',	'mean_value_of_short_term_variability',	'percentage_of_time_with_abnormal_long_term_variability',
              'mean_value_of_long_term_variability',	'histogram_width',	'histogram_min',	'histogram_max',	'histogram_number_of_peaks',
              'histogram_number_of_zeroes',	'histogram_mode',	'histogram_mean',	'histogram_median',	'histogram_variance',	'histogram_tendency'	]]
maximums = np.amax(np.array(X), axis=0)
X_featurenames = X.columns

noOfInstances = fetal.shape[0]
print(noOfInstances)
df = pd.DataFrame(fetal)
corr = df.corrwith(df["Outcome"])
print(corr)
counter = Counter(Y)
print(counter[0])
print(counter[1])

feature_names = ['accelerations',	'fetal_movement',	'uterine_contractions',	'light_decelerations',	'severe_decelerations',	'prolongued_decelerations',
              'abnormal_short_term_variability',	'mean_value_of_short_term_variability',	'percentage_of_time_with_abnormal_long_term_variability',
              'mean_value_of_long_term_variability',	'histogram_width',	'histogram_min',	'histogram_max',	'histogram_number_of_peaks',
              'histogram_number_of_zeroes',	'histogram_mode',	'histogram_mean',	'histogram_median',	'histogram_variance',	'histogram_tendency']
results = pd.read_excel('resultsSignsFetalMSPMS_NoMinMax.xlsx')
noOfIterations = 20

model = XGBClassifier()
# fit the model
model.fit(X, Y)
# get importance
importance = model.feature_importances_
# summarize feature importance
for i in range(len(feature_names)):
    print(importance[i])

df_LIME = pd.read_excel('rankingsFetalLIME.xlsx')
df_LIME = df_LIME[df_LIME.columns[1:]]
W_LIME = []
df_TreeShap = pd.read_excel('rankingsFetalTreeShap.xlsx')
df_TreeShap = df_TreeShap[df_TreeShap.columns[1:]]
df_KernelShap = pd.read_excel('rankingsFetalKernelShap.xlsx')
df_KernelShap = df_KernelShap[df_KernelShap.columns[1:]]
W_Shap = []
W_all = []

for instance in range (0, noOfInstances):
    m = pd.DataFrame(columns = ['L', 'S'])
    line_LIME  = [results.iloc[instance, 1], results.iloc[instance, 2]]
    m.loc[0] = line_LIME
    line_Shap  = [results.iloc[instance, 3], results.iloc[instance, 4]]
    m.loc[1] = line_Shap
    A = []
    for i in range(0, 2):
        s = 0
        for j in range(0, 2):
            s = s + m.iloc[i, j] + m.iloc[j, i]
        s = s / 4
        A.append(s)
  
    data_LIME = []
    data_Shap = []
    for i in range(noOfIterations):
        data_LIME.append(df_LIME.loc[instance * noOfIterations + i].values.flatten().tolist())
        
    for i in range(int(noOfIterations / 2)):
        data_Shap.append(df_KernelShap.loc[instance].values.flatten().tolist())
        data_Shap.append(df_TreeShap.loc[instance].values.flatten().tolist())
        
    M_LIME  = pd.DataFrame(data_LIME, columns = feature_names)
    w_LIME = []
    M_Shap  = pd.DataFrame(data_Shap, columns = feature_names)
    w_Shap = []
    
    for i in range(noOfIterations):
        x = [0] * len(feature_names)
        for k in range(0, 2):
            x = x + A[k] * M_LIME.loc[i].values.flatten()
        y = sum(A)
        w = [0] * len(feature_names)
        for k in range(len(feature_names)):
            w[k] = x[k] / y
        w_LIME.append(w)

        x = [0] * len(feature_names)
        for k in range(0, 2):
            x = x + A[k] * M_Shap.loc[i].values.flatten()
        y = sum(A)
        w = [0] * len(feature_names)
        for k in range(len(feature_names)):
            w[k] = x[k] / y
        w_Shap.append(w)
        
    averageLIME = np.mean(w_LIME, axis=0)
    W_LIME.append(averageLIME)
    averageShap = np.mean(w_Shap, axis=0)
    W_Shap.append(averageShap)
    averageMatrix = [averageLIME]
    average = np.mean(averageMatrix, axis=0)
    sum_average = sum(average)
    sum_corr = sum(corr) - 1
    for i in range(0, len(feature_names)):
        #average[i] = (average[i] / sum_average + corr.iloc[i] / sum_corr) / 2
        #average[i] = average[i] * corr.iloc[i]
        #average[i] = average[i] * (corr.iloc[i] / sum_corr)
        #average[i] = average[i] + corr.iloc[i]
        average[i] = average[i] 
    W_all.append(average)
    # print(averageLIME)
    # print(averageAnchor)
    # print(averageShap)
    # print(average)
    # print("============")

average_all = np.nanmean(W_all, axis=0)
print(average_all)


def compute_weights(distance):
    weights = []
    for i in range(len(distance)): 
        weights.append(norm(W_all[i]) * distance[i])
    return weights

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
    
# knn = KNeighborsClassifier(n_neighbors = 5)
# knn.fit(X_train,Y_train) 
# Y_pred = knn.predict(X_test)
# print(accuracy_score(Y_test,Y_pred))
# print(f1_score(Y_test,Y_pred))

#X2 = scaler.fit_transform(X)
scale_pos_weight = counter[0] / counter[1]
model = xgb.XGBClassifier(scale_pos_weight = scale_pos_weight)
# model.fit(X_train, Y_train)
# pred = model.predict(X_test)
# accuracy = accuracy_score(Y_test, pred)
# print("Accuracy:", accuracy)
scores = cross_val_score(model, X, Y, cv = 10, scoring='roc_auc')
print(np.mean(scores))


for i in range(noOfInstances):
    for j in range(len(feature_names)):
        X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])


X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, shuffle = True)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

#X = scaler.fit_transform(X)
#knn = KNeighborsClassifier(n_neighbors = 5, weights = 'uniform')
model = xgb.XGBClassifier(scale_pos_weight = scale_pos_weight)
scores = cross_val_score(model, X, Y, cv = 10, scoring='roc_auc')
print(np.mean(scores))


# knn = KNeighborsClassifier(n_neighbors = 5)
# knn.fit(X_train,Y_train) 
# Y_pred = knn.predict(X_test)
# print(accuracy_score(Y_test,Y_pred))
# print(f1_score(Y_test,Y_pred))
    
# knn = KNeighborsClassifier(n_neighbors = 5, weights = compute_weights)
# knn.fit(X_train,Y_train) 
# Y_pred = knn.predict(X_test)
# print(accuracy_score(Y_test,Y_pred))
# print(f1_score(Y_test,Y_pred))
# print("---------------")

# knn = KNeighborsClassifier(n_neighbors = 5, weights = compute_weights)
# scores = cross_val_score(knn, X, Y, cv=20)
# print(scores)
# print(np.mean(scores))

# cv = LeaveOneOut()
# s = 0
# for train_ix, test_ix in cv.split(X):
#     knn = KNeighborsClassifier(n_neighbors = 5, weights = compute_weights)
#     X_train, X_test = X.to_numpy()[train_ix, :], X.to_numpy()[test_ix, :]
#     Y_train, Y_test = Y[train_ix], Y[test_ix]
#     knn.fit(X_train, Y_train)  
#     pred = knn.predict(X_test)
#     accuracy_classifier = accuracy_score(Y_test, pred)
#     s = s + accuracy_classifier
# print(s / len(X))

def euclidian_distance(a, b):
    dim = len(a)
    distance = 0
    for i in range(dim):
        distance += abs(a[i] - b[i])**2  
    distance = math.sqrt(distance)    
    return distance


def knn_predict_non_weighted(X_train, X_test, Y_train, Y_test, k):
    
    from collections import Counter
    Y_hat_test = []

    for test_point in X_test:
        distances = []
        for train_point in X_train:
            dim = len(train_point)
            distance = 0
            for i in range(dim):
                distance = distance + math.sqrt((train_point[i] - test_point[i]) * (train_point[i] - test_point[i]))
            distances.append(distance)

        df_dists = pd.DataFrame(data = distances, columns = ['dist'], 
                                index = Y_train.index)

        df_nn = df_dists.sort_values(by=['dist'], axis = 0)[:k]

        counter = Counter(Y_train[df_nn.index])

        prediction = counter.most_common()[0][0]

        Y_hat_test.append(prediction)
        
    return Y_hat_test



def knn_predict_weighted(X_train, X_test, Y_train, Y_test, k):
    
    from collections import Counter
    Y_hat_test = []

    for test_point in X_test:
        distances = []
        for train_point in X_train:
            dim = len(train_point)
            distance = 0
            for i in range(dim):
                distance = distance + math.sqrt(average_all[i] * (train_point[i] - test_point[i]) * (train_point[i] - test_point[i]))
            distances.append(distance)

        df_dists = pd.DataFrame(data = distances, columns = ['dist'], 
                                index = Y_train.index)

        df_nn = df_dists.sort_values(by=['dist'], axis = 0)[:k]

        counter = Counter(Y_train[df_nn.index])

        prediction = counter.most_common()[0][0]

        Y_hat_test.append(prediction)
        
    return Y_hat_test

outputs = pd.DataFrame(columns = ['ACC_WL', 'F1_WL', 'ROC_WL'])
acc_non_weighted = 0
acc_weighted = 0
f1_non_weighted = 0
f1_weighted = 0
roc_non_weighted = 0
roc_weighted = 0
for i in range(number_runnings):
    output = []
    #print('------------------')
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, shuffle = True)
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)
    
    Y_hat_test = knn_predict_non_weighted(X_train, X_test, Y_train, Y_test, k=5)
    #print(f1_score(Y_test, Y_hat_test))
    acc_non_weighted += accuracy_score(Y_test, Y_hat_test)
    output.append(accuracy_score(Y_test, Y_hat_test))
    f1_non_weighted += f1_score(Y_test, Y_hat_test)
    output.append(f1_score(Y_test, Y_hat_test))
    roc_non_weighted += roc_auc_score(Y_test, Y_hat_test)
    output.append(roc_auc_score(Y_test, Y_hat_test))
    
    outputs.loc[i] = output


acc_non_weighted = acc_non_weighted / number_runnings
acc_weighted = acc_weighted / number_runnings
f1_non_weighted = f1_non_weighted / number_runnings
f1_weighted = f1_weighted / number_runnings
roc_non_weighted = roc_non_weighted / number_runnings
roc_weighted = roc_weighted / number_runnings
output = []
output.append(acc_non_weighted)
output.append(f1_non_weighted)
output.append(roc_non_weighted)


outputs.loc[number_runnings] = output

outputs.to_excel("Signs_accuraciesDiabetesMSPMS_NoMinMax_LIME_" + str(number_runnings) + ".xlsx")

# cv = LeaveOneOut()
# s_non_weighted = 0
# s_weighted = 0
# for train_ix, test_ix in cv.split(X):
#     X_train, X_test = X.to_numpy()[train_ix, :], X.to_numpy()[test_ix, :]
#     Y_train, Y_test = Y[train_ix], Y[test_ix]
#     scaler = StandardScaler()
#     X_train = scaler.fit_transform(X_train)
#     X_test = scaler.transform(X_test)
#     Y_hat_test_non_weighted = knn_predict_non_weighted(X_train, X_test, Y_train, Y_test, k=5)
#     s_non_weighted = s_non_weighted + accuracy_score(Y_test, Y_hat_test_non_weighted)
#     Y_hat_test_weighted = knn_predict_weighted(X_train, X_test, Y_train, Y_test, k=5)
#     s_weighted = s_weighted + accuracy_score(Y_test, Y_hat_test_non_weighted)
# print(s_non_weighted / len(X))
# print(s_weighted / len(X))

2126
baseline value                                            0.243235
accelerations                                            -0.378693
fetal_movement                                            0.062202
uterine_contractions                                     -0.270586
light_decelerations                                      -0.078528
severe_decelerations                                           NaN
prolongued_decelerations                                  0.351735
abnormal_short_term_variability                           0.483140
mean_value_of_short_term_variability                     -0.030078
percentage_of_time_with_abnormal_long_term_variability    0.478808
mean_value_of_long_term_variability                      -0.072185
histogram_width                                          -0.217677
histogram_min                                            -0.053095
histogram_max                                             0.000216
histogram_number_of_peaks                                

  c /= stddev[:, None]
  c /= stddev[None, :]


0.16361637
0.039452665
0.055589207
0.030210951
0.0
0.15426126
0.1664445
0.021587783
0.07321094
0.021267923
0.01470931
0.01915477
0.026327614
0.016911361
0.036992695
0.046137966
0.068591766
0.025282377
0.020250501
0.0
[ 7.60823529  6.68941176  5.90724706 10.30934118 11.          9.99884706
  8.55334118  7.6352      7.29861176  7.81555294 10.55978824  5.29310588
  7.56623529  7.22437647  9.60249412  7.33661176  6.24590588  7.65237647
  9.70331765 11.        ]
0.9352789624002134


  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])
  X.iloc[i, j] = X.iloc[i, j] * math.sqrt(W_all[i][j])


0.9285958728531147
