## Autoencoder ML results

In [None]:
from sources.ml_f1 import*
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
import pandas as pd

# ML models  
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import plot_importance
from sklearn.utils import resample

In [None]:
# Input tables: class labels, UMAP coordinates and the main feature table.
# All three are plain CSV files read relative to the working directory.
y = pd.read_csv('normalised/all_color_y.csv')
X_umap = pd.read_csv('X_umap_feats.csv')
X_data1 = pd.read_csv('normalised/X_auto_feats.csv')

In [None]:
# Notebook echo: display the label table read from 'normalised/all_color_y.csv'.
y

In [None]:
# Copy the two UMAP coordinate columns into the main feature table so they
# can be selected by name alongside the other features.
for umap_col in ("umap1", "umap2"):
    X_data1[umap_col] = X_umap[umap_col]


In [None]:
# Notebook echo: display the feature table (now including 'umap1' / 'umap2').
X_data1

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of the feature table and wrap the resulting
# array back into a DataFrame that keeps the original column names.
# NOTE(review): the scaler is fitted on all rows, i.e. before the train/test
# split performed in a later cell, so test rows influence the scaling range.
# Confirm this is acceptable for the reported scores.
scaler = MinMaxScaler()
X_norm = scaler.fit(X_data1).transform(X_data1)
X_data = pd.DataFrame(data=X_norm, columns=X_data1.columns)

# X_data  = X_data.drop("index", axis="columns")

In [None]:
# X_data = X_data1

In [None]:
# Hold out 25% of the rows as the final test set. The split is shuffled,
# stratified on the labels and seeded so it is reproducible.
X_train1, X_test, y_train1, y_test = train_test_split(
    X_data,
    y,
    test_size=0.25,
    shuffle=True,
    stratify=y,
    random_state=1,
)


In [None]:
## Random forest (RF)
# Candidate hyper-parameter values. Only `n_estimators` is part of the active
# search grid; the remaining lists are defined but not searched.
n_estimators = [50, 100, 150]    # number of trees in the forest
max_features = [2, 3]            # number of features to consider at every split
max_depth = [5, 10]              # maximum number of levels in a tree
min_samples_split = [2, 5]       # minimum number of samples required to split a node
min_samples_leaf = [1, 3]        # minimum number of samples required at each leaf node
bootstrap = [True, False]        # method of selecting samples for training each tree

rf_model = RandomForestClassifier(random_state=1)

# Active grid. A wider grid that also held 'max_features' and 'max_depth'
# was previously built here and then immediately overwritten by this
# assignment, so only the number of trees has ever been searched.
rf_par = {'n_estimators': n_estimators}

In [None]:
# Support Vector Machine (SVM)
# NOTE(review): the grid searches `gamma`, but the estimator uses a linear
# kernel; per the scikit-learn SVC documentation `gamma` only applies to the
# 'rbf', 'poly' and 'sigmoid' kernels, so every grid point would fit the same
# model. Decide whether the kernel or the grid should change before use.
svm_par = dict(gamma=np.linspace(0.0001, 10, 15))
svm_model = SVC(kernel='linear')


In [None]:
# k-nearest neighbours (KNN): estimator plus its search grid.
knn_model = KNeighborsClassifier()
knn_par = dict(
    n_neighbors=[5, 10, 15],
    p=[1, 2],
    weights=['uniform', 'distance'],
)

## Logistic regression (LR): candidate solvers, penalty and regularisation
## strengths, followed by the estimator and its search grid.
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]

lr_model = LogisticRegression()
lr_par = {'solver': solvers, 'penalty': penalty, 'C': c_values}

In [None]:
# Estimators to evaluate, each paired with the short name used as its key in
# the results dictionary. `parameters` holds the matching search grids in the
# same order, so the two lists can be zipped together.
# Note that the SVM estimator and grid are not part of either list.
models = [
    [lr_model, 'lr'],
    [knn_model, 'knn'],
    [rf_model, 'rf'],
]

parameters = [lr_par, knn_par, rf_par]

In [None]:
# Results container, filled by the training loop: model name -> feature-set key -> scores.
ml_dicts = dict()

In [None]:
# Feature sets to evaluate; each entry is a list of column names and is
# stored under the keys "F1", "F2", ... in the order given here.
# Alternatives that are currently disabled:
#   ['qir']
#   ['qir', 'class_star']
#   ['qir', 'class_star', 'log(S8/S45)']
#   ['qir', 'class_star', 'log(S8/S45)', 'log(S58/S36)']
#   ['qir', 'class_star', 'log(S8/S45)', 'log(S58/S36)', 'Mstar', 'log(S45/S36)']
#   ['qir', 'class_star', 'Mstar', 'log(S45/S36)']
features = [
    ['qir', 'class_star', 'log(S8/S45)', 'log(S58/S36)', 'Mstar'],
    # 'qir' combined with each pair of embedding coordinates
    *(['qir', prefix + '1', prefix + '2'] for prefix in ('auto', 'umap')),
]

In [None]:
splits = [0.2]  # other validation fractions that were tried: 0.4, 0.6, 0.8

# Evaluate every (model, grid) pair on every feature set, using the same
# seeded train/validation split for all feature sets.
# (An earlier variant split X_balanced / y_balanced instead of X_train1 / y_train1.)
# NOTE(review): the per-model dictionary is re-created for each entry of
# `splits`, so with more than one split only the last one is kept.
for (estimator, label), par in zip(models, parameters):
    for s in splits:
        X_train, X_vald, y_train, y_vald = train_test_split(
            X_train1,
            y_train1,
            test_size=s,
            random_state=1,
            stratify=y_train1,
            shuffle=True,
        )
        key0 = str(label)
        print(key0)
        ml_dicts[key0] = {}  # top-level keys are the model names

        for i, f in enumerate(features, start=1):
            # get_f1_ml comes from the project helper module; the first five
            # items of its return value are stored under the names below.
            results = get_f1_ml(
                estimator, par,
                X_train[f], y_train,
                X_vald[f], y_vald,
                X_test[f], y_test,
            )

            # "F1", "F2", ... identify the feature sets in later cells
            ml_dicts[key0]["F" + str(i)] = {
                'tot_f1_vald': results[0],
                'tot_f1_test': results[1],
                'jack_train': results[2],
                'jack_vald': results[3],
                'jack_test': results[4],
            }

In [None]:
# Flatten the nested results into one entry per model:
#   [feature keys, validation F1, test F1, validation SD, test SD]
arr_all = []
for _, model_key in zip(models, ml_dicts):
    per_feature = ml_dicts[model_key]

    # total validation / test F1 score of every feature set
    f1_arr_vald = [entry['tot_f1_vald'] for entry in per_feature.values()]
    f1_arr_test = [entry['tot_f1_test'] for entry in per_feature.values()]

    sd_vald_arr = []
    sd_arr = []
    for entry in per_feature.values():
        # jack_SD is a project helper (presumably a jackknife standard
        # deviation -- confirm); it is called with a zero array of matching
        # length and only the first item of its result is used.
        sd_train, sd_vald, sd_test = (
            jack_SD(np.zeros(len(entry[name])), entry[name])[0]
            for name in ('jack_train', 'jack_vald', 'jack_test')
        )

        # combine the training SD in quadrature with the validation / test SD
        sd_vald_arr.append(np.sqrt(sd_train**2 + sd_vald**2))
        sd_arr.append(np.sqrt(sd_train**2 + sd_test**2))

    arr_all.append([list(per_feature), f1_arr_vald, f1_arr_test, sd_vald_arr, sd_arr])


In [None]:
# Two stacked panels sharing both axes: validation F1 on top, test F1 below.
# Every model gets its own colour and its own block of x positions.
colors = ['blue', 'green', 'orange', 'red']
fig, axs = plt.subplots(2, figsize=(15, 9), sharex=True, sharey=True)

n = 5  # width of the x-axis block reserved for each model

space = []     # x positions of all plotted points, in plotting order
tickFeat = []  # feature-set keys matching `space`

for count, (result, model, color) in enumerate(zip(arr_all, models, colors)):
    # len(features) evenly spaced positions inside this model's block
    a = np.linspace(n * count, n * (1 + count) - 2, len(features))
    space.extend(a)
    tickFeat.extend(result[0])

    # result = [keys, F1 vald, F1 test, SD vald, SD test]
    axs[0].errorbar(a, result[1], result[3], fmt='o', label=model[1], color=color)
    axs[1].errorbar(a, result[2], result[4], fmt='o', label=model[1], color=color)

# Axis decoration does not depend on the model, so it is applied once here
# rather than on every loop iteration.
axs[0].set_title("F1 Score for different features with for Machine learning models", fontweight='bold', fontsize=12)
axs[0].set_ylabel("F1 score(vald)", fontweight='bold', fontsize=12)
axs[0].set_ylim(.80, 1)
axs[0].legend(loc='lower left')

axs[1].set_xlabel("Features", fontweight='bold', fontsize=12)
axs[1].set_ylabel("F1 score(test)", fontweight='bold', fontsize=12)
axs[1].set_ylim(.80, 1)
axs[1].legend(loc='lower left')

plt.xticks(space, tickFeat, rotation='vertical', fontsize=12)
plt.savefig('normalised/ml_photo')
plt.show()


In [None]:
# Print the feature-set keys collected for the x-axis ticks of the plot above.
print(tickFeat)

In [None]:
# Notebook echo: display the per-model result lists
# ([keys, F1 vald, F1 test, SD vald, SD test] for each model).
arr_all

In [None]:
# Single-panel figure of the validation F1 score (with error bars) for every
# model and feature set, saved as a PDF.
colors = ['blue', 'green', 'orange']

plt.figure(figsize=(10, 7))

n = 5  # width of the x-axis block reserved for each model
space = []     # x positions of all plotted points, in plotting order
tickFeat = []  # feature-set keys matching `space`

# NOTE(review): the third legend entry reads 'SVM' while the third entry of
# `models` is named 'rf', and the last tick label reads 'F-tSNE' while the
# third feature set uses the 'umap1'/'umap2' columns. Confirm the labels.
modelname = ['LR', r'$k$NN', 'SVM']
feature_labels = ['F5', 'F-Auto', 'F-tSNE']

plotted = 0
for count, (result, _model, color) in enumerate(zip(arr_all, models, colors)):
    # len(features) evenly spaced positions inside this model's block
    a = np.linspace(n * count, n * (1 + count) - 2, len(features))
    space.extend(a)
    tickFeat.extend(result[0])
    plt.errorbar(a, result[1], result[3], fmt='o', label=modelname[count], color=color)
    plotted = count + 1

# Axis decoration is independent of the model, so it is applied once.
# plt.title( "F1 Score on validation dataset for different features with the SD", fontweight ='bold', fontsize =12)
plt.ylabel("F1 score", fontsize=20)
plt.ylim(.80, 1)
plt.legend(loc='lower left', fontsize=18)

# One set of feature labels per plotted model keeps the number of labels in
# step with the number of tick positions. If the feature sets no longer match
# the three hand-written labels, fall back to the generic "F1", "F2", ... keys
# instead of failing on a length mismatch.
x_label = feature_labels * plotted
if len(x_label) != len(space):
    x_label = tickFeat

plt.xticks(space, x_label, rotation='vertical', fontsize=18)
plt.yticks(fontsize=20)

plt.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.savefig('autoecoder-tsne-ML.pdf')
plt.show()



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# # 9 features Not normalised
# data = [[['F1', 'F2', 'F3'],
#   [0.9460093896713615, 0.9391100702576113, 0.9191090269636577],
#   [0.93342776203966, 0.9347517730496453, 0.9036827195467423],
#   [0.011403668046981797, 0.013822182484723958, 0.015614808860477473],
#   [0.010608843085923086, 0.012886083375339424, 0.014668605038853161]],
#  [['F1', 'F2', 'F3'],
#   [0.9519343493552168, 0.9181818181818182, 0.8464912280701753],
#   [0.9516358463726884, 0.9138525155065471, 0.8526816021724372],
#   [0.03377880888508106, 0.010171642721227364, 0.013640417659187698],
#   [0.033452533066699076, 0.00834244994083523, 0.010928097541911623]],
#  [['F1', 'F2', 'F3'],
#   [0.9702734839476814, 0.9386792452830188, 0.9679715302491104],
#   [0.9655172413793104, 0.9441260744985674, 0.9563350035790981],
#   [0.020886209650073755, 0.04095841441390338, 0.017753031753787517],
#   [0.020676647382497606, 0.0405593230821267, 0.017462551526233407]]]

# 9 features normalised
# Hard-coded results table with one entry per model. Each entry holds the
# feature-set keys followed by four numeric lists of the same length; the
# code below loads them into the columns 'Mean', 'Median', 'Std' and 'SEM'.
# NOTE(review): the layout looks identical to the `arr_all` entries built
# earlier in this notebook ([keys, F1 vald, F1 test, SD vald, SD test]), so
# these values were presumably pasted from a previous run -- confirm.
data = [[['F1', 'F2', 'F3'],
  [0.930774, 0.9397874852420307, 0.9178403755868544],
  [0.930298719772404, 0.9358974358974359, 0.9043231750531537],
  [0.009310, 0.009029540429940724, 0.014087855214666304],
  [0.007344375381009317, 0.007491645451451747, 0.013031246394187438]],
 [['F1', 'F2', 'F3'],
  [0.955497, 0.9473684210526315, 0.9388696655132642],
  [0.9426573426573426, 0.9434229137199434, 0.9406009783368273],
  [0.004061, 0.008962371110196318, 0.0315068687902453],
  [0.009615402435801501, 0.007667072818743167, 0.030958490350562768]],
 [['F1', 'F2', 'F3'],
  [0.934688, 0.9374262101534828, 0.9679715302491104],
  [0.9647735442127965, 0.9441260744985674, 0.9563350035790981],
  [0.004905, 0.04024028035083308, 0.017753031753787517],
  [0.020999849951083373, 0.03982968916039549, 0.017462551526233407]]]

# Normalised data
# data = [
#     [['F1', 'F2', 'F3'],
#      [0.930774, 0.9397874852420307, 0.9248826291079812],
#      [0.930298719772404, 0.9358974358974359, 0.9162561576354681],
#      [0.009310, 0.009029540429940724, 0.013427934722864947],
#      [0.007344375381009317, 0.007491645451451747, 0.012292397127908219]],
#     [['F1', 'F2', 'F3'],
#      [0.955497, 0.9473684210526315, 0.9358226371061844],
#      [0.9426573426573426, 0.9434229137199434, 0.9361702127659576],
#      [0.004061, 0.008962371110196318, 0.009120083874880069],
#      [0.009615402435801501, 0.007667072818743167, 0.00732350647707195]],
#     [['F1', 'F2', 'F3'],
#      [0.934688, 0.9374262101534828, 0.9679715302491104],
#      [0.9647735442127965, 0.9441260744985674, 0.9544468546637743],
#      [0.004905, 0.04024028035083308, 0.014877443069406228],
#      [0.020999849951083373, 0.03982968916039549, 0.014560495784041798]]
# ]


# Original data
# data = [
#     [['F1', 'F2', 'F3'],
#   [0.930774, 0.9391100702576113, 0.9259694477085781],
#   [0.93342776203966, 0.9347517730496453, 0.9162561576354681],
#   [0.009310, 0.013822182484723958, 0.012637593810675722],
#   [0.010608843085923086, 0.012886083375339424, 0.011474169134909801]],
#  [['F1', 'F2', 'F3'],
#   [0.955497, 0.9181818181818182, 0.851528384279476],
#   [0.9516358463726884, 0.9138525155065471, 0.8505747126436781],
#   [0.004061, 0.010171642721227364, 0.013258153663161566],
#   [0.033452533066699076, 0.00834244994083523, 0.010766954009748607]],
#  [['F1', 'F2', 'F3'],
#   [0.934688, 0.9386792452830188, 0.9679715302491104],
#   [0.9655172413793104, 0.9441260744985674, 0.9544468546637743],
#   [0.004905, 0.04095841441390338, 0.014877443069406228],
#   [0.020676647382497606, 0.0405593230821267, 0.014560495784041798]]
# ]
# Convert the hard-coded results table to one long DataFrame (one row per
# model / feature-set pair) and plot the 'Mean' column with 'Std' error bars.
# NOTE(review): if `data` was pasted from `arr_all`, the columns named
# 'Median' and 'SEM' actually hold the test F1 score and its SD -- confirm
# before relying on those column names.
# model_names = ['LR', 'kNN', 'SVM']
model_names = ['LR', r'$k$NN', 'SVM']
colors = ['blue', 'green', 'orange']

dfs = [
    pd.DataFrame({
        'Feature': model_data[0],
        'Mean': model_data[1],
        'Median': model_data[2],
        'Std': model_data[3],
        'SEM': model_data[4],
        'Model': model_names[i],   # scalar, repeated on every row
        'Color': colors[i],
    })
    for i, model_data in enumerate(data)
]

df = pd.concat(dfs, ignore_index=True)

# Plotting
plt.figure(figsize=(10, 7))

# Create x-axis positions
n_models = len(model_names)
# Feature-set keys of the table ('F1', 'F2', 'F3'). Kept under its own name:
# assigning them to `features` would overwrite the notebook-level list of
# feature column names that the training cell iterates over, breaking any
# re-run of that cell after this one.
feature_keys = list(data[0][0])
n_feats = len(feature_keys)
x_label = ['F5', 'F-Auto', 'F-tSNE'] * n_models

# Each model gets a block of width 5 with n_feats evenly spaced positions.
x_pos = []
for i in range(n_models):
    x_pos.extend(np.linspace(5 * i, 5 * (i + 1) - 2, n_feats))

# Plot each model in its own block and colour
for i, model in enumerate(model_names):
    model_df = df[df['Model'] == model]
    plt.errorbar(
        x_pos[i * n_feats:(i + 1) * n_feats],
        model_df['Mean'],
        yerr=model_df['Std'],
        fmt='o',
        label=model,
        color=colors[i],
        capsize=5,
    )

plt.ylabel("F1 score", fontsize=20)
plt.ylim(0.80, 1)
plt.legend(loc='lower left', fontsize=18)
plt.xticks(x_pos, x_label, rotation='vertical', fontsize=18)
plt.yticks(fontsize=20)
plt.grid(True, linestyle='--', alpha=0.4)
plt.tight_layout()
plt.savefig('autoecoder-tsne-ML-norm.png')
plt.show()

In [None]:
# Notebook echo: display the combined per-model results DataFrame.
df

In [None]:
# NOTE(review): bare reference to the built-in `print` -- nothing is called
# here, so this cell only echoes the function object. Looks like a leftover.
print