## We now compare the results of each notebook in experiment 6

In [1]:
from sources.ml_f1 import*
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
import pandas as pd

# ML models  
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import plot_importance
from sklearn.utils import resample

In [2]:
# We call the test and train data saved from the processing notebook
%store -r base_dict
# mightee_data = pd.read_csv('raw_data.csv')
mightee_data = pd.read_csv('normalised/scaled_raw_zs.csv')

# The train data
# X_data = pd.read_csv('X_train_table.csv')
# y_data = pd.read_csv('y_train_table.csv')
# # The Unseen test data
# X_test = pd.read_csv('X_test_table.csv')
# y_test = pd.read_csv('y_test_table.csv')
# The train data
X_data = pd.read_csv('normalised/X_train_zs.csv')
y_data = pd.read_csv('normalised/y_train_zs.csv')

# The Unseen test data
X_test = pd.read_csv('normalised/X_test_zs.csv')
y_test = pd.read_csv('normalised/y_test_zs.csv')

y = y_data['labels']

In [3]:
# Combine features and labels for sampling
data = pd.concat([X_data, y], axis=1)

# Separate classes
class_counts = y.value_counts()
majority_class = class_counts.idxmax()
minority_class = class_counts.idxmin()

df_majority = data[data['labels'] == majority_class]
df_minority = data[data['labels'] == minority_class]

# Downsample majority class
df_majority_downsampled = resample(df_majority,
                                   replace=False,    # sample without replacement
                                   n_samples=len(df_minority),  # to match minority class
                                   random_state=42)  # reproducible results

# Combine minority class with downsampled majority class
df_balanced = pd.concat([df_majority_downsampled, df_minority])

# Split back into X and y
X_balanced = df_balanced.drop('labels', axis=1)
y_balanced = df_balanced['labels']

In [4]:
# saving the dataframe as raw_data
X_balanced.to_csv('normalised/X_train_bal.csv', index = False, header=True)
y_balanced.to_csv('normalised/y_train_bal.csv', index = False, header=True)

In [5]:
type(y_data['labels'][1])

numpy.int64

In [6]:
len(y_data['labels'])

3209

In [7]:
print("Length of original SFGs", len(y_data['labels']==1))
print("Length of original AGN", len(y_data['labels']==0))
print("Length of balanced SFGs", len(df_balanced['labels']==1))
print("Length of balanced AGN", len(df_balanced['labels']==0))

Length of original SFGs 3209
Length of original AGN 3209
Length of balanced SFGs 2288
Length of balanced AGN 2288


---

### Hyperparameters

#### RF

In [8]:
## random forest (RF)
# The Random Hyper parameter Grid

# number of trees in the forest
n_estimators = [50, 100, 150]

# Number of feature to consider at every split
max_features = [2, 3]

# Maximum number of levels in tree
max_depth = [5, 10]

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 3]

# Method of selecting samples for training each tree
bootstrap = [True, False]


# Create the random grid
rf_par = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
               # 'min_samples_split': min_samples_split,
               # 'min_samples_leaf': min_samples_leaf,
               # 'bootstrap': bootstrap}



rf_model= RandomForestClassifier(random_state=1)
rf_par = dict(n_estimators=n_estimators)

#### SVM

In [9]:
# Super Vector Machines
svm_model = SVC(kernel='linear')


svm_par = {'gamma': np.linspace(0.0001, 10, 15)}


#### KNN and LR

In [10]:
# KNN model
knn_model = KNeighborsClassifier()

## KNN parameters
knn_par = {'n_neighbors' : [5, 10, 15], 'p':[1, 2], 'weights' : ['uniform', 'distance'] }


## logisitc regression (LR)
solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [1000, 100, 10, 1.0, 0.1, 0.01, 0.001]

lr_model = LogisticRegression()
lr_par = dict(solver=solvers,penalty=penalty,C=c_values)

In [11]:
# Set up models and Parameters for a "for loop"  

models = [[lr_model, 'lr'], [knn_model, 'knn'], [svm_model, 'svm']] #, [rf_model, 'rf']]

parameters = [ lr_par, knn_par, svm_par] #, rf_par]

In [12]:
ml_dicts = {}

---

In [13]:
features = [['qir'], 
            ['qir', 'class_star'],
            ['qir', 'class_star', 'log(S8/S45)'],
            ['qir', 'class_star', 'log(S8/S45)','log(S58/S36)'],
            ['qir', 'class_star', 'log(S8/S45)','log(S58/S36)', 'Mstar'],
            ['qir', 'class_star', 'log(S8/S45)','log(S58/S36)', 'Mstar', 'log(S45/S36)'],
            ['qir', 'class_star', 'Mstar', 'log(S45/S36)']]

In [14]:
splits = [0.2] #,0.4,0.6,0.8]

# Loop through different ML models coupled with thier hyper paramter (use the same splits for all features)
for m, par in zip(models, parameters):
    for s in splits:
        # X_train, X_vald, y_train, y_vald = train_test_split(X_balanced, y_balanced, test_size= s, random_state=1, stratify = y_balanced, shuffle = True)
        X_train, X_vald, y_train, y_vald = train_test_split(X_data, y, test_size= s, random_state=1, stratify = y, shuffle = True)
        key0 = str(m[1])
        print(key0)
        ml_dicts[key0] = {} # defining The main subkeys, which are the machine learning models
        
        i = 1
        for f in features:
            xtr =  X_train[f]
            xva =  X_vald[f]
            xte =  X_test[f]
            
            results = get_f1_ml (m[0], par, xtr, y_train, xva, y_vald, xte, y_test) # to get the f1 for the ml model

            key = "F"+str((i)) # Create keys for the each feature set in order to reference results
            ml_dicts[key0][key] = {}

            ml_dicts[key0][key]['tot_f1_vald'] = results[0]
            ml_dicts[key0][key]['tot_f1_test'] = results[1]
            ml_dicts[key0][key]['jack_train'] = results[2]
            ml_dicts[key0][key]['jack_vald'] = results[3]
            ml_dicts[key0][key]['jack_test'] = results[4]
            i += 1

lr
knn
svm


In [15]:
arr_all = []
for m, d in zip (models, ml_dicts.keys()):
    f1_arr_vald = []
    f1_arr_test = []
    sd_vald_arr = []
    sd_arr = [] 
    
    # print(ml_dicts[d])
    for key in ml_dicts[d].keys():
        f1_arr_vald.append(ml_dicts[d][key][ 'tot_f1_vald' ]) # append total valdation f1 score to an array
        f1_arr_test.append(ml_dicts[d][key][ 'tot_f1_test' ]) # append total test f1 score to an array
        
        sd_train = jack_SD(np.zeros( len(ml_dicts[d][key][ 'jack_train' ]) ), ml_dicts[d][key][ 'jack_train' ])[0]
        sd_vald = jack_SD(np.zeros( len(ml_dicts[d][key][ 'jack_vald' ]) ), ml_dicts[d][key][ 'jack_vald' ])[0]
        sd_test = jack_SD(np.zeros( len(ml_dicts[d][key][ 'jack_test' ]) ), ml_dicts[d][key][ 'jack_test' ])[0]
        
        sd_v = np.sqrt( np.array((sd_train**2)) + np.array((sd_vald**2)))
        sd = np.sqrt( np.array((sd_train**2)) + np.array((sd_test**2)))
       
        sd_vald_arr.append(sd_v)
        sd_arr.append(sd)
        # append the SD to the sd_arr
    arr_all.append([ list(ml_dicts[d].keys()), f1_arr_vald, f1_arr_test, sd_vald_arr, sd_arr])    


In [16]:
total_sample = len(X_train) + len(X_vald) + len(X_test)

# fractions
tr = (len(X_train)/total_sample) * 100
tv = (len(X_vald)/total_sample) * 100 
t = (len(X_test)/total_sample) * 100

print('Total sample', total_sample)
print('Length of train', len(X_train))
print('Length of validation', len(X_vald))
print('Length of test', len(X_test))

print('Samples Fractions: [train(%): {0:.3f} vald(%): {1:.3f} test(%): {2:.3f}]'.format(tr,tv,t))

Total sample 4279
Length of train 2567
Length of validation 642
Length of test 1070
Samples Fractions: [train(%): 59.991 vald(%): 15.004 test(%): 25.006]


In [None]:
colors = ['blue', 'green', 'orange', 'red']
fig, axs = plt.subplots(2, figsize=(15, 9), sharex=True, sharey =True)

count = 0
n = 5

space = []
tickFeat = []

for result, model, color in zip(arr_all, models, colors):
    a = np.linspace(n*count, n*(1+count)-2,len(features)) # to get index on the x-axis
    space.extend(a)
    tickFeat.extend(result[0])
    axs[0].errorbar( a, result[1], result[3], fmt='o', label =model[1], color = color)
    axs[0].set_title( "F1 Score for different features with for Machine learning models", fontweight ='bold', fontsize =12)
    axs[0].set_ylabel("F1 score(vald)", fontweight ='bold', fontsize =12)
    axs[0].set_ylim(.80, 1)
    axs[0].legend(loc = 'lower left')
    
    axs[1].errorbar( a, result[2], result[4], fmt='o', label =model[1], color = color)
    axs[1].set_xlabel("Features", fontweight ='bold', fontsize =12)
    axs[1].set_ylabel("F1 score(test)", fontweight ='bold', fontsize =12)
    axs[1].set_ylim(.80, 1)
    axs[1].legend(loc = 'lower left')
    
    count += 1

plt.xticks(space, tickFeat, rotation = 'vertical',  fontsize =12)
plt.savefig('normalised/ml_photo')
plt.show()


In [None]:
colors = ['blue', 'green', 'orange', 'red']

plt.figure(figsize=(10, 7))

count = 0
n = 5

space = []
tickFeat = []

for result, model, color in zip(arr_all, models, colors):
    a = np.linspace(n*count, n*(1+count)-2,len(features)) # to get index on the x-axis
    space.extend(a)
    tickFeat.extend(result[0])
    plt.errorbar( a, result[1], result[3], fmt='o', label =model[1], color = color)
    plt.title( "F1 Score on validation dataset for different features with the SD", fontweight ='bold', fontsize =12)
    plt.ylabel("F1 score(validation data)", fontweight = 'bold', fontsize =12)
    plt.ylim(.80, 1)
    plt.legend(loc = 'lower left')
    
    count += 1

plt.xticks(space, tickFeat, rotation = 'vertical',  fontsize =12)
plt.show()



In [None]:
colors = ['blue', 'green', 'orange', 'red']

plt.figure(figsize=(10, 7))

count = 0
n = 5

space = []
tickFeat = []

for result, model, color in zip(arr_all, models, colors):
    a = np.linspace(n*count, n*(1+count)-2,len(features)) # to get index on the x-axis
    space.extend(a)
    tickFeat.extend(result[0])
    plt.errorbar( a, result[2], result[4], fmt='o', label =model[1], color = color)
    plt.title( "F1 Score on unseen test dataset for different features with the SD", fontweight ='bold', fontsize =12)
    plt.ylabel("F1 score(test data)", fontweight = 'bold', fontsize =12)
    plt.ylim(.80, 1)
    plt.legend(loc = 'lower left')
    
    count += 1

plt.xticks(space, tickFeat, rotation = 'vertical',  fontsize =12)
plt.show()



In [None]:
import csv

colors = ['blue', 'green', 'orange', 'red']

count = 0
n = 5

space = []
tickFeat = []

# Prepare a list to hold rows for the CSV
csv_rows = []

for result, model, color in zip(arr_all, models, colors):
    a = np.linspace(n * count, n * (1 + count) - 2, len(features))  # same x-axis indexing
    space.extend(a)
    tickFeat.extend(result[0])

    for xi, feature_name, f1_score, sd in zip(a, result[0], result[2], result[4]):
        csv_rows.append([feature_name, xi, f1_score, sd, model[1]])

    count += 1

# Write to CSV
with open('normalised/model_f1_scores_zscales.csv', mode='w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['Feature', 'X_Position', 'F1_Score', 'SD', 'Model'])
    writer.writerows(csv_rows)

print("CSV file 'model_f1_scores.csv' has been written.")


# BASELINE

In [None]:
### Saving the Result
def arrays_to_csv(array1, array2, column_name, output_file='output.csv'):
    """
    Creates a DataFrame from two arrays and writes it to a CSV file.
    
    Parameters:
    - array1: First array (will become first column)
    - array2: Second array (will become second column)
    - column_name: Tuple of column names (e.g., ('Column1', 'Column2'))
    - output_file: Name of the output CSV file (default: 'output.csv')
    
    Returns:
    - None (writes file to disk)
    """
    # Create DataFrame from the arrays
    df = pd.DataFrame({
        column_name[0]: array1,
        column_name[1]: array2
    })
    
    # Write to CSV
    df.to_csv(output_file, index=False)
    print(f"Successfully wrote data to {output_file}")

In [None]:
# column_names = ['f1','err']

# arrays_to_csv(result[2], result[4], column_names, output_file='normalised/f1_normalised.csv')

In [None]:
print("DOne")

# DONE