In [None]:
from sources.ml_f1 import*
import matplotlib.pyplot as plt
import numpy as np
from sklearn import metrics
from sklearn.metrics import accuracy_score as accuracy
from sklearn.metrics import f1_score as f1
from sklearn.metrics import recall_score as recall
from sklearn.metrics import precision_score as precision
from sklearn.model_selection import train_test_split
from scipy.spatial import distance
import pandas as pd

# ML models  
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
import xgboost
from xgboost import plot_importance
import json

# plot style
import seaborn as sns
# Apply seaborn's "whitegrid" style globally.
# A disabled variant also passed {"axes.facecolor": ".9"} as the rc override.
sns.set_style("whitegrid")
# Alternative matplotlib style, currently disabled:
# plt.style.use('seaborn-ticks')


In [None]:
# Read the review dataset, which is stored on disk as JSON text.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's locale default.
with open("normalised/new_final_review_data.txt", encoding="utf-8") as f:
    data = f.read()

print("Data type before reconstruction : ", type(data))

# Parse the JSON text into Python objects.
ml_dicts = json.loads(data)

print("Data type after reconstruction : ", type(ml_dicts))

In [None]:
# Classifiers compared in this notebook, all with library defaults unless noted.
lr_model = LogisticRegression()                      # logistic regression
knn_model = KNeighborsClassifier()                   # k-nearest neighbours
svm_model = SVC()                                    # support vector classifier
rf_model = RandomForestClassifier(random_state=1)    # random forest, fixed seed
# NOTE(review): 'rmse' is a regression metric; for a classifier something like
# 'logloss' looks more appropriate — confirm before changing.
xgb_model = xgboost.XGBClassifier(
    use_label_encoder=False,
    eval_metric='rmse',
    n_jobs=-1,
)

# [estimator, short name] pairs so later cells can loop over them in a fixed order.
models = [
    [estimator, short_name]
    for estimator, short_name in (
        (lr_model, 'lr'),
        (knn_model, 'knn'),
        (svm_model, 'svm'),
        (rf_model, 'rf'),
        (xgb_model, 'xgb'),
    )
]

In [None]:
# Show the top-level keys of the loaded data.
ml_dicts.keys()

In [None]:
# Evaluate every model once per split value, in the order 0.8, 0.6, 0.4, 0.2.
# NOTE(review): the third argument is presumably the split fraction used by
# result_per_split (not defined in this notebook; likely from sources.ml_f1) — confirm there.
res08, res06, res04, res02 = [
    result_per_split(ml_dicts, models, split_value)
    for split_value in (0.8, 0.6, 0.4, 0.2)
]

all_res = [res08, res06, res04, res02]

In [None]:
# Nested feature sets: each entry extends the previous one by a single column,
# so the list is built as successive prefixes of one ordered column list.
_feature_order = ['qir', 'class_star', 'log(S8/S45)', 'log(S58/S36)', 'Mstar']
features = [_feature_order[:size] for size in range(1, len(_feature_order) + 1)]
# Disabled variants kept for reference:
#   ['qir', 'class_star', 'log(S8/S45)', 'log(S58/S36)', 'Mstar', 'log(S45/S36)']
#   ['qir', 'class_star', 'Mstar', 'log(S45/S36)']

In [None]:
# Display the raw results computed for the 0.8 split.
res08

In [None]:
# One colour per entry in `models`. zip() stops at its shortest argument, so a
# palette shorter than the model list silently drops the trailing models from
# the figure; keep this list at least as long as `models`.
colors = ['blue', 'green', 'orange', 'red', 'purple']

plt.figure(figsize=(10, 7))

n = 5  # width of the x-axis slot reserved for each model

space = []     # x position of every plotted point, across all models
tickFeat = []  # tick label for each x position

for slot, (result, model, color) in enumerate(zip(res08, models, colors)):
    # result[0] holds the tick labels, result[2] the plotted scores and
    # result[4] their error bars.
    scores = result[2]
    # Spread this model's points over [n*slot, n*(slot+1) - 2], leaving a gap
    # before the next model. The point count follows the scores being plotted
    # so x and y always have matching lengths.
    x_positions = np.linspace(n * slot, n * (1 + slot) - 2, len(scores))
    space.extend(x_positions)
    tickFeat.extend(result[0])
    plt.errorbar(x_positions, scores, result[4], fmt='o', label=model[1], color=color)

# Figure-level decoration only needs to be applied once, after all series exist.
plt.title("F1 Score on unseen test dataset for different features with the SD", fontweight='bold', fontsize=12)
plt.ylabel("F1 score(test data)", fontweight='bold', fontsize=12)
plt.ylim(.80, 1)
plt.legend(loc='lower left')

plt.xticks(space, tickFeat, rotation='vertical', fontsize=12)
plt.show()



In [None]:
def create_model_comparison_df_by_name(res_list, train_fractions, models_to_include=('KNN', 'RF'), feature_names_to_include=None):
    """
    Build a table comparing selected models across train fractions and feature sets.

    Parameters:
    res_list: list of result arrays, one per train fraction (e.g. [res02, res04,
        res06, res08]). Each result is indexed by model in the order
        LR, KNN, SVM, RF, XGB; each model entry is indexed as
        [0] feature names, [1] validation scores, [2] test scores,
        [3] validation errors, [4] test errors, with one value per feature set.
    train_fractions: list of train fractions corresponding to res_list
        (e.g. [0.2, 0.4, 0.6, 0.8]); paired with res_list positionally.
    models_to_include: iterable of model names to include in the output
        (any of 'LR', 'KNN', 'SVM', 'RF', 'XGB').
    feature_names_to_include: list of feature names to include
        (e.g., ['F1'], ['F5']), or None for all. A stored name such as
        "0.8, F1" is matched on the part after ", ".

    Returns:
    pandas DataFrame with one row per (train fraction, feature set) and, for
    each requested model, its validation/test score and error columns. When
    nothing matches, an empty DataFrame with those same columns is returned.

    Raises:
    KeyError: if a name in models_to_include is not a known model name.
    """
    # Fixed position of each model inside a result array.
    all_models = ['LR', 'KNN', 'SVM', 'RF', 'XGB']
    model_indices = {model: idx for idx, model in enumerate(all_models)}

    # Materialise once so a one-shot iterable can be reused below, and resolve
    # the positions up front so an unknown name fails before any work is done.
    models_to_include = list(models_to_include)
    selected_models = [(name, model_indices[name]) for name in models_to_include]

    # Output column layout: identifying columns first, then four per model.
    column_order = ['Train Fraction', 'Features']
    for model in models_to_include:
        column_order.extend([
            f'{model} Validation Score',
            f'{model} Test Score',
            f'{model} Val Error',
            f'{model} Test Error'
        ])

    all_data = []

    for res, train_frac in zip(res_list, train_fractions):
        # Feature names are taken from the first model's entry.
        all_feature_names = res[0][0]

        if feature_names_to_include is None:
            feature_indices = range(len(all_feature_names))
        else:
            feature_indices = []
            for i, feature_name in enumerate(all_feature_names):
                # Extract just the feature part (e.g., "F1" from "0.8, F1")
                feature_part = feature_name.split(', ')[1] if ', ' in feature_name else feature_name
                if feature_part in feature_names_to_include:
                    feature_indices.append(i)

        for feature_idx in feature_indices:
            row_data = {'Train Fraction': train_frac, 'Features': all_feature_names[feature_idx]}

            for model_name, model_idx in selected_models:
                model_data = res[model_idx]

                row_data[f'{model_name} Validation Score'] = model_data[1][feature_idx]
                row_data[f'{model_name} Test Score'] = model_data[2][feature_idx]
                row_data[f'{model_name} Val Error'] = model_data[3][feature_idx]
                row_data[f'{model_name} Test Error'] = model_data[4][feature_idx]

            all_data.append(row_data)

    # Passing the column list to the constructor fixes the column order and
    # keeps the columns present even when no rows were collected.
    return pd.DataFrame(all_data, columns=column_order)

# Build the KNN-vs-RF comparison table restricted to the 'F5' feature set.
train_fractions = [0.2, 0.4, 0.6, 0.8]
res_list = [res02, res04, res06, res08]  # same order as train_fractions

df_f5_by_name = create_model_comparison_df_by_name(
    res_list=res_list,
    train_fractions=train_fractions,
    models_to_include=['KNN', 'RF'],
    feature_names_to_include=['F5'],  # select by feature name rather than by index
)
print("F5 by name:")
print(df_f5_by_name.head())

In [None]:
# Write the F5 comparison table to disk, omitting the index column.
output_csv = "model_comparison_original_f5_final_20-08-2025.csv"
df_f5_by_name.to_csv(output_csv, index=False)