# 1. Forest Classification and Model Selection

## Introduction

In this notebook, we will explore forest classification using ensemble methods and metric models. We will also perform model selection to identify the best-performing model based on various evaluation metrics.

## Objectives

1. **Data Preparation**: Load and preprocess the dataset.
2. **Model Training**: Train k-Nearest Neighbours, Support Vector (SVC), Random Forest, and XGBoost classifiers.
3. **Model Evaluation**: Evaluate the performance of each model using metrics such as accuracy, precision, recall, and F1-score.
4. **Hyperparameter Tuning**: Use cross-validated randomized search (`RandomizedSearchCV`) to optimize model parameters.
5. **Model Selection**: Compare the models and select the best one based on evaluation metrics.

In [None]:
# # !pip3 -q install yellowbrick
# !pip3 -q install imblearn
# !pip3 -q install scienceplots
# !pip3 -q install xgboost
# !pip3 -q install seaborn

In [8]:
#libraries
import os
import traceback

import geopandas as gpd
import numpy as np
import pandas as pd

#clustering
from shapely import affinity


from sklearn.decomposition import PCA

#test/train split and hyperparameters optimisation
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GroupShuffleSplit

#ML
from sklearn import preprocessing
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, cohen_kappa_score
from imblearn.over_sampling import SMOTE
from tqdm import tqdm

#xgb
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

#stats
import scipy.stats

#plotting
import matplotlib.pyplot as plt
import seaborn as sns


# Display names for the numeric class codes; looked up when labelling
# per-class metrics and confusion-matrix axes.
dict_normal_names = {
    7: "Pine",
    5: "Aspen",
    4: "Larch",
    1: "Birch",
    6: "Silver fir",
    15: "Burnt forest",
    13: "Deforestation",
    14: "Grass",
    12: "Soil",
    16: "Swamp",
    11: "Water body",
    17: "Settlements",
}

# Hex colour palette (11 entries, one fewer than the 12 class names above).
colors = [
    "#117733",
    "#50CE57",
    "#23A28F",
    "#5BD0AE",
    "#88CCEE",
    "#92462D",
    "#DE7486",
    "#DDCC77",
    "#AA4499",
    "#0f62fe",
    "#be95ff",
]

#model saving
from joblib import dump, load

In [9]:
import warnings
warnings.filterwarnings('ignore')

## Helpers

In [71]:
def init_models(n_jobs: int = 6):
    """Build a fresh, unfitted estimator for every supported model name.

    Parameters
    ----------
    n_jobs : int, default 6
        Worker count handed to the estimators that accept ``n_jobs``
        (kNN, RandomForest, XGB). ``SVC`` has no such argument.

    Returns
    -------
    dict
        Maps the model names ``"kNN"``, ``"SVC"``, ``"RandomForest"`` and
        ``"XGB"`` to new estimator instances.
    """
    return {
        "kNN": KNeighborsClassifier(n_jobs=n_jobs),
        # probability=True is required so the fitted SVC exposes predict_proba.
        "SVC": SVC(probability=True),
        "RandomForest": RandomForestClassifier(bootstrap=True, n_jobs=n_jobs),
        "XGB": xgb.XGBClassifier(n_jobs=n_jobs),
    }


def get_predictions(data,
                    model,
                    param_grid,
                    target_column: str = 'class',
                    stratify_column: str = 'key',
                    to_remove_columns: list = ['key'],
                    test_size: float=0.3,
                    smote_balance: bool=True,
                    cv: int=5,
                    n_iter_search: int=15,
                    label_encoder: bool=False,
                    verbose: int = 0,
                    n_jobs_cv=None,
                    random_state=None):
    """Split by group, tune one model with randomized search, and predict on the test part.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``target_column``, ``stratify_column`` and the feature columns.
    model : str
        Key of the estimator in ``init_models()``.
    param_grid : dict
        Parameter distributions passed to ``RandomizedSearchCV``.
    target_column : str
        Column holding the class label.
    stratify_column : str
        Column used as the *group* for ``GroupShuffleSplit``: rows sharing a value
        never end up on both sides of the split.
    to_remove_columns : list
        Columns dropped from train and test after the split (never mutated here).
    test_size : float
        Share of groups assigned to the test part.
    smote_balance : bool
        When truthy, the training part is oversampled with ``SMOTE(random_state=42)``.
    cv, n_iter_search, verbose
        Forwarded to ``RandomizedSearchCV`` (``cv``, ``n_iter``, ``verbose``).
    label_encoder : bool
        When truthy, labels are encoded with ``LabelEncoder`` before fitting and
        the predictions are decoded back to the original labels.
    n_jobs_cv : int or None
        ``n_jobs`` of the search. ``None`` falls back to the notebook-level global
        ``N_JOBS_CV`` if it exists, otherwise to 1.
    random_state : int or None
        Seeds the group split and the randomized search; ``None`` keeps both random.

    Returns
    -------
    dict
        Keys ``'model'``, ``'X_train data'``, ``'y train data'``, ``'X test data'``,
        ``'y test data'``, ``'y predicted'`` and ``'y proba'``.

    Raises
    ------
    KeyError
        If 41 consecutive splits fail to put every class into both train and test,
        or if ``model`` is not a key of ``init_models()``.
    """
    # Draw group-wise splits until every class is present on both sides.
    max_attempts = 41
    all_classes = data[target_column].nunique()
    splitter = GroupShuffleSplit(test_size=test_size,
                                 n_splits=max_attempts,
                                 random_state=random_state)
    candidate_splits = splitter.split(data, groups=data[stratify_column])
    for n, (train_inds, test_inds) in enumerate(candidate_splits, start=1):
        # Pixels from the same plot must stay together in train or in test.
        train = data.iloc[train_inds]
        test = data.iloc[test_inds]
        train_classes = train[target_column].nunique()
        test_classes = test[target_column].nunique()
        if train_classes == test_classes == all_classes:
            break
    else:
        print(f'N - {n}')
        msg= f'Train - {train_classes}, Test = {test_classes}, All - {all_classes}'
        print(msg)
        raise KeyError('Problems in train/test split')

    train = train.drop(columns=to_remove_columns)
    test = test.drop(columns=to_remove_columns)

    # Everything except the target is a feature.
    feature_mask = train.columns != target_column
    feature_names = train.columns[feature_mask].tolist()
    X_train = train.loc[:, feature_mask]
    y_train = train[target_column]
    X_test = test.loc[:, feature_mask]
    y_test = test[target_column]

    # Oversample the training part only; the test part keeps its class balance.
    if smote_balance:
        smote = SMOTE(random_state = 42)
        X_resampled, y_train = smote.fit_resample(X_train, y_train)
        X_train = pd.DataFrame(X_resampled, columns=feature_names)

    if n_jobs_cv is None:
        # Backward compatibility: callers used to set the N_JOBS_CV global instead.
        n_jobs_cv = globals().get('N_JOBS_CV', 1)

    estimator = init_models()[model]
    gs = RandomizedSearchCV(estimator=estimator,
                            param_distributions = param_grid,
                            n_iter = n_iter_search,
                            cv = cv,
                            scoring= 'f1_weighted',
                            verbose=verbose,
                            n_jobs = n_jobs_cv,
                            random_state=random_state)

    # Optionally fit on encoded labels and decode the predictions afterwards.
    le = LabelEncoder() if label_encoder else None
    y_fit = le.fit_transform(y_train) if le is not None else y_train
    gs.fit(X_train, y_fit)

    model_fit = gs.best_estimator_
    y_pred = model_fit.predict(X_test)
    y_proba = model_fit.predict_proba(X_test)
    if le is not None:
        y_pred = le.inverse_transform(y_pred)

    return {'model': model_fit,
            'X_train data': X_train,
            'y train data': y_train,
            'X test data': X_test,
            'y test data': y_test,
            'y predicted': y_pred,
            'y proba': y_proba}

In [11]:
def metrics_description(y_true, y_pred, 
                        metrics_by_class: bool=True, 
                        confusion_matrix_multiclass_on: bool=True,
                        binary_matrix_on: bool=False):
    """Print overall scores and optionally show per-class metrics and confusion matrices.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted class labels.
    metrics_by_class : bool
        When truthy, display a table of F1 / precision / recall per class.
    confusion_matrix_multiclass_on : bool
        When truthy, draw the confusion matrix as a labelled heatmap.
    binary_matrix_on : bool
        When truthy, print the raw confusion matrix and plot it with
        ``ConfusionMatrixDisplay``.

    Returns
    -------
    None
    """
    print('Accuracy score: %.2f%%' %(accuracy_score(y_true, y_pred)*100))  
    print('Precision score: %.2f%%' % (precision_score(y_true, y_pred, average= 'weighted')*100))
    print('Recall score: %.2f%%' % (recall_score(y_true, y_pred, average= 'weighted')*100))
    print('F1-Score: %.2f%%'%(f1_score(y_true, y_pred, average = 'weighted')*100))
    print('Kappa score: %.2f%%'%(cohen_kappa_score(y_true, y_pred)*100))

    # Per-class metrics and the confusion matrix cover the union of observed and
    # predicted labels, so the display names must be built from that same union
    # (using y_true alone breaks when a class shows up only in y_pred).
    labels = np.union1d(y_true, y_pred)
    class_names = [dict_normal_names.get(label, str(label)) for label in labels]

    # Table with metrics by class.
    if metrics_by_class:
        per_class_table = pd.DataFrame(
                {
                    'names': class_names,
                    'f1_scores': f1_score(y_true, y_pred, labels=labels,
                               average=None).round(2).tolist(),
                    'precision': precision_score(y_true, y_pred, labels=labels,
                                       average=None).round(2).tolist(),
                    'recall':recall_score(y_true, y_pred, labels=labels,
                                       average=None).round(2).tolist()
                }
            )
        display(per_class_table)

    # Multiclass confusion matrix as a heatmap.
    if confusion_matrix_multiclass_on:
        data = confusion_matrix(y_true, y_pred, labels=labels)
        df_cm = pd.DataFrame(data, columns=class_names, index=class_names)
        df_cm.index.name = 'Actual'
        df_cm.columns.name = 'Predicted'

        f, ax = plt.subplots(figsize=(6, 10))
        cmap = sns.cubehelix_palette(light=1, as_cmap=True)

        sns.heatmap(df_cm, cbar=False, annot=True, cmap=cmap, square=True, fmt='.0f',
                    annot_kws={'size': 10}, ax=ax)
        ax.set_title('Actuals vs Predicted')
        plt.show()

    # Raw confusion matrix (intended for the binary case).
    if binary_matrix_on:
        cm = confusion_matrix(y_true, y_pred)
        print('Confusion matrix\n\n', cm)
        ConfusionMatrixDisplay(confusion_matrix=cm).plot();

In [12]:
#getting dataset with metrics by class for each random prediction
def get_classes_metrics(models_vector):
    """Collect per-class F1, precision and recall for every run in ``models_vector``.

    Parameters
    ----------
    models_vector : iterable of dict
        Each item needs the keys ``'y predicted'`` and ``'y test data'``.

    Returns
    -------
    pandas.DataFrame
        One row per (run, class) with the columns ``iteration`` (1-based run
        number), ``names``, ``f1_scores``, ``precision_list`` and ``recall``.
        An empty DataFrame when ``models_vector`` is empty.
    """
    per_run_frames = []

    for iteration, run in enumerate(models_vector, start=1):
        pred = run['y predicted']   # predicted values
        true = run['y test data']   # labels of the matching random test set

        # The scores are computed over the union of observed and predicted
        # labels; naming the rows from the same union keeps lengths aligned.
        labels = np.union1d(true, pred)

        per_run_frames.append(pd.DataFrame(
            {
                'iteration': [iteration] * len(labels),
                'names': [dict_normal_names.get(label, str(label)) for label in labels],
                'f1_scores': f1_score(true, pred, labels=labels,
                                      average=None).round(2).tolist(),
                'precision_list': precision_score(true, pred, labels=labels,
                                                  average=None).round(2).tolist(),
                'recall': recall_score(true, pred, labels=labels,
                                       average=None).round(2).tolist(),
            }
        ))

    if not per_run_frames:
        return pd.DataFrame()
    # Concatenate once instead of growing a frame inside the loop.
    return pd.concat(per_run_frames, ignore_index=True)

In [13]:
def get_metrics_average(models_vector):
    """Compute weighted F1, precision and recall for every run in ``models_vector``.

    Parameters
    ----------
    models_vector : iterable of dict
        Each item needs the keys ``'y predicted'`` and ``'y test data'``.

    Returns
    -------
    pandas.DataFrame
        One row per run with the columns ``iteration`` (1-based run number),
        ``f1_scores``, ``precision_list`` and ``recall``; scores are rounded to
        two decimals. An empty DataFrame when ``models_vector`` is empty.
    """
    records = []
    for iteration, run in enumerate(models_vector, start=1):
        pred = run['y predicted']   # predicted values
        true = run['y test data']   # labels of the matching random test set
        records.append(
            {
                'iteration': iteration,
                'f1_scores': f1_score(true, pred,
                                      average='weighted').round(2).tolist(),
                'precision_list': precision_score(true, pred,
                                                  average='weighted').round(2).tolist(),
                'recall': recall_score(true, pred,
                                       average='weighted').round(2).tolist(),
            }
        )

    if not records:
        return pd.DataFrame()
    # Build the frame once; growing it with concat per run is quadratic.
    return pd.DataFrame(records)

In [14]:
def get_best_model(datavector_models):
    """Return the fitted model of the run that ranks first by weighted F1-score.

    ``datavector_models`` is a list of result dicts (each with a ``'model'``
    entry); runs are ranked by the ``f1_scores`` column of
    ``get_metrics_average`` in descending order.
    """
    ranking = get_metrics_average(datavector_models).sort_values(by='f1_scores',
                                                                 ascending=False)
    # Index labels of the ranking are the run positions in the input list.
    top_position = ranking.index[0]
    return datavector_models[top_position]['model']

In [15]:
def get_worst_model(datavector_models):
    """Return the fitted model of the run that ranks last by weighted F1-score.

    ``datavector_models`` is a list of result dicts (each with a ``'model'``
    entry); runs are ranked by the ``f1_scores`` column of
    ``get_metrics_average`` in descending order.
    """
    ranking = get_metrics_average(datavector_models).sort_values(by='f1_scores',
                                                                 ascending=False)
    # Index labels of the ranking are the run positions in the input list.
    bottom_position = ranking.index[-1]
    return datavector_models[bottom_position]['model']

# Data preparation

In [16]:
def get_scaled_data(path:str, cols_remove:list=['key', 'class']):
    """Read a dataset CSV, drop rows of class 2, and min-max scale the feature columns.

    Parameters
    ----------
    path : str
        CSV file to read; it must contain a ``class`` column.
    cols_remove : list
        Columns left unscaled (never mutated here). Every entry must exist in
        the file, otherwise ``DataFrame.drop`` raises ``KeyError``.

    Returns
    -------
    (pandas.DataFrame, MinMaxScaler)
        The filtered frame with every column outside ``cols_remove`` replaced
        by its scaled values, and the fitted scaler.
    """
    raw = pd.read_csv(path)
    # Copy the filtered rows so the assignment below writes to an independent
    # frame rather than to a slice of `raw`.
    df = raw.loc[raw['class'] != 2].copy()

    features = df.drop(columns=cols_remove)
    target_cols = list(features.columns)

    # NOTE(review): the scaler is fitted on every remaining row of the file,
    # i.e. before any train/test split is made.
    min_max_scaler = preprocessing.MinMaxScaler()
    df[target_cols] = min_max_scaler.fit_transform(features.values)
    return df, min_max_scaler
# Load and scale one dataset at notebook level. The later loop cells assign
# `folder`, `df_scaled` and `min_max_scaler` again for the dataset they process.
folder = '../shape_data/filtered_datasets_2024/'
df_scaled, min_max_scaler = get_scaled_data(os.path.join(folder, 'df3_notfiltered.csv'))

In [18]:
def get_random_forest():
    """Return the randomized-search space for the random forest under the key ``"grid"``."""
    return {
        "grid": {
            'n_estimators': np.arange(100, 200, 20),
            'max_depth': np.arange(10, 110, 11),
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4],
        }
    }

def get_svm():
    """Return the randomized-search space for the SVC under the key ``"grid"``.

    Only ``C`` is searched; ``gamma`` and ``kernel`` are not part of the grid.
    """
    c_values = [0.01, 0.1, 10, 100]
    return {"grid": {'C': c_values}}
def get_KNN():
    """Return the randomized-search space for the kNN classifier under the key ``"grid"``."""
    return {
        "grid": {
            'n_neighbors': np.arange(4, 15, 2),
            'weights': ['uniform', 'distance'],
            'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski'],
        }
    }

def get_XGB():
    """Return the randomized-search space for the XGBoost classifier under the key ``"grid"``."""
    search_space = {}
    search_space['max_depth'] = [3, 6, 10]
    search_space["min_child_weight"] = [0.5, 1, 2]
    search_space['n_estimators'] = np.arange(10, 80, 20)
    search_space['colsample_bytree'] = [0.3, 0.7, 1]
    return {"grid": search_space}



## Model training

In [19]:
def model_loop(df_forest: pd.DataFrame, model: str,  param_grid: dict, smote_balance: bool, verbose: int = 0,
               n_runs: int = 15):
    """Repeat ``get_predictions`` on fresh random splits and collect the results.

    Parameters
    ----------
    df_forest : pandas.DataFrame
        Data passed to ``get_predictions``; needs the ``class`` and ``key`` columns.
    model : str
        Model name; ``'XGB'`` switches label encoding on.
    param_grid : dict
        Search space forwarded to ``get_predictions``.
    smote_balance : bool
        Whether the training part is balanced with SMOTE.
    verbose : int
        Verbosity forwarded to the hyper-parameter search.
    n_runs : int, default 15
        Number of repetitions (previously a fixed 15).

    Returns
    -------
    list of dict
        One ``get_predictions`` result per repetition, in run order.
    """
    datavector = []
    for i in range(n_runs):
        print(f"{model} ---- {i}")
        run_result = get_predictions(data = df_forest,
                    model = model,
                    param_grid = param_grid,
                    target_column = 'class',
                    to_remove_columns=['key'],
                    smote_balance=smote_balance,
                    cv=5, 
                    n_iter_search=20, 
                    label_encoder=(model == 'XGB'),
                    verbose=verbose)
        datavector.append(run_result)
    return datavector

In [20]:
def check_computed_metrics(metrics: pd.DataFrame, dataset: str, model: str,
                           expected_rows: int = 150):
    """Tell whether the metrics of a (dataset, model) pair are already complete.

    Parameters
    ----------
    metrics : pandas.DataFrame
        Accumulated per-class metrics; rows are matched on the ``fname`` and
        ``model`` columns.
    dataset : str
        Value to match in ``fname``.
    model : str
        Value to match in ``model``.
    expected_rows : int, default 150
        Row count that marks the pair as complete.
        NOTE(review): 150 presumably equals 15 runs x 2 SMOTE settings x 5
        classes -- confirm for datasets with a different number of classes.

    Returns
    -------
    bool
        ``True`` only when exactly ``expected_rows`` rows match; ``False`` when
        the frame lacks the ``fname`` or ``model`` column.
    """
    if 'fname' not in metrics or 'model' not in metrics:
        return False
    matching = (metrics['fname'] == dataset) & (metrics['model'] == model)
    return int(matching.sum()) == expected_rows

In [21]:
# Locations of the cached metric tables written by the training cell.
metric_fname = '../shape_data/metric_results_v7.csv'
metric_stats_fname = '../shape_data/metric_stats_v7.csv'

if not os.path.isfile(metric_fname):
    # No cache yet: start with empty containers.
    metric_container = pd.DataFrame()
    metric_stats_container = pd.DataFrame()
else:
    # NOTE(review): only the per-class file is checked; a missing stats file
    # makes the second read raise FileNotFoundError.
    print('Use cached metrics')
    metric_container = pd.read_csv(metric_fname, index_col=0)
    metric_stats_container = pd.read_csv(metric_stats_fname, index_col=0)

In [69]:
folder = '../shape_data/filtered_datasets_2024/'
# Keep only CSV files: os.listdir also returns sub-folders and other stray
# entries (e.g. a Jupyter checkpoint folder) that pd.read_csv cannot load.
datasets = [fname for fname in os.listdir(folder) if fname.endswith('.csv')]
problems = []

# Create the output folder up front, so a missing directory cannot fail the
# run after the long training of the first model has already finished.
best_models_dir = '../models/best_models'
os.makedirs(best_models_dir, exist_ok=True)

for dataset in sorted(datasets):
    print(dataset)
    scale = dataset[2]  # third character of the file name, e.g. '3' in 'df3_...'
    core = dataset.split('.')[0]
    df_scaled, min_max_scaler = get_scaled_data(os.path.join(folder, dataset))
    # Class codes below 10 are the tree-species classes in dict_normal_names.
    mask_forest = df_scaled['class']<10
    df_forest = df_scaled.loc[mask_forest]
    models = {
            'RandomForest': get_random_forest(),
             "SVC":get_svm(),
            "kNN":get_KNN(),
            "XGB":get_XGB()
             }
    for model, param_grid in models.items():
        # Notebook-level global read by get_predictions as the n_jobs of the search.
        N_JOBS_CV = 4 if model=='SVC' else 1

        status_computed = check_computed_metrics(metric_container,dataset,model)
        if status_computed==True:
            print(f'Computed prev metrics for: {dataset} & {model}')
            continue

        # Drop leftovers of an incomplete earlier run for this pair. Without
        # this, re-running after an interruption appends a second copy, and
        # the row count can then never match the completeness check again.
        if 'fname' in metric_container:
            stale = (metric_container['fname']==dataset) & (metric_container['model']==model)
            metric_container = metric_container.loc[~stale]
        if 'fname' in metric_stats_container:
            stale = (metric_stats_container['fname']==dataset) & (metric_stats_container['model']==model)
            metric_stats_container = metric_stats_container.loc[~stale]

        for smote_balance in [True, False]:
            print(model,scale, 'SMOTE', smote_balance)
            datavector = model_loop(df_forest,
                                    model,
                                    param_grid['grid'],
                                    smote_balance,
                                    verbose = 0)

            # Metrics related to forest types (one row per run and class)
            model_metrics = get_classes_metrics(datavector)
            model_metrics['model'] = model
            model_metrics['smote_balance'] = smote_balance
            model_metrics['scale'] = scale
            model_metrics['fname'] = dataset
            model_metrics['experiment_status'] = 'Done'
            metric_container = pd.concat([metric_container, model_metrics], axis=0)

            # Metrics related to forest (one row per run, weighted averages)
            metricts_stats = get_metrics_average(datavector)
            metricts_stats['model'] = model
            metricts_stats['smote_balance'] = smote_balance
            metricts_stats['scale'] = scale
            metricts_stats['fname'] = dataset
            metricts_stats['experiment_status'] = 'Done'
            metric_stats_container = pd.concat([metric_stats_container, metricts_stats], axis=0)

            # NOTE(review): the path does not include the SMOTE setting, so the
            # model saved for smote_balance=False overwrites the one saved for
            # smote_balance=True -- confirm this is intended.
            best_model = get_best_model(datavector)
            model_path = os.path.join(best_models_dir, f'{model}_{core}.joblib')
            dump(best_model, model_path)

        # Persist after each finished model so an interruption loses at most one model.
        metric_container.to_csv(metric_fname)
        metric_stats_container.to_csv(metric_stats_fname)

df1_filtered_modified.csv
RandomForest 1 SMOTE True
RandomForest ---- 0
RandomForest ---- 1


KeyboardInterrupt: 

## End

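## Conformal prediction

Train each model once on `df3_notfiltered.csv` (without SMOTE) and export the test-set features, class probabilities, observed classes, and predicted classes to CSV for conformal prediction.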


In [73]:
folder = '../shape_data/filtered_datasets_2024/'
datasets = os.listdir(folder)
problems = []
dataset = 'df3_notfiltered.csv'
smote_balance = False
# Defined before the loop: the CSV written inside the first iteration needs it.
# (It used to be assigned only after that write, so the file name silently
# depended on a value left over from an earlier cell.)
core = dataset.split('.')[0]
os.makedirs('../models', exist_ok=True)

print(dataset)
scale = dataset[2]
df_scaled, min_max_scaler = get_scaled_data(os.path.join(folder, dataset))
# Class codes below 10 are the tree-species classes in dict_normal_names.
mask_forest = df_scaled['class']<10
df_forest = df_scaled.loc[mask_forest]
models = {
        'RandomForest': get_random_forest(),
            "SVC":get_svm(),
        "kNN":get_KNN(),
        "XGB":get_XGB()
            }
for model, param_grid in models.items():
    # Notebook-level global read by get_predictions as the n_jobs of the search.
    if model=='SVC':
        N_JOBS_CV = 4
    else:
        N_JOBS_CV = 1

    print(model,scale, 'SMOTE', smote_balance)
    verbose = 0
    datavector = []
    # A single run per model is enough here; metrics are not accumulated in this cell.
    for i in range(1):
        print(f"{model} ---- {i}")
        trained_model = get_predictions(data = df_forest,
                    model = model,
                    param_grid = param_grid['grid'],
                    target_column = 'class',
                    to_remove_columns=['key'],
                    smote_balance=smote_balance,
                    cv=5,
                    n_iter_search=20,
                    label_encoder=True if model == 'XGB' else False,
                    verbose=verbose)
        datavector.append(trained_model)

    # Name the probability columns after the original class codes. The sorted
    # unique training labels are used instead of `model.classes_`, because the
    # XGB model is fitted on label-encoded targets and its `classes_` would be
    # the encoded indices rather than the class codes.
    class_labels = np.unique(trained_model['y train data'])
    col_names = [f'class_prob_{name}' for name in class_labels]
    probs_df=pd.DataFrame(trained_model['y proba'], columns=col_names)
    probs_df['observed_class']=trained_model['y test data'].reset_index(drop=True)
    probs_df['predicted_class']=trained_model['y predicted']
    all_df= pd.concat([trained_model['X test data'].reset_index(drop=True), probs_df], axis=1)

    all_df.to_csv(f'conformal_predictions_{model}_{core}.csv', index=False)

    best_model = get_best_model(datavector)
    model_path = os.path.join(f'../models/{model}_{core}.joblib')
    dump(best_model, model_path)

df3_notfiltered.csv
RandomForest 3 SMOTE False
RandomForest ---- 0
SVC 3 SMOTE False
SVC ---- 0
kNN 3 SMOTE False
kNN ---- 0
XGB 3 SMOTE False
XGB ---- 0


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,...,energy2,homogeneity1,homogeneity2,class_prob_1,class_prob_4,class_prob_5,class_prob_6,class_prob_7,observed_class,predicted_class
0,0.202290,0.069881,0.102996,0.073466,0.286765,0.771346,0.962482,0.921642,1.0,0.780807,...,0.218782,0.402796,1.0,0.100000,0.005556,0.727778,0.0,0.166667,5,5
1,0.178435,0.063435,0.108614,0.072385,0.287059,0.783846,0.989899,0.968195,1.0,0.767176,...,0.217343,0.383868,1.0,0.038889,0.000000,0.833333,0.0,0.127778,5,5
2,0.202290,0.073749,0.104401,0.077787,0.288235,0.771154,0.955087,0.836176,1.0,0.780807,...,0.222542,0.437208,1.0,0.172222,0.000000,0.761111,0.0,0.066667,5,5
3,0.202290,0.071429,0.109082,0.074546,0.288235,0.771154,0.955087,0.927861,1.0,0.780807,...,0.221511,0.413216,1.0,0.116667,0.005556,0.800000,0.0,0.077778,5,5
4,0.178435,0.062919,0.106742,0.071737,0.295294,0.843462,1.000000,0.993426,1.0,0.767176,...,0.219992,0.391755,1.0,0.050000,0.000000,0.883333,0.0,0.066667,5,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1418,0.259542,0.072202,0.104401,0.104149,0.321176,0.806923,0.979076,0.961976,1.0,0.820065,...,0.267391,0.409244,1.0,0.200000,0.000000,0.800000,0.0,0.000000,5,5
1419,0.259542,0.060340,0.101592,0.103717,0.320000,0.806538,0.975469,0.944208,1.0,0.820065,...,0.268392,0.426828,1.0,0.294444,0.000000,0.705556,0.0,0.000000,5,5
1420,0.259542,0.061372,0.117978,0.105445,0.335882,0.812692,0.981602,0.950249,1.0,0.820065,...,0.263709,0.393910,1.0,0.211111,0.000000,0.788889,0.0,0.000000,5,5
1421,0.259542,0.063950,0.111891,0.104149,0.330588,0.812692,0.977273,0.948827,1.0,0.820065,...,0.267171,0.388909,1.0,0.155556,0.000000,0.838889,0.0,0.005556,5,5


In [29]:
# Show the test-set feature table stored in the most recent `trained_model` result.
trained_model['X test data']

Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,...,contrast1,contrast2,correlation1,correlation2,dissimilarity1,dissimilarity2,energy1,energy2,homogeneity1,homogeneity2
0,0.182252,0.054409,0.088483,0.067632,0.260882,0.549615,0.628247,0.607676,0.699432,0.620502,...,0.199727,0.349690,0.281767,0.0,1.0,0.199181,0.050298,0.197563,0.348391,1.0
1,0.182252,0.058277,0.090824,0.072169,0.260882,0.549615,0.628247,0.579780,0.699432,0.620502,...,0.183075,0.328553,0.284199,0.0,1.0,0.182518,0.049952,0.199563,0.327212,1.0
2,0.182252,0.054409,0.092463,0.066768,0.255294,0.545385,0.625722,0.591862,0.687544,0.620502,...,0.200993,0.351267,0.280697,0.0,1.0,0.200448,0.050458,0.196698,0.349971,1.0
3,0.182252,0.058277,0.107912,0.076059,0.260882,0.549615,0.628247,0.626155,0.699432,0.620502,...,0.196611,0.345791,0.282156,0.0,1.0,0.196063,0.050241,0.197880,0.344484,1.0
4,0.182252,0.063950,0.109785,0.082757,0.260882,0.549615,0.628247,0.655117,0.699432,0.620502,...,0.179764,0.324259,0.284686,0.0,1.0,0.179205,0.049881,0.199960,0.322909,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4732,0.259542,0.072202,0.104401,0.104149,0.321176,0.806923,0.979076,0.961976,1.000000,0.820065,...,0.249781,0.409207,0.365149,0.0,1.0,0.250341,0.039020,0.267391,0.409244,1.0
4733,0.259542,0.060340,0.101592,0.103717,0.320000,0.806538,0.975469,0.944208,1.000000,0.820065,...,0.266530,0.427971,0.366316,0.0,1.0,0.266030,0.038885,0.268392,0.426828,1.0
4734,0.259542,0.061372,0.117978,0.105445,0.335882,0.812692,0.981602,0.950249,1.000000,0.820065,...,0.236050,0.393421,0.360868,0.0,1.0,0.236991,0.039552,0.263709,0.393910,1.0
4735,0.259542,0.063950,0.111891,0.104149,0.330588,0.812692,0.977273,0.948827,1.000000,0.820065,...,0.232058,0.388759,0.364857,0.0,1.0,0.232703,0.039069,0.267171,0.388909,1.0


In [31]:
# Show the class labels known to the fitted estimator; their order matches the
# columns of the `y proba` array in the same result dict.
trained_model['model'].classes_

array([1, 4, 5, 6, 7])

In [None]:
# Rebuild and re-export the probability table for the result currently held in
# `trained_model` (uses `model` and `core` from the cell that trained it).
# Columns are named after the sorted unique training labels rather than
# `model.classes_`: the XGB model is fitted on label-encoded targets, so its
# `classes_` would be encoded indices instead of the original class codes.
class_labels = np.unique(trained_model['y train data'])
col_names = [f'class_prob_{name}' for name in class_labels]
probs_df=pd.DataFrame(trained_model['y proba'], columns=col_names)
probs_df['observed_class']=trained_model['y test data'].reset_index(drop=True)
probs_df['predicted_class']=trained_model['y predicted']
all_df= pd.concat([trained_model['X test data'].reset_index(drop=True), probs_df], axis=1)

all_df.to_csv(f'conformal_predictions_{model}_{core}.csv', index=False)

In [62]:
# Weighted F1-score of the predictions stored in `all_df` (observed vs predicted class).
f1_score(all_df['observed_class'], all_df['predicted_class'], average='weighted')

0.5558247693940066

In [64]:
# Show which model name the loop variable `model` currently holds.
# NOTE(review): the saved output is 'RandomForest', although a complete run of
# the loop leaves `model` at its last key ('XGB') -- the output looks like it
# comes from a partial or out-of-order execution.
model

'RandomForest'

In [39]:
# Show the observed test-set labels; the Series keeps the original row index,
# which is why the export code resets it before joining.
trained_model['y test data']

0       7
1       7
2       7
3       7
4       7
       ..
4732    5
4733    5
4734    5
4735    5
4736    5
Name: class, Length: 1451, dtype: int64

In [None]:
# Dump the whole result dict (fitted model, train/test data, predictions, probabilities).
trained_model