In [13]:
import os
import sys
import datetime
from time import gmtime
from time import strftime
from pathlib import Path
from datetime import timedelta  
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from scipy import signal
import pickle
import librosa
import librosa.display
from sklearn.model_selection import train_test_split
from feature_extraction import feature_extraction, extract_statistic
from sklearn.model_selection import LeaveOneGroupOut, KFold, GridSearchCV, PredefinedSplit, GroupKFold

from regression_analysis import random_split_evaluation, independent_split_evaluation, random_baseline_metrics
from regression_analysis import evaluate_model_performance, create_random_feature, clip_based_on_boxes
from regression_analysis import get_errors_random, get_errors_independent, plot_erros 
from regression_analysis import random_split_ranking, independent_split_ranking, plot_top_features, plot_top_features_separate

from regression_analysis import ml_gridsearchcv_kfold, random_split_cv, hive_independent_cv, features_selection_cv, balance_training_data, read_and_plot_results
import scipy
import scipy.signal
import scipy.fftpack

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

%load_ext autoreload
%autoreload 2


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Labels 2021

In [14]:
# Read the 2021 hive-inspection annotations and print a column summary.
inspections_2021_csv = Path("../data/annotations") / "inspections_2021.csv"
data = pd.read_csv(inspections_2021_csv)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 109 entries, 0 to 108
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Date             109 non-null    object 
 1   Tag number       109 non-null    int64  
 2   Colony Size      109 non-null    int64  
 3   Fob 1st          109 non-null    float64
 4   Fob 2nd          102 non-null    float64
 5   Fob 3rd          31 non-null     float64
 6   FoBrood          11 non-null     float64
 7   Queen status     108 non-null    object 
 8   Frames of Honey  63 non-null     float64
 9   Open             107 non-null    object 
 10  Close            107 non-null    object 
 11  Notes            64 non-null     object 
dtypes: float64(5), int64(2), object(5)
memory usage: 10.3+ KB


In [15]:
# Replace every missing value with 0 and parse the inspection dates,
# then list the distinct hive tags present in the annotations.
data = data.fillna(0).assign(Date=lambda frame: pd.to_datetime(frame['Date']))
unique_hives = data['Tag number'].unique()
unique_hives

array([3629,    6, 3631, 3693, 3690, 3691, 3628, 3627, 3692, 3640])

In [16]:
# Split the 2021 inspections into one DataFrame per hive, keyed by tag number.
# Grouping on the bare column name (instead of a one-element list) keeps the
# group keys scalar, so `get_group(tag)` does not hit the deprecated
# "list grouper + scalar key" path in recent pandas versions.
grouped = data.groupby('Tag number')
dict_hives = {tag: grouped.get_group(tag) for tag in unique_hives}

In [17]:
# Expand each hive's inspection records onto a regular 15-minute grid that runs
# from the first inspection to one day after the last one, fill the gaps by
# linear interpolation, and add the total frames-of-bees label `fob`.
for hive in data['Tag number'].unique():
    inspections = dict_hives[hive].set_index('Date')
    idx = pd.date_range(
        inspections.index.min(),
        inspections.index.max() + timedelta(days=1),
        freq="15min",
    )
    dense = inspections.reindex(idx).interpolate(method="linear")
    # `fob` is the sum over the three boxes; it is kept as a float (no rounding).
    dense["fob"] = dense["Fob 1st"] + dense["Fob 2nd"] + dense["Fob 3rd"]
    dict_hives[hive] = dense

# Labels 2022

In [18]:
# Read the 2022 hive-inspection annotations and print a column summary.
inspections_2022_csv = Path("../data/annotations") / "inspections_2022.csv"
data_2022 = pd.read_csv(inspections_2022_csv)
data_2022.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Date           278 non-null    object
 1   Tag number     278 non-null    int64 
 2   Category       274 non-null    object
 3   Action detail  274 non-null    object
 4   Queen status   269 non-null    object
 5   Is alive       278 non-null    int64 
 6   Report notes   175 non-null    object
dtypes: int64(2), object(5)
memory usage: 15.3+ KB


In [19]:
# Parse the timezone-aware timestamp strings, keep only the calendar date and
# use it as the index. With errors='coerce', values that do not match the
# format become NaT instead of raising.
inspection_dates = pd.to_datetime(
    data_2022['Date'], format="%Y-%m-%d %H:%M:%S.%f%z", errors='coerce'
).dt.date
data_2022 = data_2022.assign(Date=inspection_dates).set_index('Date')
data_2022

Unnamed: 0_level_0,Tag number,Category,Action detail,Queen status,Is alive,Report notes
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2022-06-02,3627,hive status,queenright,queenright,1,nucs All qr. added second and queen excluders ...
2022-06-02,3627,hive grading,medium,queenright,1,nucs All qr. added second and queen excluders ...
2022-06-20,3627,hive grading,weak,queenright,1,
2022-07-04,3627,hive grading,medium,queenright,1,7fob
2022-07-11,3627,hive grading,medium,queenright,1,7 frames of brood
...,...,...,...,...,...,...
2022-09-01,3629,varroa,2,queenright,1,
2022-09-01,3629,treatment,mite away,queenright,1,
2022-09-07,3629,frames of bees,5,queenright,1,Lots of dead bees under bee escape
2022-09-07,3629,feeding,sugar,queenright,1,603 got minimal syrup


In [20]:
#data_2022['Date'] = pd.to_datetime(data_2022['Date'], dayfirst=True).dt.date
#data_2022 = data_2022.set_index('Date')
#data_2022

In [21]:
# Keep only the "frames of bees" records; for those rows "Action detail"
# holds the frame count as text.
# `.copy()` detaches the filtered rows from the original frame, so the column
# assignment below writes to a real DataFrame rather than to a slice (the
# chained-assignment case behind SettingWithCopyWarning, which the global
# warnings filter in this notebook would otherwise hide).
data_2022 = data_2022[data_2022['Category'] == 'frames of bees'].copy()
# Convert the column from object to 32-bit float in one step.
data_2022['Action detail'] = pd.to_numeric(data_2022['Action detail']).astype('float32')

In [22]:
# Expose the numeric "Action detail" value under the label name `fob`.
data_2022 = data_2022.assign(fob=data_2022["Action detail"])

In [23]:
# Split the 2022 frames-of-bees records into one DataFrame per hive, keyed by
# tag number. Grouping on the bare column name (instead of a one-element list)
# keeps the group keys scalar, so `get_group(tag)` does not hit the deprecated
# "list grouper + scalar key" path in recent pandas versions.
grouped = data_2022.groupby('Tag number')
dict_hives_2022 = {
    tag: grouped.get_group(tag) for tag in data_2022['Tag number'].unique()
}

In [24]:
# Expand each hive's 2022 records onto a regular 15-minute grid that runs from
# the first record to one day after the last one, and fill the gaps by linear
# interpolation. `fob` is kept as a float (no rounding).
for hive in data_2022['Tag number'].unique():
    sparse_labels = dict_hives_2022[hive]
    first_day, last_day = sparse_labels.index.min(), sparse_labels.index.max()
    idx = pd.date_range(first_day, last_day + timedelta(days=1), freq="15min")
    dict_hives_2022[hive] = sparse_labels.reindex(idx).interpolate(method="linear")

# MFCCs

In [None]:
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Window length and hop, passed as n_fft / hop_length.
win = 1600
shift = 800

# 'mfccs' features for all hives against the 2021 labels, enhancement disabled;
# the result is stored as a single pickle.
df = feature_extraction(
    feature='mfccs',
    sample_rate=16000,
    n_fft=win,
    hop_length=shift,
    dict_hives=dict_hives,
    hives=hives,
    year=2021,
    enhancement=False,
)
df.to_pickle(f"../data/features/2021_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# 'mfccs' features for all hives against the 2022 labels, enhancement disabled;
# the result is stored as a single pickle.
df = feature_extraction(
    feature='mfccs',
    sample_rate=16000,
    n_fft=win,
    hop_length=shift,
    dict_hives=dict_hives_2022,
    hives=hives,
    year=2022,
    enhancement=False,
)
df.to_pickle(f"../data/features/2022_df_mfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# 'mfccs' features with enhancement enabled, 2021 labels: one extraction call
# and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='mfccs',
        sample_rate=16000,
        n_fft=win,
        hop_length=shift,
        dict_hives=dict_hives,
        hives=[hive],
        year=2021,
        enhancement=True,
    )
    df.to_pickle(
        f"../data/features/2021_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl"
    )
    

In [None]:
# 'mfccs' features with enhancement enabled, 2022 labels: one extraction call
# and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='mfccs',
        sample_rate=16000,
        n_fft=win,
        hop_length=shift,
        dict_hives=dict_hives_2022,
        hives=[hive],
        year=2022,
        enhancement=True,
    )
    df.to_pickle(
        f"../data/features/2022_df_ss_amp_mfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl"
    )
    

# LFCCs

In [None]:
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# Window length and hop, passed as n_fft / hop_length.
win = 1600
shift = 800

# 'lfccs' features for all hives against the 2021 labels, enhancement disabled;
# the result is stored as a single pickle.
df = feature_extraction(
    feature='lfccs',
    sample_rate=16000,
    n_fft=win,
    hop_length=shift,
    dict_hives=dict_hives,
    hives=hives,
    year=2021,
    enhancement=False,
)
# The output path previously contained a doubled separator ("../data//features");
# it is normalised here so it is spelled exactly like the path the dataset-building
# cell reads back.
df.to_pickle(f"../data/features/2021_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# 'lfccs' features for all hives against the 2022 labels, enhancement disabled;
# the result is stored as a single pickle.
df = feature_extraction(
    feature='lfccs',
    sample_rate=16000,
    n_fft=win,
    hop_length=shift,
    dict_hives=dict_hives_2022,
    hives=hives,
    year=2022,
    enhancement=False,
)
df.to_pickle(f"../data/features/2022_df_lfccs_win_{win}_shift_{shift}_n_filter_26.pkl")


In [None]:
# 'lfccs' features with enhancement enabled, 2021 labels: one extraction call
# and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='lfccs',
        sample_rate=16000,
        n_fft=win,
        hop_length=shift,
        dict_hives=dict_hives,
        hives=[hive],
        year=2021,
        enhancement=True,
    )
    df.to_pickle(
        f"../data/features/2021_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl"
    )

In [None]:
# 'lfccs' features with enhancement enabled, 2022 labels: one extraction call
# and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='lfccs',
        sample_rate=16000,
        n_fft=win,
        hop_length=shift,
        dict_hives=dict_hives_2022,
        hives=[hive],
        year=2022,
        enhancement=True,
    )
    df.to_pickle(
        f"../data/features/2022_df_ss_amp_lfccs_win_{win}_shift_{shift}_{hive}_n_mels_26.pkl"
    )
    

# Spectral features

In [None]:
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# 'spectral_shape_descriptors' features for all hives against the 2021 labels,
# enhancement disabled; the result is stored as a single pickle.
df = feature_extraction(
    feature='spectral_shape_descriptors',
    sample_rate=16000,
    n_fft=1600,
    hop_length=800,
    dict_hives=dict_hives,
    hives=hives,
    year=2021,
    enhancement=False,
)
spectral_2021_pickle = "../data/features/2021_df_spectral_nine_features.pkl"
df.to_pickle(spectral_2021_pickle)


In [None]:
# 'spectral_shape_descriptors' features for all hives against the 2022 labels,
# enhancement disabled; the result is stored as a single pickle.
df = feature_extraction(
    feature='spectral_shape_descriptors',
    sample_rate=16000,
    n_fft=1600,
    hop_length=800,
    dict_hives=dict_hives_2022,
    hives=hives,
    year=2022,
    enhancement=False,
)
spectral_2022_pickle = "../data/features/2022_df_spectral_nine_features.pkl"
df.to_pickle(spectral_2022_pickle)


In [None]:
# 'spectral_shape_descriptors' features with enhancement enabled, 2021 labels:
# one extraction call and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='spectral_shape_descriptors',
        sample_rate=16000,
        n_fft=1600,
        hop_length=800,
        dict_hives=dict_hives,
        hives=[hive],
        year=2021,
        enhancement=True,
    )
    df.to_pickle(f"../data/features/2021_df_ss_amp_spectral_nine_features_{hive}.pkl")
    

In [None]:
# 'spectral_shape_descriptors' features with enhancement enabled, 2022 labels:
# one extraction call and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='spectral_shape_descriptors',
        sample_rate=16000,
        n_fft=1600,
        hop_length=800,
        dict_hives=dict_hives_2022,
        hives=[hive],
        year=2022,
        enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_spectral_nine_features_{hive}.pkl")
    

# Hand-crafted features

In [None]:
hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]

# 'nectar_hand_crafted' features for all hives against the 2021 labels,
# enhancement disabled. Note the different settings from the cepstral and
# spectral cells: sample_rate 15625 and a non-overlapping 512-sample window.
df = feature_extraction(
    feature='nectar_hand_crafted',
    sample_rate=15625,
    n_fft=512,
    hop_length=512,
    dict_hives=dict_hives,
    hives=hives,
    year=2021,
    enhancement=False,
)
nectar_2021_pickle = "../data/features/2021_df_nectar_features.pkl"
df.to_pickle(nectar_2021_pickle)



In [None]:
# 'nectar_hand_crafted' features for all hives against the 2022 labels,
# enhancement disabled; the result is stored as a single pickle.
df = feature_extraction(
    feature='nectar_hand_crafted',
    sample_rate=15625,
    n_fft=512,
    hop_length=512,
    dict_hives=dict_hives_2022,
    hives=hives,
    year=2022,
    enhancement=False,
)
nectar_2022_pickle = "../data/features/2022_df_nectar_features.pkl"
df.to_pickle(nectar_2022_pickle)


In [None]:
# 'nectar_hand_crafted' features with enhancement enabled, 2021 labels:
# one extraction call and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='nectar_hand_crafted',
        sample_rate=15625,
        n_fft=512,
        hop_length=512,
        dict_hives=dict_hives,
        hives=[hive],
        year=2021,
        enhancement=True,
    )
    df.to_pickle(f"../data/features/2021_df_ss_amp_hand_crafted_{hive}.pkl")
    

In [None]:
# 'nectar_hand_crafted' features with enhancement enabled, 2022 labels:
# one extraction call and one output pickle per hive.
for hive in hives:
    df = feature_extraction(
        feature='nectar_hand_crafted',
        sample_rate=15625,
        n_fft=512,
        hop_length=512,
        dict_hives=dict_hives_2022,
        hives=[hive],
        year=2022,
        enhancement=True,
    )
    df.to_pickle(f"../data/features/2022_df_ss_amp_hand_crafted_{hive}.pkl")
    

# Build the dataset

In [None]:
win = 1600
shift = 800


def _load_season_pair(path_2021, path_2022):
    """Load one feature set for both seasons and merge it into a single frame.

    Each pickle is re-indexed by its parsed 'date' column (the column itself is
    removed afterwards). The 2022 'tag' values are shifted by 10 before merging
    -- presumably so that a hive recorded in both years is treated as two
    distinct groups; TODO confirm.

    Parameters
    ----------
    path_2021, path_2022 : str
        Paths of the pickled feature frames written by the extraction cells.

    Returns
    -------
    tuple of (DataFrame, DataFrame, DataFrame)
        The date-indexed 2021 frame, the date-indexed 2022 frame (tags already
        shifted), and their concatenation with the 'raw_audio' column removed.
    """
    seasons = []
    for path in (path_2021, path_2022):
        frame = pd.read_pickle(path)
        parsed_dates = pd.to_datetime(frame['date'], dayfirst=True)
        seasons.append(frame.set_index(parsed_dates).drop(columns=['date']))
    season_2021, season_2022 = seasons
    season_2022['tag'] = season_2022['tag'] + 10
    combined = pd.concat([season_2021, season_2022]).drop(columns=['raw_audio'])
    return season_2021, season_2022, combined


# File names must match what the extraction cells above write:
#   * MFCC and LFCC pickles both end in "_n_filter_26.pkl" (this cell used to
#     request "_n_mels_26.pkl" for the MFCCs, a file the notebook never creates);
#   * the hand-crafted set is written as "*_df_nectar_features.pkl" (this cell
#     used to request "*_df_hand_crafted_features.pkl").
cepstral_suffix = f"win_{win}_shift_{shift}_n_filter_26.pkl"

mfccs_2021, mfccs_2022, mfccs = _load_season_pair(
    f"../data/features/2021_df_mfccs_{cepstral_suffix}",
    f"../data/features/2022_df_mfccs_{cepstral_suffix}",
)

lfccs_2021, lfccs_2022, lfccs = _load_season_pair(
    f"../data/features/2021_df_lfccs_{cepstral_suffix}",
    f"../data/features/2022_df_lfccs_{cepstral_suffix}",
)

spectral_2021, spectral_2022, spectral = _load_season_pair(
    "../data/features/2021_df_spectral_nine_features.pkl",
    "../data/features/2022_df_spectral_nine_features.pkl",
)

hand_crafted_2021, hand_crafted_2022, hand_crafted = _load_season_pair(
    "../data/features/2021_df_nectar_features.pkl",
    "../data/features/2022_df_nectar_features.pkl",
)


In [None]:
win = 1600
shift = 800

hives = [6, 3627, 3628, 3629, 3631, 3640, 3690, 3691, 3692, 3693]


def _read_enhanced_season(year, stem, suffix=""):
    """Stack the per-hive enhanced pickles of one season and index them by date.

    The file for each hive in `hives` is
    "../data/features/{year}_df_ss_amp_{stem}_{hive}{suffix}.pkl". The stacked
    frame is indexed by its parsed 'date' column, which is then removed.
    """
    stacked = pd.concat(
        [
            pd.read_pickle(f"../data/features/{year}_df_ss_amp_{stem}_{hive}{suffix}.pkl")
            for hive in hives
        ],
        ignore_index=True,
    )
    parsed_dates = pd.to_datetime(stacked['date'], dayfirst=True)
    return stacked.set_index(parsed_dates).drop(columns=['date'])


# --- MFCCs (enhanced) -------------------------------------------------------
mfccs_ss_2021 = _read_enhanced_season(2021, f"mfccs_win_{win}_shift_{shift}", "_n_mels_26")
mfccs_ss_2022 = _read_enhanced_season(2022, f"mfccs_win_{win}_shift_{shift}", "_n_mels_26")
# 2022 tags are shifted by 10 before the two seasons are merged.
mfccs_ss_2022['tag'] = mfccs_ss_2022['tag'] + 10
mfccs_ss = pd.concat([mfccs_ss_2021, mfccs_ss_2022]).drop(columns=['raw_audio'])

# --- LFCCs (enhanced) -------------------------------------------------------
lfccs_ss_2021 = _read_enhanced_season(2021, f"lfccs_win_{win}_shift_{shift}", "_n_mels_26")
lfccs_ss_2022 = _read_enhanced_season(2022, f"lfccs_win_{win}_shift_{shift}", "_n_mels_26")
lfccs_ss_2022['tag'] = lfccs_ss_2022['tag'] + 10
lfccs_ss = pd.concat([lfccs_ss_2021, lfccs_ss_2022]).drop(columns=['raw_audio'])

# --- Spectral shape descriptors (enhanced) ----------------------------------
# Infinite values are turned into NaN, then every column that contains a NaN is
# dropped -- separately for each season.
# NOTE(review): because the drop is per season, the two frames can end up with
# different column sets, which the concat would pad with NaN -- confirm this
# cannot happen with the real data.
spectral_ss_2021 = (
    _read_enhanced_season(2021, "spectral_nine_features")
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis='columns')
)
spectral_ss_2022 = (
    _read_enhanced_season(2022, "spectral_nine_features")
    .replace([np.inf, -np.inf], np.nan)
    .dropna(axis='columns')
)
spectral_ss_2022['tag'] = spectral_ss_2022['tag'] + 10
spectral_ss = pd.concat([spectral_ss_2021, spectral_ss_2022]).drop(columns=['raw_audio'])

# --- Hand-crafted features (enhanced) ---------------------------------------
hand_crafted_ss_2021 = _read_enhanced_season(2021, "hand_crafted")
hand_crafted_ss_2022 = _read_enhanced_season(2022, "hand_crafted")
hand_crafted_ss_2022['tag'] = hand_crafted_ss_2022['tag'] + 10
hand_crafted_ss = pd.concat([hand_crafted_ss_2021, hand_crafted_ss_2022]).drop(
    columns=['raw_audio']
)


# Random split

In [None]:
# Feature ranking on a random 15 % subset of the data.
feature = 'mfcc'  # 'mfcc', 'lfcc', 'spectral'

# The merged MFCC frame built in the dataset cell is named `mfccs`; the former
# `mfcc` is not defined anywhere in this notebook and raised a NameError.
feature_data = mfccs  # mfccs, lfccs, spectral, hand_crafted
method = 'shap'  # 'mrmr', 'shap'
split = 'random'  # 'independent'
n_splits = 5
model = 'random forest'
preprocessing = 'off'
# Every column from the third one onwards is offered as a candidate feature
# (presumably the first two hold the hive tag and the label -- TODO confirm).
selected_columns = feature_data.columns[2:]

# test_size=0.85 makes the first returned part the 15 % used for feature
# selection; the remaining 85 % is kept as `feature_data_rest`.
feature_data_15, feature_data_rest = train_test_split(feature_data, test_size=0.85, random_state=42)

features_selection_cv(feature, feature_data_15, selected_columns, n_splits, method, split, model, preprocessing)
read_and_plot_results(method, feature, split, preprocessing='off')

# Run model on the best number of features

In [None]:
# Random-split evaluation using the top-ranked features.
feature = 'mfcc'  # 'lfcc', 'spectral'
feature_data = mfccs  # lfccs, spectral
method = 'shap'  # 'mrmr'
split = 'random'  # 'independent'
model = 'random forest'
n_splits = 5
fobs = 'all'
folds = '5folds'
num_features = 12
preprocessing = 'on'

# Shared tail of the result file names used in this cell.
run_suffix = f"{split}_preprocessing_{preprocessing}"

# Baseline predictions stored by an earlier run of the random regressor.
random_predictions = np.load(
    f"random_regressor_mfcc_0_{folds}_{fobs}_{model}_shap_predictions_{run_suffix}.npy",
    allow_pickle=True,
)

# Same split parameters as the feature-selection cell; the model is evaluated
# on the 85 % part (`feature_data_rest`).
feature_data_15, feature_data_rest = train_test_split(feature_data, test_size=0.85, random_state=42)

# The ranking is always read from the "preprocessing_off" file.
ranking_filename = f"{feature}_{method}_feature_ranking_{split}_preprocessing_off.csv"
ranking_df = pd.read_csv(ranking_filename)
feature_ranking_idxs = ranking_df["Feature Index"].values

print(f"Feature indices loaded: {feature_ranking_idxs}")

# Keep the `num_features` best-ranked columns; the ranking indices are
# positions within the columns from the third one onwards.
candidate_columns = feature_data.columns[2:]
selected_columns = candidate_columns[feature_ranking_idxs[:num_features]]

model_predictions, y_tests, _, _ = random_split_cv(
    feature_data_rest, selected_columns, n_splits=n_splits, model=model
)
evaluate_model_performance(y_tests, random_predictions, model_predictions)

np.save(f"y_tests_{folds}_{fobs}_{run_suffix}.npy", y_tests)
np.save(f"{feature}_{folds}_{fobs}_{model}_{method}_predictions_{run_suffix}.npy", model_predictions)



# Independent split

In [None]:
# Feature ranking on a fixed set of hives that is held out from the evaluation.
feature = 'mfccs'  # 'mfcc', 'lfcc', 'spectral', 'hand_crafted'
feature_data = mfccs  # mfccs, lfccs, spectral
method = 'shap'  # 'mrmr', 'shap'
split = 'independent'  # 'independent', 'random'
model = 'random forest'
n_splits = 5
preprocessing = 'off'
selected_columns = feature_data.columns[2:]

# Tags reserved for feature selection. 2022 tags carry the +10 offset applied
# when the dataset was built, so 3650 and 3638 would be 2022 hives 3640 and 3628.
list_hives_feature_selection = [3692, 3650, 3638, 3627]
in_selection_set = feature_data['tag'].isin(list_hives_feature_selection)
feature_data_15 = feature_data[in_selection_set]
feature_data_rest = feature_data[~in_selection_set]

features_selection_cv(feature, feature_data_15, selected_columns, n_splits, method, split, model, preprocessing)
read_and_plot_results(method, feature, split, preprocessing='off')


# Run model on the best number of features

In [None]:
# Hive-independent evaluation using the top-ranked features.
feature = 'mfccs'  # 'lfcc', 'spectral', 'mssd'
feature_data = mfccs  # lfccs, spectral, hand_crafted
method = 'mrmr'  # 'shap'
split = 'independent'
model = 'random forest'
preprocessing = 'on'
fobs = 'all'
folds = '5folds'  # 'loo'
num_features = 10

# Hives used for feature selection are excluded from the evaluation data.
list_hives_feature_selection = [3692, 3650, 3638, 3627]
held_out = feature_data['tag'].isin(list_hives_feature_selection)
feature_data_rest = feature_data[~held_out]

# 'loo' uses one outer fold per remaining hive; anything else uses 5 folds.
n_outer_folds = feature_data_rest['tag'].nunique() if folds == 'loo' else 5

# NOTE(review): the ranking file prefix is hard-coded to "mfcc" while `feature`
# is 'mfccs' here, and the random-split cell builds this name from `feature`.
# Confirm which file name the feature-selection step actually writes.
ranking_filename = f"mfcc_{method}_feature_ranking_{split}_preprocessing_off.csv"
ranking_df = pd.read_csv(ranking_filename)
feature_ranking_idxs = ranking_df["Feature Index"].values

print(f"Feature indices loaded: {feature_ranking_idxs}")

# Keep the `num_features` best-ranked columns; the ranking indices are
# positions within the columns from the third one onwards.
candidate_columns = feature_data.columns[2:]
selected_columns = candidate_columns[feature_ranking_idxs[:num_features]]

model_predictions, y_tests, _, _, hive_results = hive_independent_cv(
    feature_data_rest,
    selected_columns,
    model=model,
    n_outer_folds=n_outer_folds,
    n_inner_folds=5,
)

# Shared head and tail of the two result file names.
output_prefix = f"{feature}_{folds}_{fobs}_{model}_{method}"
output_suffix = f"{split}_preprocessing_{preprocessing}"

np.save(f"{output_prefix}_predictions_{output_suffix}.npy", model_predictions)

with open(f"{output_prefix}_hive_results_{output_suffix}.pkl", 'wb') as file:
    pickle.dump(hive_results, file)