## 0. Installs and imports

In [None]:
# Disabled install cell: the commands sit inside a string literal, so nothing is executed.
# Copy the lines out of the string to install the notebook's dependencies.
'''
!pip install matplotlib
!pip install plotly
#install all packages needed from this notebook
!pip install numpy
!pip install pandas
!pip install scipy
!pip install scikit-learn
!pip install librosa
!pip install IPython
%pip install imbalanced-learn
'''


In [None]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import librosa
from IPython.display import Audio

## 1. Load files

### 1.1 Annotations

In [None]:
# Annotation files for the two recordings; each is read by load_annotations_file as a
# tab-separated file. Paths are relative to the notebook's working directory.
annotations_kj_path = "../data/raw/annotations_koti_janmani.txt"
annotations_vnk_path = "../data/raw/annotations_vanajaksha_ninni_kore.txt"

In [None]:
def to_seconds(t):
    """
    Convert the time-of-day part of a datetime-like value to seconds.

    :param t: Object exposing ``hour``, ``minute``, ``second`` and ``microsecond`` attributes.
    :return: Seconds since midnight, including the fractional (microsecond) part.
    """
    whole_seconds = t.second + 60 * (t.minute + 60 * t.hour)
    fractional_seconds = t.microsecond / 1000000
    return whole_seconds + fractional_seconds

def load_annotations_file(path: str) -> pd.DataFrame:
    """
    Read a tab-separated annotations file into a DataFrame with times in seconds.

    The file is expected to have six columns and no header row. The second
    column is discarded; the remaining ones are named ``level``, ``start``,
    ``end``, ``duration`` and ``label``.

    :param path: Path to the file containing the annotations.
    :return: DataFrame with an extra ``index`` column (the original row number)
        and ``start``, ``end`` and ``duration`` expressed in seconds.
    """
    raw_columns = ["level", "", "start", "end", "duration", "label"]

    annotations = pd.read_csv(path, sep='\t', header=None)
    annotations.columns = raw_columns
    # The unnamed second column carries nothing that is used later
    del annotations[""]

    # start/end are parsed as timestamps and reduced to their time-of-day in seconds
    for boundary in ("start", "end"):
        parsed = pd.to_datetime(annotations[boundary])
        annotations[boundary] = parsed
        annotations[boundary] = annotations[boundary].apply(to_seconds)

    # duration is parsed as a time span
    spans = pd.to_timedelta(annotations['duration'])
    annotations["duration"] = spans.dt.total_seconds()

    # Materialise the row number as an "index" column
    annotations.reset_index(inplace=True)

    return annotations

In [None]:
# Parse both annotation files into DataFrames (times converted to seconds by the loader)
annotations_kj = load_annotations_file(annotations_kj_path)
annotations_vnk = load_annotations_file(annotations_vnk_path)

#annotations_vnk

In [None]:
# Keep only the sancara-level rows of each file. The two files name that level differently
# ("underlying_sancara" vs "root_sancara"); the names are unified later when the feature
# tables are merged.
annotations_kj_usancara = annotations_kj[annotations_kj["level"] == "underlying_sancara"]
annotations_vnk_usancara = annotations_vnk[annotations_vnk["level"] == "root_sancara"]

#annotations_vnk_usancara

### 1.2 Audio

In [None]:
# Audio files that are loaded with librosa below (paths relative to the working directory)
audio_kj_path = "../data/raw/Koti Janmani/Koti Janmani.multitrack-vocal.mp3"
audio_vnk_path = "../data/raw/Vanajaksha Ninne Kori/Vanajaksha Ninne Kori_vocal.mp3"

In [None]:
def load_audio_file(path: str, sampling_rate: int) -> tuple:
    """
    Load an audio file with librosa at the requested sampling rate.

    :param path: Path to the audio file.
    :param sampling_rate: Value passed to ``librosa.load`` as ``sr``.
    :return: ``(audio_time_series, sr)`` exactly as produced by ``librosa.load``.
    """
    loaded = librosa.load(path, sr=sampling_rate)
    samples, rate = loaded
    return samples, rate

In [None]:
# Load both recordings at the same sampling rate (44100)
audio_kj, sr_kj = load_audio_file(audio_kj_path, 44100)
audio_vnk, sr_vnk = load_audio_file(audio_vnk_path, 44100)

# Audio player widget for the kj recording (the vnk one is commented out)
#Audio(data=audio_vnk, rate=sr_vnk)
Audio(data=audio_kj, rate=sr_kj)

### 1.3 Extract pitch

In [None]:
# Load each recording's tonic reference into a constant (used to convert Hz to cents)
tonic_path_kj = "../data/raw/Koti Janmani/Koti Janmani.ctonic.txt"
tonic_path_vnk = "../data/raw/Vanajaksha Ninne Kori/Vanajaksha Ninne Kori.ctonic.txt"

# The first line of each tonic file is parsed as a single float
with open(tonic_path_kj, "r") as f:
    ctonic_ref_kj = float(f.readline().strip())

with open(tonic_path_vnk, "r") as f:
    ctonic_ref_vnk = float(f.readline().strip())

# Last expression of the cell: displays the vnk tonic value
ctonic_ref_vnk

In [None]:
from scipy.signal import savgol_filter

def pitch_to_cents(pitch: float, ref: float):
    """
    Convert a pitch in Hz to cents relative to a reference frequency.

    :param pitch: Pitch value in Hz. Zero and negative values mean "no pitch".
    :param ref: Reference frequency in Hz (e.g. the tonic); must be positive.
    :return: ``1200 * log2(pitch / ref)``, or ``None`` when ``pitch <= 0``.
        A NaN pitch yields NaN.
    """
    # Non-positive pitch has no logarithm; report it as missing instead of
    # letting the logarithm raise ValueError on negative input.
    if pitch <= 0:
        return None
    # log2 is exact for powers of two, unlike log(x, 2)
    return 1200 * math.log2(pitch / ref)

def interpolate_and_smooth_pitch(pitch, ts, window_length=50, polyorder=2):
    """
    Fill gaps in a pitch track by linear interpolation and smooth the result.

    Non-positive samples are treated as missing. Gaps are interpolated linearly;
    gaps at the very start or end are filled from the nearest valid sample so
    that no NaN reaches the smoothing filter. The input is never modified.

    :param pitch: Sequence of pitch values (array-like of numbers).
    :param ts: Time step of the pitch track. Currently unused; kept so existing
        callers keep working.
    :param window_length: Window length passed to ``savgol_filter``.
    :param polyorder: Polynomial order passed to ``savgol_filter``.
    :return: Smoothed pitch values as returned by ``savgol_filter``.
    """
    series = pd.Series(pitch, dtype=float)
    # where() builds a new Series, so the caller's array is left untouched
    voiced_only = series.where(series > 0)
    pitch_interpolated = voiced_only.interpolate(method="linear", limit_direction="both")
    return savgol_filter(pitch_interpolated, window_length=window_length, polyorder=polyorder)

#### Extract pitch from pitch file

In [None]:
# Pitch files read by load_pitch_file below.
# NOTE(review): the kj file name uses an underscore ("Koti_Janmani") while the other kj
# paths in this notebook use a space - confirm it matches the file on disk.
pitch_path_kj = "../data/raw/Koti Janmani/Koti_Janmani.melodia.pitch.txt"
pitch_path_vnk = "../data/raw/Vanajaksha Ninne Kori/Vanajaksha Ninne Kori.melodia.pitch.txt"

In [None]:
def load_pitch_file(path: str):
    """
    Read a two-column, tab-separated pitch file without a header row.

    :param path: Path to the pitch file.
    :return: ``(pitch_file, time, pitch, timestep)`` where ``pitch_file`` is the
        DataFrame with columns ``time`` and ``pitch``, ``time`` and ``pitch`` are
        the column values as arrays, and ``timestep`` is the difference between
        the first two time stamps.
    """
    pitch_file = pd.read_csv(path, sep="\t", header=None)
    pitch_file.columns = ["time", "pitch"]

    time, pitch = (pitch_file[column].values for column in ("time", "pitch"))

    # The step is taken from the first two rows only
    first_stamp, second_stamp = time[0], time[1]
    timestep = second_stamp - first_stamp

    return pitch_file, time, pitch, timestep


In [None]:
# Read the pitch track (time stamps, pitch values and time step) of each recording
pitch_file_kj, time_kj, pitch_kj, timestep_kj = load_pitch_file(pitch_path_kj)
pitch_file_vnk, time_vnk, pitch_vnk, timestep_vnk = load_pitch_file(pitch_path_vnk)

# Replace non-positive values with NaN, interpolate and smooth
pitch_kj_smoothed = interpolate_and_smooth_pitch(pitch_kj, timestep_kj)
pitch_vnk_smoothed = interpolate_and_smooth_pitch(pitch_vnk, timestep_vnk)

# Convert pitch to cents, sample by sample, relative to each recording's own tonic
pitch_cents_kj = np.array([pitch_to_cents(p, ctonic_ref_kj) for p in pitch_kj_smoothed])
pitch_cents_vnk = np.array([pitch_to_cents(p, ctonic_ref_vnk) for p in pitch_vnk_smoothed])

In [None]:
# Spot-check twenty consecutive cent values of the vnk track
#pitch_cents_kj[10000:10020]
pitch_cents_vnk[10000:10020]

##### Pitch was extracted using Melodia and saved to the files "Koti Janmani.melodia.pitch.txt" and "Vanajaksha Ninne Kori.melodia.pitch.txt".

## 2. Feature extraction

### 2.1 Time Domain Features

In [None]:
from librosa.feature import rms
from librosa.feature import zero_crossing_rate as zcr

In [None]:
# Start each time-domain feature table as a copy of the sancara-level annotations
time_features_kj_df = annotations_kj_usancara.copy()
time_features_vnk_df = annotations_vnk_usancara.copy()

#### 2.1.1 Root Mean Square Energy

In [None]:
def compute_rms(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> float:
    """
    Average frame-wise RMS energy of an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``.
    :return: Mean of the first row returned by ``librosa.feature.rms``.
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    frame_values = rms(y=audio[first_sample:last_sample])
    return np.mean(frame_values[0])

In [None]:
# NOTE: a stands for annotation
# Row-wise apply: one mean-RMS value per annotated segment, computed on that song's audio
time_features_kj_df['rmse'] = time_features_kj_df.apply(
    lambda a: compute_rms(audio_kj, a['start'], a['end'], sr_kj), axis=1
)
time_features_vnk_df['rmse'] = time_features_vnk_df.apply(
    lambda a: compute_rms(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
)

#### 2.1.2 Zero Crossing Rate

In [None]:
def compute_zcr(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> float:
    """
    Average frame-wise zero-crossing rate of an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``.
    :return: Mean of the first row returned by ``librosa.feature.zero_crossing_rate``.
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    frame_values = zcr(y=audio[first_sample:last_sample])
    return np.mean(frame_values[0])

In [None]:
# One mean zero-crossing-rate value per annotated segment (a is an annotation row)
time_features_kj_df['zcr'] = time_features_kj_df.apply(
    lambda a: compute_zcr(audio_kj, a['start'], a['end'], sr_kj), axis=1
)
time_features_vnk_df['zcr'] = time_features_vnk_df.apply(
    lambda a: compute_zcr(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
)

#### Time Domain Features DataFrame

In [None]:
#time_features_kj_df
#time_features_vnk_df

### 2.2 Frequency Domain Features

In [None]:
from librosa.feature import spectral_centroid as scentroid
from librosa.feature import spectral_bandwidth as sbandwidth
from librosa.feature import spectral_rolloff as srolloff
from librosa.feature import mfcc

In [None]:
# Start each frequency-domain feature table as a copy of the sancara-level annotations
frequency_features_kj_df = annotations_kj_usancara.copy()
frequency_features_vnk_df = annotations_vnk_usancara.copy()

#### 2.2.1 Spectral Centroid

Each frame of a magnitude spectrogram is normalized and treated as a distribution over frequency bins, from which the mean (centroid) is extracted per frame.

In [None]:
def compute_scentroid(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> float:
    """
    Average frame-wise spectral centroid of an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``; also forwarded to librosa.
    :return: Mean of the first row returned by ``librosa.feature.spectral_centroid``.
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    frame_values = scentroid(y=audio[first_sample:last_sample], sr=sr)
    return np.mean(frame_values[0])

In [None]:
# One mean spectral-centroid value per annotated segment (a is an annotation row)
frequency_features_kj_df['spectral_centroid'] = frequency_features_kj_df.apply(
    lambda a: compute_scentroid(audio_kj, a['start'], a['end'], sr_kj), axis=1
)
frequency_features_vnk_df['spectral_centroid'] = frequency_features_vnk_df.apply(
    lambda a: compute_scentroid(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
)

#### 2.2.2 Spectral Bandwidth

In [None]:
def compute_sbandwidth(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> float:
    """
    Average frame-wise spectral bandwidth of an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``; also forwarded to librosa.
    :return: Mean of the first row returned by ``librosa.feature.spectral_bandwidth``.
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    frame_values = sbandwidth(y=audio[first_sample:last_sample], sr=sr)
    return np.mean(frame_values[0])

In [None]:
# One mean spectral-bandwidth value per annotated segment (a is an annotation row)
frequency_features_kj_df['spectral_bandwidth'] = frequency_features_kj_df.apply(
    lambda a: compute_sbandwidth(audio_kj, a['start'], a['end'], sr_kj), axis=1
)
frequency_features_vnk_df['spectral_bandwidth'] = frequency_features_vnk_df.apply(
    lambda a: compute_sbandwidth(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
)

#### 2.2.3 Spectral Rolloff

The roll-off frequency is defined for each frame as the center frequency for a spectrogram bin such that at least roll_percent (0.85 by default) of the energy of the spectrum in this frame is contained in this bin and the bins below. This can be used to, e.g., approximate the maximum (or minimum) frequency by setting roll_percent to a value close to 1 (or 0).

In [None]:
def compute_srolloff(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> float:
    """
    Average frame-wise spectral roll-off of an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``; also forwarded to librosa.
    :return: Mean of the first row returned by ``librosa.feature.spectral_rolloff``.
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    frame_values = srolloff(y=audio[first_sample:last_sample], sr=sr)
    return np.mean(frame_values[0])

In [None]:
# One mean spectral-roll-off value per annotated segment (a is an annotation row)
frequency_features_kj_df['spectral_rolloff'] = frequency_features_kj_df.apply(
    lambda a: compute_srolloff(audio_kj, a['start'], a['end'], sr_kj), axis=1
)
frequency_features_vnk_df['spectral_rolloff'] = frequency_features_vnk_df.apply(
    lambda a: compute_srolloff(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
)

#### 2.2.4 Mel Frequency Cepstral Coefficients

In [None]:
# The MFCC columns are collected in their own tables, again seeded from the annotations
mfcc_kj_df = annotations_kj_usancara.copy()
mfcc_vnk_df = annotations_vnk_usancara.copy()

In [None]:
def compute_mfcc(audio: np.ndarray, sample_start: float, sample_end: float, sr: int) -> np.ndarray:
    """
    Per-coefficient average of six MFCCs over an audio excerpt.

    :param audio: Audio samples.
    :param sample_start: Start of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sample_end: End of the excerpt; multiplied by ``sr`` to get a sample index.
    :param sr: Sampling rate of ``audio``; also forwarded to librosa.
    :return: Mean along axis 1 of the ``librosa.feature.mfcc`` output (``n_mfcc=6``).
    """
    first_sample = round(sample_start * sr)
    last_sample = round(sample_end * sr)
    coefficients = mfcc(y=audio[first_sample:last_sample], sr=sr, n_mfcc=6)
    return np.mean(coefficients, axis=1)

# Column names mfcc_1 .. mfcc_6; the count must match n_mfcc=6 used in compute_mfcc
mfcc_cols = [f'mfcc_{i+1}' for i in range(6)]

In [None]:
# compute_mfcc returns one array per segment; .apply(pd.Series) spreads each array over
# separate columns, which are then stored under the mfcc_cols names
mfcc_kj_df[mfcc_cols] = mfcc_kj_df.apply(
    lambda a: compute_mfcc(audio_kj, a['start'], a['end'], sr_kj), axis=1
).apply(pd.Series)
mfcc_vnk_df[mfcc_cols] = mfcc_vnk_df.apply(
    lambda a: compute_mfcc(audio_vnk, a['start'], a['end'], sr_vnk), axis=1
).apply(pd.Series)

#### Frequency Domain Features DataFrame

In [None]:
#frequency_features_kj_df
#frequency_features_vnk_df
# Display the kj MFCC table as the cell output
mfcc_kj_df
#mfcc_vnk_df


### 2.3 Pitch Curve Features

In [None]:
# Start each pitch-curve feature table as a copy of the sancara-level annotations
pitch_features_kj_df = annotations_kj_usancara.copy()
pitch_features_vnk_df = annotations_vnk_usancara.copy()

#### 2.3.1 Mean pitch, Min/Max and Range

In [None]:
def get_mean_min_max_pitch(cents: np.ndarray, tstep: float, sample_start: float, sample_end: float):
    """
    Mean, minimum and maximum of a pitch curve between two time stamps.

    The time stamps are converted to frame indices by dividing by ``tstep`` and
    rounding. Missing values (NaN or ``None``) inside the excerpt propagate: all
    three statistics are then NaN. An empty excerpt also yields three NaNs
    instead of raising.

    :param cents: Pitch curve in cents.
    :param tstep: Time step between consecutive entries of ``cents``.
    :param sample_start: Start time of the excerpt.
    :param sample_end: End time of the excerpt.
    :return: Tuple ``(mean, minimum, maximum)`` of the excerpt.
    """
    first_frame = round(sample_start / tstep)
    last_frame = round(sample_end / tstep)
    # dtype=float turns None entries into NaN so all three statistics agree
    sample_cents = np.asarray(cents[first_frame:last_frame], dtype=float)
    if sample_cents.size == 0:
        return np.nan, np.nan, np.nan
    # np.min/np.max propagate NaN consistently; builtin min/max depend on element order
    return np.mean(sample_cents), np.min(sample_cents), np.max(sample_cents)

In [None]:
# Each segment yields a (mean, min, max) tuple; .apply(pd.Series) spreads it over three columns
pitch_features_kj_df[['mean_pitch', 'min_pitch', 'max_pitch']] = pitch_features_kj_df.apply(
    lambda a: get_mean_min_max_pitch(pitch_cents_kj, timestep_kj, a['start'], a['end']), axis=1
).apply(pd.Series)
pitch_features_vnk_df[['mean_pitch', 'min_pitch', 'max_pitch']] = pitch_features_vnk_df.apply(
    lambda a: get_mean_min_max_pitch(pitch_cents_vnk, timestep_vnk, a['start'], a['end']), axis=1
).apply(pd.Series)

In [None]:
# Range: difference between the highest and lowest pitch of each segment
pitch_features_kj_df['pitch_range'] = pitch_features_kj_df['max_pitch'] - pitch_features_kj_df['min_pitch']
pitch_features_vnk_df['pitch_range'] = pitch_features_vnk_df['max_pitch'] - pitch_features_vnk_df['min_pitch']

#### 2.3.2 Number of Change Points

In [None]:
from scipy.signal import find_peaks

def compute_number_of_change_points(cents: np.ndarray, prominence: int, tstep: float, sample_start: float, sample_end: float) -> int:
    """
    Count the prominent local maxima and minima of a pitch-curve excerpt.

    :param cents: Pitch curve in cents.
    :param prominence: Minimum prominence (passed to ``find_peaks``) for an
        extremum to be counted, so that only significant changes are kept.
    :param tstep: Time step between consecutive entries of ``cents``.
    :param sample_start: Start time of the excerpt.
    :param sample_end: End time of the excerpt.
    :return: Number of peaks plus number of valleys in the excerpt.
    """
    first_frame = round(sample_start / tstep)
    last_frame = round(sample_end / tstep)
    segment = cents[first_frame:last_frame]

    # Valleys are found as the peaks of the mirrored curve
    return sum(
        len(find_peaks(curve, prominence=prominence)[0])
        for curve in (segment, -segment)
    )

In [None]:
# Minimum prominence for an extremum to count as a change point
prominence = 70 #cents

# One change-point count per annotated segment (a is an annotation row)
pitch_features_kj_df['num_change_points'] = pitch_features_kj_df.apply(
    lambda a: compute_number_of_change_points(pitch_cents_kj, prominence, timestep_kj, a['start'], a['end']), axis=1
)
pitch_features_vnk_df['num_change_points'] = pitch_features_vnk_df.apply(
    lambda a: compute_number_of_change_points(pitch_cents_vnk, prominence, timestep_vnk, a['start'], a['end']), axis=1
)

#### 2.3.3 Number of Change Points per second

In [None]:
def compute_number_of_change_points_per_second(cents: np.ndarray, prominence: int, tstep: float, sample_start: float, sample_end: float) -> float:
    """
    Change-point count of an excerpt divided by the excerpt's length.

    :param cents: Pitch curve in cents.
    :param prominence: Minimum prominence for an extremum to be counted.
    :param tstep: Time step between consecutive entries of ``cents``.
    :param sample_start: Start time of the excerpt.
    :param sample_end: End time of the excerpt.
    :return: ``compute_number_of_change_points(...) / (sample_end - sample_start)``.
    """
    total = compute_number_of_change_points(cents, prominence, tstep, sample_start, sample_end)
    elapsed = sample_end - sample_start
    return total / elapsed

In [None]:
# Change-point rate per annotated segment; the count is recomputed here from the pitch
# curve rather than derived from the 'num_change_points' column
pitch_features_kj_df['num_change_points_per_second'] = pitch_features_kj_df.apply(
    lambda a: compute_number_of_change_points_per_second(pitch_cents_kj, prominence, timestep_kj, a['start'], a['end']), axis=1
)
pitch_features_vnk_df['num_change_points_per_second'] = pitch_features_vnk_df.apply(
    lambda a: compute_number_of_change_points_per_second(pitch_cents_vnk, prominence, timestep_vnk, a['start'], a['end']), axis=1
)

#### Pitch Curve Features DataFrame

In [None]:
#pitch_features_kj_df
#pitch_features_vnk_df

### 2.4 Create DataFrame with the Features

In [None]:
# Build one wide feature table per song: the annotation columns come first, followed by
# every feature table with its own copy of the annotation columns removed.
cols_to_drop = ["index", "level", "start", "end", "duration", "label"]

features_kj_df = pd.concat(
    [annotations_kj_usancara]
    + [
        feature_table.drop(columns=cols_to_drop)
        for feature_table in (time_features_kj_df, frequency_features_kj_df, mfcc_kj_df, pitch_features_kj_df)
    ],
    axis=1,
)
features_vnk_df = pd.concat(
    [annotations_vnk_usancara]
    + [
        feature_table.drop(columns=cols_to_drop)
        for feature_table in (time_features_vnk_df, frequency_features_vnk_df, mfcc_vnk_df, pitch_features_vnk_df)
    ],
    axis=1,
)


In [None]:
# Merge both dataframes (kj rows first, then vnk rows).
# NOTE: ignore_index is not set, so every row keeps the index label it had in its own
# song's table; labels may therefore repeat across the two songs.
features_df = pd.concat([features_kj_df, features_vnk_df], axis=0)
# Unify the level names: 'root_sancara' becomes 'underlying_sancara'
features_df['level'] = features_df['level'].apply(lambda y: y.replace('root','underlying'))

#features_df[130:150]

In [None]:
# Normalize the data
def normalize_dataframe(df: pd.DataFrame, features, exclude=('num_change_points', 'num_change_points_per_second')) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the selected columns standardised (z-score).

    Each selected column becomes ``(column - mean) / std`` using the column's own
    mean and sample standard deviation. The input DataFrame is left untouched.

    :param df: Source table.
    :param features: Names of the columns to consider.
    :param exclude: Column names from ``features`` that must be left as they are.
        Defaults to the two change-point features.
    :return: New DataFrame with the standardised columns.
    """
    normalized = df.copy()
    untouched = set(exclude)
    for f in features:
        if f in untouched:
            continue
        mean = normalized[f].mean()
        std = normalized[f].std()
        # A constant (std == 0) or single-row (std is NaN) column cannot be scaled;
        # centre it only instead of filling it with NaN or inf
        if pd.isna(std) or std == 0:
            std = 1.0
        normalized[f] = (normalized[f] - mean) / std
    return normalized

In [None]:
# Every feature column produced in section 2 (time, frequency, MFCC and pitch-curve features)
all_features = ['rmse', 'zcr', 
            'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
            'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6',
            'mean_pitch', 'min_pitch', 'max_pitch', 'pitch_range', 'num_change_points', 'num_change_points_per_second']
# Subset holding only the pitch-curve features
features_pitch = ['mean_pitch', 'min_pitch', 'max_pitch', 'pitch_range', 'num_change_points', 'num_change_points_per_second']

# Standardise the merged table and show rows 130-149 (by position) as a spot check
norm_features_df = normalize_dataframe(features_df, all_features)
norm_features_df[130:150]

### 2.5 Comparing NNS (no subsequences)

In [None]:
# Work on a copy of the normalised feature table
df = norm_features_df.copy()

In [None]:
# Keep only the segments whose label is exactly 'nns' and display them
df_nns = df[df['label'] == 'nns']
df_nns

In [None]:
def compare_multiple_patterns_pitch(pitch_cents_list, time_list, tstep_list, sample_start_list, sample_end_list):
    """
    Overlay several pitch fragments on a single plot for visual comparison.

    The five arguments are parallel lists: entry ``i`` of each describes pattern ``i``.

    pitch_cents_list: List of pitch arrays in cents.
    time_list: List of the matching time arrays.
    tstep_list: List with the time step of each pitch/time array.
    sample_start_list: List of start times of the fragments to compare.
    sample_end_list: List of end times of the fragments to compare.
    """
    palette = ['blue', 'red', 'green', 'purple', 'orange', 'cyan', 'magenta', 'yellow', 'black', 'grey']

    plt.figure(figsize=(10, 5))

    for idx, track_cents in enumerate(pitch_cents_list):
        track_time = time_list[idx]
        step = tstep_list[idx]
        begin = sample_start_list[idx]
        finish = sample_end_list[idx]

        frames = slice(round(begin / step), round(finish / step))

        # Shift the time axis so that every fragment starts at t = 0
        relative_time = track_time[frames] - begin
        fragment_cents = track_cents[frames]

        # Colours are reused cyclically when there are more patterns than palette entries
        plt.plot(relative_time, fragment_cents, label=f'Pattern {idx+1}', color=palette[idx % len(palette)])

    plt.xlabel('Time (s)')
    plt.ylabel('Pitch (cents)')
    plt.legend()
    plt.title('Comparison of Pitch Patterns')
    plt.grid(True)
    plt.show()

In [None]:
# Plot up to five 'nns' segments on top of each other. Slicing with iloc[:5] (instead of
# indexing positions 0..4) keeps the cell working when fewer than five 'nns' rows exist.
sample_start_list = df_nns['start'].iloc[:5].tolist()
sample_end_list = df_nns['end'].iloc[:5].tolist()

# NOTE(review): every fragment is cut from the kj pitch track, which assumes the first
# 'nns' rows of df_nns belong to the kj recording - confirm against the data.
pitch_cents_list = [pitch_cents_kj] * len(sample_start_list)
time_list = [time_kj] * len(sample_start_list)
tstep_list = [timestep_kj] * len(sample_start_list)

compare_multiple_patterns_pitch(pitch_cents_list, time_list, tstep_list, sample_start_list, sample_end_list)

## 3. Modelling to predict NNS

In [None]:
df = norm_features_df.copy()
label = 'nns'

# Get IS_NNS
# The flag is computed row by row from the label itself. Do not join one-hot dummies back
# on the index: a join pairs rows by index label, and the merged table keeps each song's
# own row labels, so repeated labels can flag or duplicate the wrong rows.
filtered_df = df[df['label'] == label]
dummies_df = pd.get_dummies(filtered_df['label'], prefix='is')  # not joined; kept so the name stays defined
# fillna(False) also replaces any NaN feature values, as the models below need NaN-free input
result_df = df.fillna(False)
# to_numpy() assigns by position, so no index alignment is involved
result_df[f'is_{label}'] = (df['label'] == label).to_numpy()

# Get CONTAINS_NNS: true when the label string contains 'nns' anywhere
result_df[f'contains_{label}'] = result_df['label'].apply(lambda y: label in y)
# result_df['contains_nns'].value_counts() -- 61 values

# Convert the labels to a binary format
cols_to_convert = [f'is_{label}', f'contains_{label}']
result_df[cols_to_convert] = result_df[cols_to_convert].astype(int)
result_df[100:120]

#save the dataframe to csv
#result_df.to_csv('../data/processed/containsss.csv', index=False)

In [None]:
# FEATURES and TARGETS

# Same feature list as the one used for normalisation, repeated so this section can be
# run on its own
all_features = ['rmse', 'zcr', 
            'spectral_centroid', 'spectral_bandwidth', 'spectral_rolloff',
            'mfcc_1', 'mfcc_2', 'mfcc_3', 'mfcc_4', 'mfcc_5', 'mfcc_6', 
            'mean_pitch', 'min_pitch', 'max_pitch', 'pitch_range', 'num_change_points', 'num_change_points_per_second']

# targets[0] is the exact-match flag, targets[1] the substring-match flag
targets = [f'is_{label}', f'contains_{label}']

### 3.1 Get Training Data

In [None]:
from sklearn.model_selection import train_test_split

# Both splits below use the same feature matrix, test_size and random_state; they differ
# only in the target column, which is also the column used for stratification.

# Splitting is_nns
X_is_nns = result_df[all_features].values
y_is_nns = result_df[targets[0]].values
X_train_is_nns, X_test_is_nns, y_train_is_nns, y_test_is_nns = train_test_split(X_is_nns, y_is_nns, test_size=0.3, random_state=42, stratify=y_is_nns)

# Splitting contains_nns
X_contains_nns = result_df[all_features].values
y_contains_nns = result_df[targets[1]].values
X_train_contains_nns, X_test_contains_nns, y_train_contains_nns, y_test_contains_nns = train_test_split(X_contains_nns, y_contains_nns, test_size=0.3, random_state=42, stratify=y_contains_nns)

In [None]:
# Feature importances using GradientBoosting
from sklearn.ensemble import GradientBoostingClassifier

# For each target: fit a gradient-boosting model on the training split, print the features
# from most to least important, and keep those whose importance exceeds 0.02 (2 %).
# The kept names are stored in descending order of importance.

# IS_NNS
features_is_nns = []
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_is_nns, y_train_is_nns)
print("Feature importances for IS_NNS using Gradient Boosting ordered by most relevance\n")
for feature, importance in sorted(zip(all_features, clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(f"{feature.ljust(23)} \t\t\t{importance*100:.2f} %")
    if importance > 0.02:
        features_is_nns.append(feature)

# CONTAINS_NNS
features_contains_nns = []
clf = GradientBoostingClassifier(random_state=42)
clf.fit(X_train_contains_nns, y_train_contains_nns)
print("\nFeature importances for CONTAINS_NNS using Gradient Boosting ordered by most relevance\n")
for feature, importance in sorted(zip(all_features, clf.feature_importances_), key=lambda x: x[1], reverse=True):
    print(f"{feature.ljust(23)} \t\t\t{importance*100:.2f} %")
    if importance > 0.02:
        features_contains_nns.append(feature)

#features_is_nns
#features_contains_nns

### 3.2 Random Predictions to set a baseline

In [None]:
# Info about metrics:  https://www.evidentlyai.com/classification-metrics/accuracy-precision-recall
from sklearn.metrics import accuracy_score # Accuracy = (Correct predictions / Total predictions) * 100 === (TP + TN) / (TP + TN + FP + FN)
from sklearn.metrics import precision_score # Precision = (TP) / (TP + FP) the % of correct positive predictions over the total PREDICTED positives
from sklearn.metrics import recall_score # Recall = (TP) / (TP + FN) the % of correct positive predictions over the total ACTUAL positives

from sklearn.metrics import f1_score # F1 = 2 * (Precision * Recall) / (Precision + Recall) the harmonic mean of precision and recall
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

In [None]:
# Compute random predictions
# Baseline: for each target, score 100 rounds of uniformly random class guesses against
# the test split and average every metric over the rounds.
_baseline = {}

for t, _banner in zip(targets, ('RANDOM IS_NNS:', 'RANDOM:')):
    # SPLITTING (only the test labels are used by the baseline)
    X = result_df[all_features].values
    y = result_df[t].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=None)

    # RANDOM PREDICTION
    _rounds = {'accuracy': [], 'precision': [], 'recall': [], 'f1': []}
    unique_classes = np.unique(y_test)
    for _ in range(100):
        random_predictions = np.random.choice(unique_classes, len(y_test))
        _rounds['accuracy'].append(accuracy_score(y_true=y_test, y_pred=random_predictions))
        _rounds['precision'].append(precision_score(y_true=y_test, y_pred=random_predictions))
        _rounds['recall'].append(recall_score(y_true=y_test, y_pred=random_predictions))
        _rounds['f1'].append(f1_score(y_true=y_test, y_pred=random_predictions))

    # EVALUATION
    _averages = {metric: np.mean(scores) for metric, scores in _rounds.items()}
    _baseline[t] = (_rounds, _averages)

    print(f'{_banner} Test ACCURACY for target {t}: {_averages["accuracy"]}')
    print(f'{_banner} Test PRECISION for target {t}: {_averages["precision"]}')
    print(f'{_banner} Test RECALL for target {t}: {_averages["recall"]}')
    print(f'{_banner} Test F1 for target {t}: {_averages["f1"]}')
    print("-----------------------------------")

# Expose the per-target results under their individual module-level names
_is_rounds, _is_averages = _baseline[targets[0]]
is_accuracies = _is_rounds['accuracy']
is_precisions = _is_rounds['precision']
is_recalls = _is_rounds['recall']
is_f1s = _is_rounds['f1']
is_random_accuracy = _is_averages['accuracy']
is_random_precision = _is_averages['precision']
is_random_recall = _is_averages['recall']
is_random_f1 = _is_averages['f1']

_contains_rounds, _contains_averages = _baseline[targets[1]]
contains_accuracies = _contains_rounds['accuracy']
contains_precisions = _contains_rounds['precision']
contains_recalls = _contains_rounds['recall']
contains_f1s = _contains_rounds['f1']
contains_random_accuracy = _contains_averages['accuracy']
contains_random_precision = _contains_averages['precision']
contains_random_recall = _contains_averages['recall']
contains_random_f1 = _contains_averages['f1']

### 3.3 Train and evaluate a Model

#### 3.3.1 Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

# Grid-search a GradientBoostingClassifier for each target and evaluate the best model
# on a held-out test split.
params = {
    'n_estimators':[20,60,80],
    'learning_rate': [0.075, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth':[8,10],
}
results_table = []
_gbc_scores = {}

# Each target is trained on its own importance-selected feature subset
for t, _feature_subset in zip(targets, (features_is_nns, features_contains_nns)):
    # SPLITTING (not stratified in this variant)
    X = result_df[_feature_subset].values
    y = result_df[t].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, stratify=None)

    # TRAINING: 4-fold grid search, models ranked by F1
    clf = GradientBoostingClassifier(random_state=42)
    gs = GridSearchCV(clf, param_grid=params, scoring='f1', cv=4)
    gs.fit(X_train, y_train)

    # PREDICTION
    best = gs.best_estimator_
    y_pred = best.predict(X_test)

    # EVALUATION
    _scores = {
        'Accuracy': accuracy_score(y_true=y_test, y_pred=y_pred),
        'Precision': precision_score(y_true=y_test, y_pred=y_pred),
        'Recall': recall_score(y_true=y_test, y_pred=y_pred),
        'F1-Score': f1_score(y_true=y_test, y_pred=y_pred),
    }
    _gbc_scores[t] = _scores
    results_table.append({'Class': t, **_scores, 'CV Score': gs.best_score_})

    print(f'Best parameters for target {t}: {gs.best_params_}')
    print(f'Test ACCURACY for target {t}: {_scores["Accuracy"]}')
    print(f'Test PRECISION for target {t}: {_scores["Precision"]}')
    print(f'Test RECALL for target {t}: {_scores["Recall"]}')
    print(f'Test F1 for target {t}: {_scores["F1-Score"]}')
    print(f'CV Score for target {t}: {gs.best_score_}')

    print("Confusion matrix:")
    # scikit-learn layout: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot()
    plt.show()
    print("-----------------------------------")

# Expose the per-target metrics under their individual module-level names
is_f1_gbc = _gbc_scores[targets[0]]['F1-Score']
is_accuracy_gbc = _gbc_scores[targets[0]]['Accuracy']
is_precision_gbc = _gbc_scores[targets[0]]['Precision']
is_recall_gbc = _gbc_scores[targets[0]]['Recall']

contains_f1_gbc = _gbc_scores[targets[1]]['F1-Score']
contains_accuracy_gbc = _gbc_scores[targets[1]]['Accuracy']
contains_precision_gbc = _gbc_scores[targets[1]]['Precision']
contains_recall_gbc = _gbc_scores[targets[1]]['Recall']

# Print the results table
print("Analysis\n")
for result in results_table:
    print("Class: {}\nPrecision: {:.2f}\nRecall: {:.2f}\nF1-Score: {:.2f}\nCV Score: {:.2f}\n".format(result['Class'], result['Precision'], result['Recall'], result['F1-Score'], result['CV Score']))

#### 3.3.2 Gradient Boosting with KFolds

In [None]:
# Training with GradientBoostingClassifier using GridSearchCV with StratifiedKFold cross-validation.
from sklearn.model_selection import StratifiedKFold, train_test_split, GridSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import f1_score
# Same model family as the previous section, but with a stratified train/test split and
# an explicit 5-fold StratifiedKFold inside the grid search.
params = {
    'n_estimators':[20,60,80],
    'learning_rate': [0.075, 0.1, 0.2, 0.3, 0.4, 0.5],
    'max_depth':[8,10],
}
test_size = 0.3
best_results = {}
results_table = []
_gbc_kf_scores = {}

print ('Training with GradientBoostingClassifier using GridSearchCV with StratifiedKFold cross-validation.')
# Each target is trained on its own importance-selected feature subset
for t, _feature_subset in zip(targets, (features_is_nns, features_contains_nns)):
    # SPLITTING (stratified on the target)
    X = result_df[_feature_subset].values
    y = result_df[t].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # TRAINING (fixed random_state so the same model can always be reproduced)
    clf = GradientBoostingClassifier(random_state=42)
    skf = StratifiedKFold(n_splits=5)
    gs = GridSearchCV(clf, param_grid=params, scoring='f1', cv=skf)
    gs.fit(X_train, y_train)

    # PREDICTION
    best = gs.best_estimator_
    y_pred = best.predict(X_test)
    print('_')

    # EVALUATION
    _scores = {
        'Accuracy': accuracy_score(y_true=y_test, y_pred=y_pred),
        'Precision': precision_score(y_true=y_test, y_pred=y_pred),
        'Recall': recall_score(y_true=y_test, y_pred=y_pred),
        'F1-Score': f1_score(y_true=y_test, y_pred=y_pred),
        'Support': len(y_test),
    }
    _gbc_kf_scores[t] = _scores

    # Canned commentary: the wording depends only on whether precision is exactly 1.0
    if _scores['Precision'] == 1.0:
        analysis = "The model achieves perfect precision, indicating high confidence in identifying {}. However, it misses about {} of actual {} segments (false negatives).".format(t, 1-_scores['Recall'], t)
    else:
        analysis = "The model shows good balance between precision and recall for detecting {}. There's room for improvement in capturing more instances without compromising precision.".format(t)

    results_table.append({'Class': t, **_scores, 'Analysis': analysis, 'cv_score': gs.best_score_})

    print(f'Best parameters for target {t}: {gs.best_params_}')
    print(f'Test ACCURACY for target {t}: {_scores["Accuracy"]}')
    print(f'Test PRECISION for target {t}: {_scores["Precision"]}')
    print(f'Test RECALL for target {t}: {_scores["Recall"]}')
    print(f'Test F1 for target {t}: {_scores["F1-Score"]}')
    print(f'Cross-validation score for target {t}: {gs.best_score_}')
    print("Confusion matrix:")
    # scikit-learn layout: rows are true classes, columns are predicted classes
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot()
    plt.show()
    print("-----------------------------------")

# Expose the per-target metrics under their individual module-level names
is_f1_gbc_kf = _gbc_kf_scores[targets[0]]['F1-Score']
is_accuracy_gbc_kf = _gbc_kf_scores[targets[0]]['Accuracy']
is_precision_gbc_kf = _gbc_kf_scores[targets[0]]['Precision']
is_recall_gbc_kf = _gbc_kf_scores[targets[0]]['Recall']
is_support_gbc_kf = _gbc_kf_scores[targets[0]]['Support']

contains_f1_gbc_kf = _gbc_kf_scores[targets[1]]['F1-Score']
contains_accuracy_gbc_kf = _gbc_kf_scores[targets[1]]['Accuracy']
contains_precision_gbc_kf = _gbc_kf_scores[targets[1]]['Precision']
contains_recall_gbc_kf = _gbc_kf_scores[targets[1]]['Recall']
contains_support_gbc_kf = _gbc_kf_scores[targets[1]]['Support']

# Print the results table
print("Analysis\n")
for result in results_table:
    print("Class: {}\nPrecision: {:.2f}\nRecall: {:.2f}\nF1-Score: {:.2f}\nCV Score: {:.2f}\nSupport: {}\nAnalysis: {}\n".format(result['Class'], result['Precision'], result['Recall'], result['F1-Score'], result['cv_score'], result['Support'], result['Analysis']))

#### 3.3.3 Random Forest

In [None]:
import numpy as np
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import f1_score, recall_score, precision_score, accuracy_score
# Hyperparameter search space for the Random Forest classifier.
params_rf = dict(
    n_estimators=[20, 120],
    max_depth=[8, 10],
    min_samples_split=[8],
    min_samples_leaf=[4],
    bootstrap=[True, False],
)

# Fraction of the samples held out for testing.
test_size = 0.3

# Accumulators filled while iterating over the targets:
# per-target best metrics/params, and the rows of the final report.
best_results_rf, results_table = {}, []

print('Training using Random Forest')
# Train, tune and evaluate one Random Forest per target.
#
# Both targets go through exactly the same steps (stratified split, SMOTE on
# the training split, randomized hyperparameter search, evaluation on the
# untouched test split); they only differ in the feature columns used and in
# the names of the module-level metric variables (`is_*_rf` / `contains_*_rf`)
# that are kept for later comparison.
#
# NOTE(review): SMOTE is applied to the whole training split *before*
# RandomizedSearchCV builds its 5 folds, so the validation folds contain
# synthetic samples and `cv_score` is likely optimistic. Consider wrapping
# SMOTE and the classifier in an `imblearn.pipeline.Pipeline` so that the
# oversampling happens inside each fold. The test metrics are not affected,
# because the test split is never resampled.
for t in targets:
    if t == targets[0]:  # IS_NNS
        feature_columns = features_is_nns
    elif t == targets[1]:  # CONTAINS_NNS
        feature_columns = features_contains_nns
    else:
        # Only the first two targets are modelled here.
        continue

    # SPLITTING (stratified on the target)
    X = result_df[feature_columns].values
    y = result_df[t].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42, stratify=y)

    # Apply SMOTE to the training split only
    sm = SMOTE(random_state=42)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # TRAINING
    clf_rf = RandomForestClassifier(random_state=42)
    random_search = RandomizedSearchCV(clf_rf, param_distributions=params_rf, n_iter=4, cv=5, verbose=0, random_state=42)
    random_search.fit(X_train_res, y_train_res)

    # PREDICTION
    best_rf = random_search.best_estimator_
    y_pred_rf = best_rf.predict(X_test)

    # EVALUATION (on the original, non-resampled test split)
    f1_rf = f1_score(y_true=y_test, y_pred=y_pred_rf)
    accuracy_rf = accuracy_score(y_true=y_test, y_pred=y_pred_rf)
    precision_rf = precision_score(y_true=y_test, y_pred=y_pred_rf)
    recall_rf = recall_score(y_true=y_test, y_pred=y_pred_rf)
    support_rf = len(y_test)  # total number of test samples, not per-class support

    # Keep the per-target metric names available at module level.
    if t == targets[0]:
        is_f1_rf, is_accuracy_rf, is_precision_rf = f1_rf, accuracy_rf, precision_rf
        is_recall_rf, is_support_rf = recall_rf, support_rf
    else:
        contains_f1_rf, contains_accuracy_rf, contains_precision_rf = f1_rf, accuracy_rf, precision_rf
        contains_recall_rf, contains_support_rf = recall_rf, support_rf

    # Analysis is subjective and should be written based on the metrics.
    # The miss rate (1 - recall) is rendered as a percentage so that float
    # artefacts such as 0.30000000000000004 do not end up in the report.
    if precision_rf == 1.0:
        analysis = "The model achieves perfect precision, indicating high confidence in identifying {}. However, it misses about {:.1%} of actual {} segments (false negatives).".format(t, 1 - recall_rf, t)
    else:
        analysis = "The model shows good balance between precision and recall for detecting {}. There's room for improvement in capturing more instances without compromising precision.".format(t)

    results_table.append({
        'Class': t,
        'Accuracy': accuracy_rf,
        'Precision': precision_rf,
        'Recall': recall_rf,
        'F1-Score': f1_rf,
        'cv_score': random_search.best_score_,
        'Support': support_rf,
        'Analysis': analysis
    })
    print("-----------------------------------")
    best_results_rf[t] = {
        'f1': f1_rf,
        'recall': recall_rf,
        'precision': precision_rf,
        'accuracy': accuracy_rf,
        'params': random_search.best_params_,
        'cv_score': random_search.best_score_,
        'test_size': test_size
    }

    print(f'\nBest results for target {t} using Random Forest:')
    print(f"Test F1: {best_results_rf[t]['f1']}")
    print(f"Test Recall: {best_results_rf[t]['recall']}")
    print(f"Test Precision: {best_results_rf[t]['precision']}")
    print(f"Test Accuracy: {best_results_rf[t]['accuracy']}")
    print(f"Best Params: {best_results_rf[t]['params']}")
    print(f"CV Score: {best_results_rf[t]['cv_score']}")
    print('_')
    # Plot the confusion matrix (rows: true labels, columns: predicted labels)
    cm = confusion_matrix(y_true=y_test, y_pred=y_pred_rf)
    cm_display = ConfusionMatrixDisplay(confusion_matrix=cm)
    cm_display.plot()
    plt.show()

# Report every entry collected in results_table, one formatted paragraph per class.
# The fields are listed in the order in which the template consumes them.
report_fields = ('Class', 'Precision', 'Recall', 'F1-Score', 'cv_score', 'Support', 'Analysis')
report_template = "Class: {}\nPrecision: {:.2f}\nRecall: {:.2f}\nF1-Score: {:.2f}\nCV Score: {:.2f}\nSupport: {}\nAnalysis: {}\n"
print("Analysis\n")
for result in results_table:
    print(report_template.format(*[result[field] for field in report_fields]))

## 4. Conclusions: Comparison against chance (is_nns and contains_nns)

In [None]:
# Side-by-side summary of the test metrics (accuracy, precision, recall, F1)
# of the three models and of the random-prediction baseline, printed as one
# tab-aligned table per target.

def _metrics_row(label, accuracy, precision, recall, f1):
    """Return one table row: `label` (including its trailing tabs) followed by the four metrics with two decimals."""
    return "{}{:.2f}\t\t{:.2f}\t\t{:.2f}\t\t{:.2f}".format(label, accuracy, precision, recall, f1)


_table_rule = "-------------------------------------------------------------------------------------"
_table_header = "Model\t\t\tAccuracy\tPrecision\tRecall\t\tF1-Score"

# --- IS_NNS ---
print("Comparison of the metrics of IS_NNS for the three models and random predictions")
print(_table_rule)
print(_table_header)
print(_metrics_row("GradientBoosting\t", is_accuracy_gbc, is_precision_gbc, is_recall_gbc, is_f1_gbc))
print(_metrics_row("GradientBoosting-KFolds\t", is_accuracy_gbc_kf, is_precision_gbc_kf, is_recall_gbc_kf, is_f1_gbc_kf))
print(_metrics_row("RandomForest\t\t", is_accuracy_rf, is_precision_rf, is_recall_rf, is_f1_rf))
print(_table_rule)
print(_metrics_row("Random Predictions\t", is_random_accuracy, is_random_precision, is_random_recall, is_random_f1))
print(_table_rule)

print("\n")

# --- CONTAINS_NNS ---
print("Comparison of the metrics of CONTAINS_NNS for the three models and random predictions")
print(_table_rule)
print(_table_header)
print(_metrics_row("GradientBoosting\t", contains_accuracy_gbc, contains_precision_gbc, contains_recall_gbc, contains_f1_gbc))
print(_metrics_row("GradientBoosting-KFolds\t", contains_accuracy_gbc_kf, contains_precision_gbc_kf, contains_recall_gbc_kf, contains_f1_gbc_kf))
print(_metrics_row("RandomForest\t\t", contains_accuracy_rf, contains_precision_rf, contains_recall_rf, contains_f1_rf))
print(_table_rule)
print(_metrics_row("Random Predictions\t", contains_random_accuracy, contains_random_precision, contains_random_recall, contains_random_f1))
print(_table_rule)

# TODO: Export the results and the confusion matrices to the "results" folder?