In [1]:
import os
# Move one level up from the notebook's folder (presumably to the project root) so
# the project imports and the relative "data/..." paths used below resolve.
# A bare relative chdir is not idempotent: every re-run of this cell would climb one
# more directory. The sentinel makes the move happen only once per kernel session.
if "_NOTEBOOK_START_DIR" not in globals():
    _NOTEBOOK_START_DIR = os.getcwd()  # remembered so the original folder can be inspected
    os.chdir("../")

In [2]:
import seaborn as sns
import pandas as pd
import numpy as np
import torch
import matplotlib.pyplot as plt
import torchaudio.functional as F

from sklearn.metrics import (
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
    RocCurveDisplay,
    f1_score,
    precision_score,
    recall_score,
    average_precision_score,
)

In [3]:
from src.preprocess.feature_extractor import FeatureExtractor
from src.preprocess.target_processor import TargetProcessor
from clients import gdrive
from src.utils import save_pickle
import constants

In [4]:
# Dark grey shared by figure text, axis labels and tick labels.
TEXT_COLOR = '#313131'
# Plotly colors
LINE_COLORS = [
    '#636EFA', '#EF553B', '#00CC96', '#AB63FA', '#FFA15A',
    '#19D3F3', '#FF6692', '#B6E880', '#FF97FF', '#FECB52',
]

# Figure-level matplotlib overrides applied on top of the seaborn style.
theme_rc = {
    'figure.figsize': (6, 4),
    'figure.dpi': 100,
    'figure.facecolor': 'w',
    'legend.facecolor': 'w',
    'font.family': 'Microsoft Sans Serif',
}
# Every text-like element gets the same colour.
text_colored_keys = ('text.color', 'axes.labelcolor', 'xtick.color', 'ytick.color')
theme_rc.update(dict.fromkeys(text_colored_keys, TEXT_COLOR))

sns.set(style='darkgrid', rc=theme_rc)

# Replace seaborn's default colour cycle with the custom one.
sns.set_palette(sns.color_palette(LINE_COLORS))

---

# Build dataset

Extract features and targets for every speaker, save them as pickle files, and upload them to Google Drive.

In [5]:
# Speaker identifiers ("<number>_<name>") assigned to the training set.
# The numbering has gaps; only the identifiers listed here are processed.
TRAIN_SPEAKERS = [
    "1_ines", "2_helena", "3_ignasi", "4_sonia", "5_david", "6_gloria",
    "8_diana", "9_daniel", "10_oriol",
    "13_tomy", "14_maria", "15_arancha", "16_tropicfeel", "17_robert",
    # "18_lourdes", "19_pablo",  <- kept disabled, as in the original list
    "20_anna", "21_eirene", "22_patricia", "23_carlos",
    "25_daniel", "26_angel", "27_angela", "28_joachim", "29_alfredo",
]

In [6]:
# Speaker identifiers ("<number>_<name>") assigned to the validation set.
VALID_SPEAKERS = [
    "30_segolene", "31_andrew", "32_alexia", "33_bernat",
    "34_jon", "35_juanjo", "36_clara", "37_sandra",
]

In [7]:
# Every speaker to process: the training speakers followed by the validation speakers.
ALL_SPEAKERS = [*TRAIN_SPEAKERS, *VALID_SPEAKERS]

In [8]:
# Instantiate the project's preprocessing helpers with their default arguments.
# Both objects are reused for every speaker in the processing loop below.
feature_extractor = FeatureExtractor()  # used below as process_file(audio_path)
target_processor = TargetProcessor()  # used below as process(annotation_path, size=...)

In [10]:
%%time
# Build the per-speaker dataset: extract features and targets from each speaker's
# audio/annotation pair, save both as pickles locally, then upload them to gDrive.

# Local output folders, named once so the makedirs calls and the per-speaker paths
# cannot drift apart (plain strings: no f-prefix needed without placeholders).
features_dir = "data/processed_data/features"
targets_dir = "data/processed_data/targets"
os.makedirs(features_dir, exist_ok=True)
os.makedirs(targets_dir, exist_ok=True)

# Destination gDrive folders. They do not depend on the speaker, so they are
# defined once here instead of being re-assigned on every iteration.
features_drive_folder_id = "12KIT85SR25p33TEVa3jJU52DItvV7QqP"
targets_drive_folder_id = "1aY77GI-o8GbPoi1MO0RuPgl2JMgXzBx2"

# NOTE(review): these empty tensors are never read or updated in this cell;
# presumably a later cell uses them - confirm before removing.
x_valid = torch.Tensor([])
y_valid = torch.Tensor([])

for speaker in ALL_SPEAKERS:
    # Extract features and targets
    audio_path = f"data/audio/{speaker}.wav"
    annotation_path = f"data/labels/parsed_annotations/{speaker}.csv"
    current_x = feature_extractor.process_file(audio_path)
    # The target length is taken from the second dimension of the features
    # (assumed to be the frame/time axis - TODO confirm).
    current_y = target_processor.process(annotation_path, size=current_x.shape[1])

    # Save as pickle files
    features_pickle_path = f"{features_dir}/{speaker}.pickle"
    targets_pickle_path = f"{targets_dir}/{speaker}.pickle"
    save_pickle(current_x, features_pickle_path)
    save_pickle(current_y, targets_pickle_path)

    # Upload pickles to gDrive
    gdrive.upload_file(features_pickle_path, features_drive_folder_id)
    gdrive.upload_file(targets_pickle_path, targets_drive_folder_id)

---