In [1]:
import subprocess
import os
import glob
import pathlib
from pathlib import Path
import numpy as np
import pandas as pd
import time
from datetime import timedelta
import librosa

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.svm import SVC

In [4]:
from emotion import root_dir, module_dir
# ffmpeg binary
FFMPEG = "/usr/bin/ffmpeg"

# text directory
TEXT_DIR = Path(root_dir / "data/raw/text")
LABELS_DIR = Path(root_dir / "data/raw/labels")

# audio directories
AUDIO_DIR = Path(root_dir / "data/raw/audio")
AUDIO_CLIPS_DIR = Path(root_dir / "data/interim/audio")
AUDIO_FEATURES_DIR = Path(root_dir / "data/processed/audio")

# Load ratings, keep sentiment (positive, negative, none)

## Aggregate : 1 if 2+ raters agree, keep labels having audio file

In [38]:
def load_all_ratings(labels_dir, split_id_clip = False):
    
    label_files = glob.glob(f"{labels_dir}/*.csv")

    df_labels = []

    for filename in label_files:
        df = pd.read_csv(filename, index_col=None, header=0)
        df_labels.append(df)


    df_labels = pd.concat(df_labels, axis=0, ignore_index=True)

    # keep only relevant columns
    label_cols = ['Input.VIDEO_ID', 'Input.CLIP',
              'Answer.anger', 'Answer.disgust',
              'Answer.fear', 'Answer.happiness',
              'Answer.sadness', 'Answer.surprise',
              'Answer.sentiment']

    # rename columns to shorter names
    label_new_cols = ['id', 'clip',
                      'anger', 'disgust',
                      'fear', 'happiness',
                      'sadness', 'surprise',
                      'sentiment']
    df_labels = df_labels[label_cols]
    df_labels.columns = label_new_cols

    # drop row all nan
    isna_idx = \
        df_labels.index[df_labels[df_labels.columns[2:]].isna().all(axis=1)]
    df_labels.drop(index=isna_idx, inplace=True)
    # replace remaining nan's with 0
    df_labels = df_labels.replace({np.nan : 0})
    # convert ratings to int
    df_labels[label_new_cols[2:]] = df_labels[label_new_cols[2:]].astype('Int64')
    # set emotions to 0 or 1
    df_labels[label_new_cols[2:-1]] = \
        df_labels[label_new_cols[2:-1]].applymap(lambda x : 1 if x > 0 else 0)

    # if sentiment > 0 convert to positive = 1, elif < 0 convert to negative = 1
    #   if none of emotion or sentiment == 1, set none to 1
    df_labels['positive'] = \
        df_labels['sentiment'].map(lambda x : 1 if x > 0 else 0)
    df_labels['negative'] = \
        df_labels['sentiment'].map(lambda x : 1 if x < 0 else 0)

    # drop sentiment column (now in positive/negative)
    df_labels.drop(columns='sentiment', inplace=True)
    
    df_labels['none'] = 0
    none_idx = \
        df_labels[df_labels[df_labels.columns[2:]].sum(axis=1) == 0].index

    df_labels.loc[none_idx,'none'] = 1

    label_new_cols = ['id', 'clip',
                      'anger', 'disgust',
                      'fear', 'happiness',
                      'sadness', 'surprise',
                      'sentiment']
    
    # remove '/' from id's
    df_labels['id'] = df_labels['id'].map(lambda x : str(x).split("/")[-1])
    if not split_id_clip:
        df_labels['id'] = df_labels['id'] + '_' + df_labels['clip'].astype(str)
        df_labels.drop(columns = 'clip', inplace = True)
        
        label_new_cols = ['id',
                          'none', 'positive', 'negative',
                          'anger', 'disgust',
                          'fear', 'happiness',
                          'sadness', 'surprise'
                         ]
    else:
        label_new_cols = ['id', 'clip',
                          'none', 'positive', 'negative',
                          'anger', 'disgust',
                          'fear', 'happiness',
                          'sadness', 'surprise'
                         ]
        
    df_labels = df_labels[label_new_cols]
    return df_labels

In [45]:
def aggregate_ratings(ratings):
    '''
        aggregate labels to 1 if 2+ ratings aggree else 0
        ratings : pd.DataFrame containing all ratings (3 per example)
        returns : pd.DataFrame containing aggregated labels
    '''
    grp_labels = ratings.groupby('id').sum()
    # label to 1 if > 1 else 0
    grp_labels = grp_labels.applymap(lambda x : 1 if x > 1 else 0)
    # drop rows where all == 0
    idx = grp_labels[grp_labels.sum(axis =1) == 0].index
    grp_labels.drop(index = idx, inplace=True)
    print(f"{len(idx)} rows dropped")
    print(f"{grp_labels.shape[0]} grouped labels")
    return grp_labels

# Load sentiment labels

In [46]:
def get_sentiment_labels(labels_dir, audio_dir):
    '''
        get sentiment labels (none, positive, negative) having audio clips
        labels_dir : directory containing labels .csv files (ratings)
        audio_dir  : directory containing segmented audio clips
        returns    : pd.DataFrame containing sentiment labels having audio clips
    '''
    all_ratings = load_all_ratings(labels_dir)
    # keep ratings having positive, negative or none
    mask_positive = all_ratings['positive'] == 1
    mask_negative = all_ratings['negative'] == 1
    mask_none = all_ratings['none'] == 1
    sentiment_ratings = \
        all_ratings[(mask_positive) | (mask_negative) | (mask_none)]\
            [['id','none','positive','negative']].copy()
    # aggregate ratings
    sentiment_labels = aggregate_ratings(sentiment_ratings)
    # get audio clip names
    audio_clip_names = glob.glob(f"{audio_dir}/*.wav")
    audio_clip_names = \
        [cn.rsplit('.', maxsplit = 1)[0].rsplit('/', maxsplit = 1)[-1] for \
             cn in audio_clip_names]
    clips_no_audio = []
    for idx in sentiment_labels.index:
        if idx not in audio_clip_names:
            clips_no_audio.append(idx)
    sentiment_labels.drop(index=clips_no_audio, inplace=True)
    print(f"{sentiment_labels.shape[0]} labels for sentiment with audio")
    return sentiment_labels

In [None]:
def get_labeled_clips(labels_dir):
    '''
        get id & clip number from labels
        labels_dir : directory containing raw labels
        returns   : dict of ids with correponding labeled clips
                    {'id' : [clip1, clip2, ... clip_n]}
    '''
    labeled_clips = {}
    labels = load_all_ratings(labels_dir, split_id_clip=True)

    labels = labels.drop_duplicates(subset=['id','clip'])

    uniq_ids = sorted(list(labels['id'].unique()))
    for i in uniq_ids:
        labeled_clips[i] = \
            sorted(
                labels[labels['id'] == i]['clip'].astype(int).to_list()
            )
    del labels
    return labeled_clips

In [None]:
def get_clips_info_from_text(text_dir, get_text=False):
    '''
        get audio information from text files to split audio into clips
        text_dir         : directory containing texts with audio info
        get_text         : add text column to info DataFrame
        returns: DataFrame
        - id (file id)
        - clip (clip number)
        - start_time (audio file segment start time)
        - end_time   (audio file segment end time)
        - len (audio clip len)
    '''
    text_files = glob.glob(f"{text_dir}/*.txt")
    dfs = []
    for file in text_files:
        with open(file, 'r') as text_file:
            lines = text_file.readlines()
            clips_info = []
            for line in lines:
                line_info = {}
                split_line = line.split('___')
                line_info['id'] = split_line[0]
                line_info['clip'] = int(split_line[1])
                line_info['start_time'] = round(abs(float(split_line[2])) ,3)
                line_info['end_time'] = round(float(split_line[3]), 3)
                line_info['len'] = \
                    line_info['end_time'] - line_info['start_time']

                if get_text:
                    line_info['text'] = split_line[4]
                
                clips_info.append(line_info)
        dfs.append(pd.DataFrame(clips_info))
    dfs = pd.concat(dfs, axis=0, ignore_index=True)
    return dfs

In [None]:
def get_labeled_clips_info(labels_dir, text_dir,
        get_text=False, show_progress=True):
    '''
        labels_dir       : directory containing labels
        text_dir         : directory containing text files and clip info
        get_text         : add text column to info DataFrame
        returns          : DataFrame containing:
                           id         : file id
                           clip       : audio clip/line number
                           start_time : audio clip start time
                           end time   : audio clip end time
                           len        : audio clip len
                           text       : text if get_text == True
                    clips_not_in_text (labeled_clips)
    '''

    stime = time()
    print("Recherche des ids et no. de clips annotés   ...")
    labeled_clips = get_labeled_clips(labels_dir)
    print("Extraction des infos audio (début/fin)      ...")
    clips_info = get_clips_info_from_text(text_dir, get_text)

    labeled_clips_info = []
    clips_not_in_text = []
    print("Selection de l'info audio des clips annotés ...")
    num_files = len(labeled_clips)
    files_processed = 0
    for i, clips in labeled_clips.items():
        mask_id = clips_info['id'] == i
        for clip in clips:
            mask_clip = clips_info['clip'] == clip
            clip_info = {}
            clip_info['id'] = i
            clip_info['clip'] = clip
            try:
                # if text with id contains clip, add info to clip_info
                len(clips_info[(mask_id) & (mask_clip)].index) == 1
                clip_info['start_time'] = \
                        clips_info[(mask_id) & (mask_clip)]['start_time'].values[0]
                clip_info['end_time'] = \
                        clips_info[(mask_id) & (mask_clip)]['end_time'].values[0]
                clip_info['len'] = \
                        clips_info[(mask_id) & (mask_clip)]['len'].values[0]
                if get_text:
                        clip_info['text'] = \
                            clips_info[(mask_id) & (mask_clip)]['text'].values[0]
                labeled_clips_info.append(clip_info)
            except:
                # if no clip in text add id and clip to errors
                clips_not_in_text.append(clip_info)

        if show_progress:
            files_processed += 1
            if files_processed % 10 == 0:
                print('.', end = '')
                if files_processed % 500 == 0:
                    print(f" {files_processed} de {num_files} fichiers")
    print("\nTemps d'exécution: ",
            f"{timedelta(seconds = round(time() - stime))} (h:mm:ss)")
    clips_not_in_text = pd.DataFrame(clips_not_in_text)
    labeled_clips_info = pd.DataFrame(labeled_clips_info)
    print(f"\n{labeled_clips_info.shape[0]} clips annotés")
    print(clips_not_in_text.shape[0],
        "clips annotés sans info audio")
    return labeled_clips_info, clips_not_in_text

In [48]:
sentiment_labels = get_labels_sentiment(labels_dir, audio_out_dir)
print("")
print(sentiment_labels.head())

5062 rows dropped
18319 grouped labels
18063 labels for sentiment with audio

                none  positive  negative
id                                      
--qXJuDtHPw_5      0         1         0
-3g5yACwYnA_10     0         1         0
-3g5yACwYnA_13     0         1         0
-3g5yACwYnA_3      1         0         0
-3g5yACwYnA_4      1         0         0


# Extract audio features (median on 4 seconds)

In [50]:
def extract_features_median(audio_file, len_secs=3,
                           n_mfccs=40, rms=False, zrc=False):

    import librosa
    samples, srate = \
        librosa.load(audio_file, sr=None)
    if srate != 16000:
        samples, srate = \
            librosa.load(audio_file, sr=16000)
    
    if len_secs != 'full':
        # keep end of samples of len_secs
        num_samples = srate * len_secs
        if num_samples > len(samples):
            samples = samples[-num_samples:]
    
    audio_features = None
    feature_names = []
    if rms:
        feature_names += ['rms']
        audio_features = \
            np.median(librosa.feature.rms(y=samples).T, axis=0)
    if zrc:
        feature_names += ['zrc']
        zrc = \
            np.median(librosa.feature.zero_crossing_rate(y=samples).T,
                    axis=0)
        if isinstance(audio_features, np.ndarray):
            audio_features = np.append(audio_features, zrc)
        else:
            audio_features = zrc
    feature_names += ['mfcc_' + str(x) for x in range(1, n_mfccs + 1)]
    S = librosa.feature.melspectrogram(y=samples,
                                   sr=srate, n_mels=64, #128,
                                   fmax=8000, hop_length=512)
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(S), n_mfcc=n_mfccs)
    mfccs = np.median(mfccs, axis=1)
    if isinstance(audio_features, np.ndarray):
        audio_features = np.append(audio_features, mfccs)
    else:
        audio_features = mfccs
    return audio_features, feature_names

In [240]:
def extract_features_mean(audio_file, len_secs=3,
                           n_mfccs=40, rms=False, zrc=False):

    import librosa
    samples, srate = \
        librosa.load(audio_file, sr=None)
    if srate != 16000:
        samples, srate = \
            librosa.load(audio_file, sr=16000)
    
    if len_secs != 'full':
        # keep end of samples of len_secs
        num_samples = srate * len_secs
        if num_samples > len(samples):
            samples = samples[-num_samples:]
    
    audio_features = None
    feature_names = []
    if rms:
        feature_names += ['rms']
        audio_features = \
            np.mean(librosa.feature.rms(y=samples).T, axis=0)
    if zrc:
        feature_names += ['zrc']
        zrc = \
            np.mean(librosa.feature.zero_crossing_rate(y=samples).T,
                    axis=0)
        if isinstance(audio_features, np.ndarray):
            audio_features = np.append(audio_features, zrc)
        else:
            audio_features = zrc
    
    feature_names += ['mfcc_' + str(x) for x in range(1, n_mfccs + 1)]
    S = librosa.feature.melspectrogram(y=samples,
                                   sr=srate, n_mels=64, #128,
                                   fmax=8000, hop_length=512)
    mfccs = librosa.feature.mfcc(S=librosa.power_to_db(S), n_mfcc=n_mfccs)
    mfccs = np.mean(mfccs, axis=1)
    if isinstance(audio_features, np.ndarray):
        audio_features = np.append(audio_features, mfccs)
    else:
        audio_features = mfccs
    return audio_features, feature_names

In [51]:
def extract_features_from_dir(audio_dir, file_names=None,
                                    agg='median', len_secs='full',
                                    n_mfccs=20, rms=False, zrc=False,
                                    show_progress=True):
    not_files = []
    audio_features = {}
    stime = time.time()
    cnt = 0
    if file_names is None:
        file_names = glob.glob(f"{audio_dir}/*.wav")
    else:
        file_names = [audio_dir + '/' + f + '.wav' for f in file_names]
    num_files = len(file_names)

    for i, f in enumerate(file_names):
        if agg == 'median':
            clip_features, fnames = \
                extract_features_median(f, len_secs, n_mfccs, rms, zrc)
                # # extract_audio_features_median(audio_dir + '/' + f,
        else:
            clip_features, fnames = \
                extract_features_mean(f, len_secs, n_mfccs, rms, zrc)
                # extract_audio_features_mean(audio_dir + '/' + f,
        # audio_features[f.rsplit('.', maxsplit = 1)[0]] = clip_features
        # check new split !!!!!!
        clip_id = f.rsplit('.', maxsplit = 1)[0].rsplit('/', maxsplit = 1)[-1]
        audio_features[clip_id] = clip_features
        # audio_features[f.rsplit('.', maxsplit = 1)[0]] = clip_features
        # else:
        #    not_files.append(f)
            # print(f"{f} not in audio_dir")

        if show_progress:
            if i % 10 == 0 and i != 0:
                print('.', end = '')
                if i % 500 == 0:
                    print(f" {i} de {num_files} fichiers")

    audio_features = pd.DataFrame(audio_features).T
    audio_features.columns = fnames #['mfcc_' + str(x) for x in range(1, n_mfccs + 1)]
    etime = time.time() # ptime()
    proc_time = timedelta(seconds = round(etime - stime))
    print(f"\n\n{audio_features.shape[0]} fichiers extraits: {proc_time} (h:mm:ss)")
    return audio_features

In [52]:
mfcc40_3sec_median = \
    extract_features_from_dir(audio_out_dir, file_names=None,
                                    agg='median', len_secs=7,
                                    n_mfccs=40, rms=False, zrc=False,
                                    show_progress=True)

.................................................. 500 de 23259 fichiers
.................................................. 1000 de 23259 fichiers
.................................................. 1500 de 23259 fichiers
.................................................. 2000 de 23259 fichiers
.................................................. 2500 de 23259 fichiers
.................................................. 3000 de 23259 fichiers
.................................................. 3500 de 23259 fichiers
.................................................. 4000 de 23259 fichiers
.................................................. 4500 de 23259 fichiers
.................................................. 5000 de 23259 fichiers
.................................................. 5500 de 23259 fichiers
.................................................. 6000 de 23259 fichiers
.................................................. 6500 de 23259 fichiers
.......................................

  return f(*args, **kwargs)


...................................... 18500 de 23259 fichiers
.................................................. 19000 de 23259 fichiers
.................................................. 19500 de 23259 fichiers
.................................................. 20000 de 23259 fichiers
.................................................. 20500 de 23259 fichiers
.................................................. 21000 de 23259 fichiers
.................................................. 21500 de 23259 fichiers
.................................................. 22000 de 23259 fichiers
.................................................. 22500 de 23259 fichiers
.................................................. 23000 de 23259 fichiers
.........................

23259 fichiers extraits: 0:04:32 (h:mm:ss)


In [53]:
mfcc40_7sec_median.to_csv("./mfcc40_7sec_median.csv", header=True, index=True)

In [182]:
mfcc40_3sec_median = pd.read_csv("./mfcc40_median_3sec.csv", index_col=0)
mfcc40_3sec_median.head()

Unnamed: 0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,mfcc_9,mfcc_10,...,mfcc_31,mfcc_32,mfcc_33,mfcc_34,mfcc_35,mfcc_36,mfcc_37,mfcc_38,mfcc_39,mfcc_40
MPRqaQqrd9Y_7,-238.4778,116.48294,-16.343117,13.451738,-20.161373,-10.331965,-14.03253,3.187353,-8.78322,-4.770639,...,0.175206,-0.482796,1.425903,1.884764,0.493,0.52995,1.037738,1.994728,-0.325253,-1.274733
UlTJmndbGHM_4,-53.42596,94.49222,-19.986244,47.58835,-21.56705,40.110077,-9.876991,21.592289,-3.903989,11.75324,...,-1.38776,-0.199772,-1.833252,-1.971496,0.220655,-1.211483,1.755036,-1.856162,2.457587,-0.118698
hjBQmIWiWgw_2,-247.95389,101.13844,-9.910642,7.709038,-13.915911,10.742346,-22.425343,-8.783762,-15.74935,-6.562918,...,-3.128196,-3.29488,-1.202057,-2.194178,-1.391237,-2.352279,-0.9277,-2.60297,-1.094968,-1.410288
9zWeMrfr-l0_0,-227.37321,89.07364,-9.827379,26.472038,2.726988,1.796814,-16.57557,8.506326,-8.254383,-6.067927,...,-0.133548,-0.343467,-0.75708,-0.810546,-0.412276,-1.497693,-0.009953,-1.921595,-1.485501,-1.057941
31197_2,-219.16013,122.94565,-42.745243,51.231842,8.988421,-1.754196,27.00664,-5.793803,10.785469,6.574906,...,-3.564412,-0.184192,-1.412041,-0.797373,0.196932,-2.414317,0.634682,-0.541945,-0.924508,-0.020378


In [231]:
mfcc40_5sec_median = \
    extract_audio_features_from_dir(audio_out_dir, file_names=None,
                                    agg='median', len_secs=5,
                                    n_mfccs=40, rms=False, zrc=False,
                                    show_progress=True)

.................................................. 500 de 23259 fichiers
.................................................. 1000 de 23259 fichiers
.................................................. 1500 de 23259 fichiers
.................................................. 2000 de 23259 fichiers
.................................................. 2500 de 23259 fichiers
.................................................. 3000 de 23259 fichiers
.................................................. 3500 de 23259 fichiers
.................................................. 4000 de 23259 fichiers
.................................................. 4500 de 23259 fichiers
.................................................. 5000 de 23259 fichiers
.................................................. 5500 de 23259 fichiers
.................................................. 6000 de 23259 fichiers
.................................................. 6500 de 23259 fichiers
.......................................

  return f(*args, **kwargs)


....................................... 18500 de 23259 fichiers
.................................................. 19000 de 23259 fichiers
.................................................. 19500 de 23259 fichiers
.................................................. 20000 de 23259 fichiers
.................................................. 20500 de 23259 fichiers
.................................................. 21000 de 23259 fichiers
.................................................. 21500 de 23259 fichiers
.................................................. 22000 de 23259 fichiers
.................................................. 22500 de 23259 fichiers
.................................................. 23000 de 23259 fichiers
.........................

23259 fichiers extraits: 0:04:44 (h:mm:ss)


In [232]:
mfcc40_5sec_median = mfcc40_5sec_median_new
mfcc40_5sec_median.to_csv("./mfcc40_5sec_median.csv", header=True, index=True)

In [241]:
mfcc40_5sec_mean = \
    extract_audio_features_from_dir(audio_out_dir, file_names=None,
                                    agg='mean', len_secs=5,
                                    n_mfccs=40, rms=False, zrc=False,
                                    show_progress=True)
mfcc40_5sec_mean.to_csv("./mfcc40_5sec_mean.csv", header=True, index=True)

.................................................. 500 de 23259 fichiers
.................................................. 1000 de 23259 fichiers
.................................................. 1500 de 23259 fichiers
.................................................. 2000 de 23259 fichiers
.................................................. 2500 de 23259 fichiers
.................................................. 3000 de 23259 fichiers
.................................................. 3500 de 23259 fichiers
.................................................. 4000 de 23259 fichiers
.................................................. 4500 de 23259 fichiers
.................................................. 5000 de 23259 fichiers
.................................................. 5500 de 23259 fichiers
.................................................. 6000 de 23259 fichiers
.................................................. 6500 de 23259 fichiers
.......................................

  return f(*args, **kwargs)


...................................... 18500 de 23259 fichiers
.................................................. 19000 de 23259 fichiers
.................................................. 19500 de 23259 fichiers
.................................................. 20000 de 23259 fichiers
.................................................. 20500 de 23259 fichiers
.................................................. 21000 de 23259 fichiers
.................................................. 21500 de 23259 fichiers
.................................................. 22000 de 23259 fichiers
.................................................. 22500 de 23259 fichiers
.................................................. 23000 de 23259 fichiers
.........................

23259 fichiers extraits: 0:04:45 (h:mm:ss)


In [263]:
mfcc40_3sec_mean = \
    extract_audio_features_from_dir(audio_out_dir, file_names=None,
                                    agg='mean', len_secs=3,
                                    n_mfccs=40, rms=False, zrc=False,
                                    show_progress=True)
mfcc40_3sec_mean.to_csv("./mfcc40_3sec_mean.csv", header=True, index=True)

.................................................. 500 de 23259 fichiers
.................................................. 1000 de 23259 fichiers
.................................................. 1500 de 23259 fichiers
.................................................. 2000 de 23259 fichiers
.................................................. 2500 de 23259 fichiers
.................................................. 3000 de 23259 fichiers
.................................................. 3500 de 23259 fichiers
.................................................. 4000 de 23259 fichiers
.................................................. 4500 de 23259 fichiers
.................................................. 5000 de 23259 fichiers
.................................................. 5500 de 23259 fichiers
.................................................. 6000 de 23259 fichiers
.................................................. 6500 de 23259 fichiers
.......................................

  return f(*args, **kwargs)


...................................... 18500 de 23259 fichiers
.................................................. 19000 de 23259 fichiers
.................................................. 19500 de 23259 fichiers
.................................................. 20000 de 23259 fichiers
.................................................. 20500 de 23259 fichiers
.................................................. 21000 de 23259 fichiers
.................................................. 21500 de 23259 fichiers
.................................................. 22000 de 23259 fichiers
.................................................. 22500 de 23259 fichiers
.................................................. 23000 de 23259 fichiers
.........................

23259 fichiers extraits: 0:04:39 (h:mm:ss)


In [230]:
mfcc_3sec_median_new.to_csv("./mfcc40_3sec_median.csv", header=True, index=True)

# Align features with labels

In [179]:
def create_datasets(in_features, in_labels, test_size=0.2):
    labels = in_labels.copy()
    features = in_features.copy().reindex(labels.index)
    X_train, X_test, y_train, y_test = \
        train_test_split(features, labels, test_size=0.2,
                         random_state=100,
                         stratify = labels.values.argmax(axis=1))
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print(f"X_train.shape: {X_train.shape}")
    print(f"X_test.shape: {X_test.shape}")
    print(f"y_train.shape: {y_train.shape}")
    print(f"y_test.shape: {y_test.shape}")
    print("\ntraining label count:")
    print(y_train.sum(axis=0))
    print("\ntest label count:")
    print(y_test.sum(axis=0))
    
    return X_train, X_test, y_train, y_test, scaler

In [174]:
def create_scaled_datasets(in_features, in_labels, val_size=0.2, test_size=0.1):
    labels = in_labels.copy()
    features = in_features.copy().reindex(labels.index)
    X_dev, X_test, y_dev, y_test = \
        train_test_split(features, labels, test_size=test_size,
                         random_state=100,
                         stratify = labels.values.argmax(axis=1))
    X_train, X_val, y_train, y_val = \
        train_test_split(X_dev, y_dev, test_size=val_size,
                         random_state=100,
                         stratify = y_dev.values.argmax(axis=1))

    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    print(f"X_train.shape: {X_train.shape}")
    print(f"X_val.shape: {X_val.shape}")
    print(f"X_test.shape: {X_test.shape}")
    print(f"y_train.shape: {y_train.shape}")
    print(f"y_val.shape: {y_val.shape}")
    print(f"y_test.shape: {y_test.shape}")
    
    print("\ntraining label count:")
    print(y_train.sum(axis=0))
    print("\nvalidation label count:")
    print(y_val.sum(axis=0))
    print("\ntest label count:")
    print(y_test.sum(axis=0))
    
    return X_train, X_val, X_test, y_train, y_val, y_test, scaler

In [74]:
def calc_metrics_per_class(y_true, y_pred, classes=None):
    
    if len(y_true.shape) > 1:
        conf_mtx = pd.DataFrame(
                            confusion_matrix(
                                y_true.values.argmax(axis=1),
                                y_pred.argmax(axis=1)
                            )
                        )
    else:
        conf_mtx = pd.DataFrame(
                            confusion_matrix(
                                y_true,
                                y_pred
                            )
                        )

    if classes != None:
        conf_mtx.index = classes
        conf_mtx.columns = classes
    else:
        classes = sorted(list(y_true.unique()))
        conf_mtx.index = classes
        conf_mtx.columns = classes

    class_metrics = {}
    for c in classes:
        metrics = {}
        metrics['precision'] = \
            round(conf_mtx.loc[c, c] / conf_mtx.loc[:, c].sum(), 3)
        metrics['recall'] = \
            round(conf_mtx.loc[c, c] / conf_mtx.loc[c, :].sum(), 3)
        metrics['f1'] = \
            round(2 * (metrics['precision'] * metrics['recall'])/
                  (metrics['precision'] + metrics['recall']), 3)
        class_metrics[c] = metrics
        # metrics
    class_metrics = pd.DataFrame(class_metrics)
    macro_metrics = class_metrics.sum(axis=1) / 3
    class_metrics = class_metrics.T
    class_metrics.loc['macro'] = macro_metrics.round(3)
    return conf_mtx, class_metrics

In [133]:
def train_svc(X_train, y_train, C=5):
    gamma='auto'
    # C=18 good,
    svc = SVC(C=C, kernel='rbf', gamma=gamma, random_state=101)
    svc.fit(X_train, y_train.values.argmax(axis=1))
    return svc

In [88]:
def predict_show_metrics(model, X, y, show_confu=False, data_name='data'):
    pred = svc_model.predict(X)
    print(f"\n{data_name} accuracy : ",
          round(accuracy_score(y.values.argmax(axis=1), pred),3))

    conf_mtx, metrics = \
    calc_metrics_per_class(y.values.argmax(axis=1), pred,
                           classes=y.columns.tolist())
    if show_confu:
        print("")
        print(conf_mtx)
    print("")
    print(metrics)

In [111]:
X_train, X_test, y_train, y_test = \
    create_datasets(mfcc40_7sec_median, sentiment_labels)
    # create_datasets(rms_mfcc40_7sec_median, sentiment_labels)

X_train.shape: (14450, 40)
X_test.shape: (3613, 40)
y_train.shape: (14450, 3)
y_test.shape: (3613, 3)

training label count:
none        5282
positive    5522
negative    3674
dtype: int64

test label count:
none        1320
positive    1380
negative     918
dtype: int64


In [112]:
svc_model = train_svc(X_train, y_train, C=5)

In [113]:
predict_show_metrics(svc_model, X_train, y_train, data_name='Train')


Train accuracy :  0.791

          precision  recall     f1
none          0.791   0.754  0.772
positive      0.763   0.855  0.806
negative      0.844   0.747  0.793
macro         0.799   0.785  0.790


In [114]:
predict_show_metrics(svc_model, X_test, y_test, data_name='Test')


Test accuracy :  0.582

          precision  recall     f1
none          0.536   0.518  0.527
positive      0.591   0.665  0.626
negative      0.638   0.549  0.590
macro         0.588   0.577  0.581



# model 3sec mean

In [264]:
X_train, X_test, y_train, y_test, scaler = \
    create_datasets(mfcc40_3sec_mean, sentiment_labels)

X_train.shape: (14450, 40)
X_test.shape: (3613, 40)
y_train.shape: (14450, 3)
y_test.shape: (3613, 3)

training label count:
none        5282
positive    5522
negative    3674
dtype: int64

test label count:
none        1320
positive    1380
negative     918
dtype: int64


In [265]:
svc_model = train_svc(X_train, y_train, C=5)

In [266]:
predict_show_metrics(svc_model, X_train, y_train, data_name='Train')


Train accuracy :  0.791

          precision  recall     f1
none          0.786   0.761  0.773
positive      0.769   0.848  0.807
negative      0.839   0.749  0.791
macro         0.798   0.786  0.790


In [267]:
predict_show_metrics(svc_model, X_test, y_test, data_name='Test')


Test accuracy :  0.569

          precision  recall     f1
none          0.520   0.508  0.514
positive      0.575   0.649  0.610
negative      0.639   0.536  0.583
macro         0.578   0.564  0.569


In [149]:
y_test.tail()

Unnamed: 0_level_0,none,positive,negative
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
j1m6ctAgjsM_40,1,0,0
8eaYvALnJ0o_2,0,1,0
28006_20,0,0,1
20LfN8ENbhM_6,0,1,0
202431_5,1,0,0


In [165]:
test_feats, _ = \
    extract_audio_features_median(f"{audio_out_dir}/20LfN8ENbhM_6.wav", len_secs=5,
                                  n_mfccs=40, rms=False, zrc=False)
print(test_feats)

[-195.52985      92.70075     -24.008392     10.492144     -6.4965787
  -18.499443    -17.859005    -13.474919     -8.963863     -7.732952
   -9.457983     -3.5309544    -7.2717237    -3.894504     -4.5399246
   -6.592246     -1.472745     -6.1299205    -6.881275     -2.304553
   -5.336194     -4.8850164    -4.1405487    -3.3099942    -3.8576074
   -4.1142683    -2.4667516    -2.9003258    -3.1817105    -1.2571107
   -2.089558     -1.6431712    -2.0988235    -1.2071482    -0.34366733
   -0.29154432   -0.20354536   -0.28135467    0.36818227    0.25190628]


In [162]:
test_feats = mfcc40_3sec_median.loc['20LfN8ENbhM_6']
# print(test_feats)
test_feats = scaler.transform(test_feats.values.reshape(1, -1))
# print(test_feats)

mfcc_1    -195.529846
mfcc_2      92.700752
mfcc_3     -24.008392
mfcc_4      10.492144
mfcc_5      -6.496579
mfcc_6     -18.499443
mfcc_7     -17.859005
mfcc_8     -13.474919
mfcc_9      -8.963863
mfcc_10     -7.732952
mfcc_11     -9.457983
mfcc_12     -3.530954
mfcc_13     -7.271724
mfcc_14     -3.894504
mfcc_15     -4.539925
mfcc_16     -6.592246
mfcc_17     -1.472745
mfcc_18     -6.129920
mfcc_19     -6.881275
mfcc_20     -2.304553
mfcc_21     -5.336194
mfcc_22     -4.885016
mfcc_23     -4.140549
mfcc_24     -3.309994
mfcc_25     -3.857607
mfcc_26     -4.114268
mfcc_27     -2.466752
mfcc_28     -2.900326
mfcc_29     -3.181710
mfcc_30     -1.257111
mfcc_31     -2.089558
mfcc_32     -1.643171
mfcc_33     -2.098824
mfcc_34     -1.207148
mfcc_35     -0.343667
mfcc_36     -0.291544
mfcc_37     -0.203545
mfcc_38     -0.281355
mfcc_39      0.368182
mfcc_40      0.251906
Name: 20LfN8ENbhM_6, dtype: float32
[[-0.2896645   0.6766481  -1.152351   -0.4575754  -0.3195462  -1.9685535
  -1.621028

In [163]:
test_pred = svc_model.predict(test_feats)
print(test_pred)

[1]


# mfcc40 3sec

In [219]:
X_train, X_test, y_train, y_test, scaler = \
    create_datasets(mfcc_3sec_median_new, sentiment_labels)
    # create_datasets(mfcc40_3sec_median, sentiment_labels)

X_train.shape: (14450, 40)
X_test.shape: (3613, 40)
y_train.shape: (14450, 3)
y_test.shape: (3613, 3)

training label count:
none        5282
positive    5522
negative    3674
dtype: int64

test label count:
none        1320
positive    1380
negative     918
dtype: int64


In [220]:
svc_model = train_svc(X_train, y_train, C=5)

In [221]:
predict_show_metrics(svc_model, X_train, y_train, data_name='Train')


Train accuracy :  0.791

          precision  recall     f1
none          0.791   0.754  0.772
positive      0.763   0.855  0.806
negative      0.844   0.747  0.793
macro         0.799   0.785  0.790


In [222]:
predict_show_metrics(svc_model, X_test, y_test, data_name='Test')


Test accuracy :  0.582

          precision  recall     f1
none          0.536   0.518  0.527
positive      0.591   0.665  0.626
negative      0.638   0.549  0.590
macro         0.588   0.577  0.581


In [268]:
test_positive, _ = \
    extract_audio_features_mean("../../positive_test.wav", len_secs=3,
                                  n_mfccs=40, rms=False, zrc=False)
print(test_positive)
test_positive = scaler.transform(test_positive.reshape(1, -1))
print(test_positive)
test_positive_pred = svc_model.predict(test_positive)
print(test_positive_pred)

[-1.3329506e+02  6.9082504e+01  1.9993027e+00  3.0165579e+01
  4.8657122e+00  4.5340462e+00 -4.3247681e+00  5.6679940e+00
 -2.5897515e+00 -3.5472088e+00 -1.2017125e+00 -3.8753498e+00
 -5.5836668e+00 -5.5898219e-01 -3.2558935e+00  3.5103457e+00
 -9.8338270e+00 -4.0746432e-02 -1.6256493e+00  9.0134335e-01
 -6.0412264e+00 -1.8584248e+00 -3.9652193e+00  8.4362030e-01
 -7.7901870e-01  8.1869185e-01 -1.8088070e+00  3.5724044e+00
 -1.7526212e-01 -1.4741180e+00 -4.7781491e-01 -1.6564813e-01
 -1.7174493e+00  2.1145258e+00 -1.7190773e+00  1.9912972e+00
 -1.9437352e+00 -1.1604613e+00 -1.1484674e+00 -1.6397709e+00]
[[ 0.7635654  -0.26115137  0.48957878  1.1923492   1.0420123   0.9935151
   0.37793547  1.665574    0.14429611 -0.11704967  0.4874705  -0.54466844
  -0.6914352  -0.0087396   0.16978696  1.4739904  -2.3575652   0.5895076
   0.5527696   0.7868876  -1.1900071  -0.14027564 -0.77462804  0.75580704
   0.42126012  0.63187295 -0.13639873  1.7592795   0.32865947 -0.49446487
   0.15859684  0.0110

In [294]:
from emotion.models.audio_model import AudioModel
import emotion.models.audio_model as audio_model
import emotion.models.audio_model

In [297]:
import importlib
importlib.reload(emotion.models.audio_model)

<module 'emotion.models.audio_model' from '/home/graymo/devl/bdeb/a62/emotion/emotion/models/audio_model.py'>

In [298]:
t_audio_model = emotion.models.audio_model.AudioModel()

EOFError: Ran out of input

In [275]:
svc_model.scaler = scaler

In [276]:
svc_model.__dict__

{'decision_function_shape': 'ovr',
 'break_ties': False,
 'kernel': 'rbf',
 'degree': 3,
 'gamma': 'auto',
 'coef0': 0.0,
 'tol': 0.001,
 'C': 5,
 'nu': 0.0,
 'epsilon': 0.0,
 'shrinking': True,
 'probability': False,
 'cache_size': 200,
 'class_weight': None,
 'verbose': False,
 'max_iter': -1,
 'random_state': 101,
 '_sparse': False,
 'n_features_in_': 40,
 'class_weight_': array([1., 1., 1.]),
 'classes_': array([0, 1, 2]),
 '_gamma': 0.025,
 'support_': array([    2,     7,    10, ..., 14441, 14442, 14449], dtype=int32),
 'support_vectors_': array([[-3.39125633e-01, -1.07917376e-01,  2.02631444e-01, ...,
         -4.16145623e-01, -1.00107908e+00, -1.09846246e+00],
        [ 4.94197726e-01,  3.10632288e-01, -1.62518597e+00, ...,
          6.52225912e-01,  3.35251570e-01,  6.16763420e-02],
        [-3.32404733e+00, -1.46636404e-02,  1.52493520e-02, ...,
         -1.52940893e+00, -6.08805478e-01, -1.80887890e+00],
        ...,
        [ 5.98988950e-01, -1.94819361e-01, -1.29108202e+00

In [205]:
test_happy_pred = svc_model.predict(test_happy)
print(test_happy_pred)

[0]


In [277]:
test_negative, _ = \
    extract_audio_features_mean("../../angry.wav", len_secs=3,
                                  n_mfccs=40, rms=False, zrc=False)
print(test_negative)
test_negative = svc_model.scaler.transform(test_negative.reshape(1, -1))
print(test_negative)
test_negative_pred = svc_model.predict(test_negative)
print(test_negative_pred)

[-138.25993      47.83075      12.43015       9.442147      2.8886764
    9.218459     -1.7010719     5.866592     -1.1854713     1.8068024
   -5.1761727    -2.8015115    -8.815862     -2.103603     -3.1512337
    0.73841155   -7.360411      0.8245869     0.57170105    3.441814
    0.19264491    2.0323708     0.3843471     1.3737916    -0.93068755
    0.9424493     1.0967329     0.47656903   -0.8584002    -1.6949978
   -0.619295      0.5549292    -1.308152      1.3822969    -1.8309324
    0.40982783   -2.3747313    -1.2822636     1.3139466     0.30254412]
[[ 0.68894565 -1.3008703   1.1330906  -0.62594897  0.83384556  1.5823922
   0.75391036  1.6949327   0.40254703  0.93477666 -0.4041584  -0.28442252
  -1.4825854  -0.4049933   0.198373    0.6424882  -1.5469548   0.87359196
   1.3397962   1.6784688   1.1318214   1.353408    0.9849124   0.9691251
   0.3612777   0.6840961   1.0443158   0.45697284  0.04656575 -0.59102887
   0.09626771  0.32925007 -0.29649228  0.686594   -0.6190391   0.27876

In [270]:
test_neutral, _ = \
    extract_audio_features_mean("../../neutral_2.wav", len_secs=3,
                                  n_mfccs=40, rms=False, zrc=False)
print(test_neutral)
test_neutral = scaler.transform(test_neutral.reshape(1, -1))
print(test_neutral)
test_neutral_pred = svc_model.predict(test_neutral)
print(test_neutral_pred)

[-190.24722      46.390522     18.75701      27.056196      5.0892696
   15.369224     -0.8989706    11.484544     -2.2059097     2.6998034
   -4.582983     -1.2573931    -4.801177      4.4592733    -1.8320419
    5.2400537    -1.6288586     1.4911369     3.028813      2.1564338
   -4.174566      1.2735795    -2.1502934    -0.76275975   -1.222827
    1.1108754    -1.8192449    -0.49994603   -0.6435079    -0.27612004
   -0.24331577    2.4745038    -2.229893      0.8968766    -0.39293244
    0.2933206    -1.1387155    -2.3547163     1.1650361    -0.72762954]
[[-0.0923997  -1.3713318   1.5234146   0.91952837  1.0655512   2.3556042
   0.86885124  2.5254362   0.21488565  1.1102118  -0.27108246  0.08979627
  -0.49990383  1.278633    0.5586879   1.9928544   0.33144203  1.0924168
   2.2198617   1.2273631  -0.49476242  1.0621066  -0.04043174  0.10946898
   0.24574153  0.75516856 -0.14064033  0.04618806  0.13530299  0.02927476
   0.26190558  1.1767861  -0.7280971   0.46574375  0.11481909  0.2216

In [216]:
test_feats = mfcc40_3sec_median.loc['20LfN8ENbhM_6']
test_featsu = \
    extract_audio_features_median(f"{audio_out_dir}/20LfN8ENbhM_6.wav", len_secs=3,
                                  n_mfccs=40, rms=False, zrc=False)

print(test_feats)
print(test_featsu)
# test_feats = scaler.transform(test_feats.values.reshape(1, -1))
# test_pred = svc_model.predict(test_feats)

mfcc_1    -225.151460
mfcc_2     109.085250
mfcc_3     -27.196510
mfcc_4      10.121292
mfcc_5       1.344766
mfcc_6     -12.285128
mfcc_7     -18.956467
mfcc_8     -10.092272
mfcc_9     -12.402256
mfcc_10     -5.953549
mfcc_11     -6.343150
mfcc_12     -9.125022
mfcc_13     -3.000541
mfcc_14     -6.843306
mfcc_15     -3.434034
mfcc_16     -2.889093
mfcc_17     -4.126051
mfcc_18     -3.769894
mfcc_19     -1.705133
mfcc_20     -6.406855
mfcc_21     -5.169720
mfcc_22     -1.708011
mfcc_23     -4.280241
mfcc_24     -3.766256
mfcc_25     -4.359924
mfcc_26     -2.121166
mfcc_27     -3.779163
mfcc_28     -3.300152
mfcc_29     -3.509409
mfcc_30     -2.014054
mfcc_31     -2.784825
mfcc_32     -2.542096
mfcc_33     -1.216293
mfcc_34     -1.769727
mfcc_35     -0.876318
mfcc_36     -1.842804
mfcc_37     -1.077330
mfcc_38     -0.488476
mfcc_39     -0.340661
mfcc_40     -0.298670
Name: 20LfN8ENbhM_6, dtype: float64
(array([-195.52985   ,   92.70075   ,  -24.008392  ,   10.492144  ,
         -6.4965

In [213]:
print(test_pred)

[1]


In [215]:
rand_negative = \
    np.random.choice(
        sentiment_labels[sentiment_labels['negative'] == 1].index.tolist(),
        1)
print(rand_negative)

['261267_19']


In [None]:
test_feats = mfcc40_3sec_median.loc['20LfN8ENbhM_6']
# print(test_feats)
test_feats = scaler.transform(test_feats.values.reshape(1, -1))
test_pred = svc_model.predict(test_feats)