In [12]:
import os
import pandas as pd

from os.path import join, basename, isdir, isfile, exists
from pathlib import Path

In [13]:
# Canonical emotion label strings used when exporting the dataset.
anger_label = 'anger'
calm_label = 'calm'
contempt_label = 'contempt'  # MELD's 'disgust' class is renamed to this label
fear_label = 'fear'
happiness_label = 'happiness'
neutral_label = 'neutral'
sadness_label = 'sadness'
surprise_label = 'surprise'

In [14]:
# Locations of the MELD training split on the local machine.
train_files_path = r'I:\Datasets\meld\train_splits'      # folder listed for the original clips
meld_dir = train_files_path + r'\wav'                    # folder used to build the .wav paths
train_info_path = r'I:\Datasets\meld\train_sent_emo.csv'  # per-utterance annotation table

In [15]:
# Load the per-utterance annotation table (one row per utterance).
train_info = pd.read_csv(filepath_or_buffer=train_info_path)

In [16]:
# Preview the first two annotation rows.
train_info.head(n=2)

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,8,21,"00:16:16,059","00:16:21,731"
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,8,21,"00:16:21,940","00:16:23,442"


Column Name  | Description 

Sr No.       | Serial numbers of the utterances mainly for referencing the utterances in case of different versions or multiple copies with different subsets. 

Utterance    | Individual utterances from EmotionLines as a string.

Speaker      | Name of the speaker associated with the utterance. 

Emotion      | The emotion (neutral, joy, sadness, anger, surprise, fear, disgust)

Sentiment    | The sentiment (positive, neutral, negative) expressed by the speaker in the utterance.

Dialogue_ID  | The index of the dialogue starting from 0.       

Utterance_ID | The index of the particular utterance in the dialogue starting from 0.     

StartTime    | The starting time of the utterance in the given episode in the format 'hh:mm:ss,ms'. 

EndTime      | The ending time of the utterance in the given episode in the format 'hh:mm:ss,ms'. 

In [17]:
# Collect the names of regular files directly under train_files_path;
# sub-directories are skipped.
train_filenames = []
for entry in os.listdir(train_files_path):
    if isfile(join(train_files_path, entry)):
        train_filenames.append(entry)


In [18]:
# Show the first five file names to confirm the naming pattern.
train_filenames[0:5]

['dia0_utt0.mp4',
 'dia0_utt1.mp4',
 'dia0_utt10.mp4',
 'dia0_utt11.mp4',
 'dia0_utt12.mp4']

In [19]:
#emotion - file path
#add file column to train info ("dia{Dialogue_ID}_utt{Utterance_ID}.ext")
#emotion from train_info, path from dir+train_filenames

In [20]:
# Build the expected .wav path for every annotation row, following the
# "dia{Dialogue_ID}_utt{Utterance_ID}" naming pattern of the dataset files.
# The separator is written as an explicit '\\': the previous '\d' was an invalid
# escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+). The
# resulting strings are unchanged.
# Zipping the two id columns avoids materialising a full row object per
# utterance, which is what iterrows() does.
filenames = [
    f'{meld_dir}\\dia{dialogue_id}_utt{utterance_id}.wav'
    for dialogue_id, utterance_id in zip(train_info['Dialogue_ID'], train_info['Utterance_ID'])
]

# Sanity check: exactly one path per annotation row.
print(len(filenames) == len(train_info))

True


In [21]:
# Attach the wav paths as a 'path' column at position 7 (right after Utterance_ID).
# DataFrame.insert raises ValueError when the column already exists, so a second
# run of this cell used to fail; in that case the existing column is refreshed
# in place instead.
if 'path' in train_info.columns:
    train_info['path'] = filenames
else:
    train_info.insert(7, 'path', filenames)
train_info.head(2)

Unnamed: 0,Sr No.,Utterance,Speaker,Emotion,Sentiment,Dialogue_ID,Utterance_ID,path,Season,Episode,StartTime,EndTime
0,1,also I was the point person on my companys tr...,Chandler,neutral,neutral,0,0,I:\Datasets\meld\train_splits\wav\dia0_utt0.wav,8,21,"00:16:16,059","00:16:21,731"
1,2,You mustve had your hands full.,The Interviewer,neutral,neutral,0,1,I:\Datasets\meld\train_splits\wav\dia0_utt1.wav,8,21,"00:16:21,940","00:16:23,442"


In [22]:
# Report every annotation row whose wav file is not present on disk.
for wav_path in train_info['path']:
    if exists(wav_path):
        continue
    print(f'file does not exist: {wav_path}')


file does not exist: I:\Datasets\meld\train_splits\wav\dia125_utt3.wav


In [23]:
# The existence check reported exactly one missing file; drop its row.
missing_wav = r'I:\Datasets\meld\train_splits\wav\dia125_utt3.wav'
missing_rows = train_info.index[train_info['path'] == missing_wav]
train_info_valid = train_info.drop(index=missing_rows)

# Sanity check: exactly one row was removed.
print(len(train_info) - len(train_info_valid) == 1)

True


In [24]:
# Keep only the label and the audio path, with the label column lower-cased to 'emotion'.
meld_df = (
    train_info_valid[['Emotion', 'path']]
    .copy()
    .rename(columns={'Emotion': 'emotion'})
)

In [25]:
# Re-run the existence check on the cleaned frame; nothing is printed when all files exist.
for wav_path in meld_df['path']:
    if exists(wav_path):
        continue
    print(f'file does not exist: {wav_path}')

In [26]:
# Preview the first five emotion/path pairs.
meld_df.head(n=5)

Unnamed: 0,emotion,path
0,neutral,I:\Datasets\meld\train_splits\wav\dia0_utt0.wav
1,neutral,I:\Datasets\meld\train_splits\wav\dia0_utt1.wav
2,neutral,I:\Datasets\meld\train_splits\wav\dia0_utt2.wav
3,neutral,I:\Datasets\meld\train_splits\wav\dia0_utt3.wav
4,surprise,I:\Datasets\meld\train_splits\wav\dia0_utt4.wav


In [28]:
# Class balance of the full training split.
meld_df['emotion'].value_counts()

neutral     4709
joy         1743
surprise    1205
anger       1109
sadness      683
disgust      271
fear         268
Name: emotion, dtype: int64

In [40]:
# Exclude the 'joy' class from the exported subset.
# NOTE(review): boolean indexing returns a new frame, so relabeling meld_df in a
# later cell does not change meld_crop_df.
not_joy = meld_df['emotion'] != 'joy'
meld_crop_df = meld_df.loc[not_joy]

In [41]:
# Class balance after removing 'joy'.
meld_crop_df['emotion'].value_counts()

neutral     4709
surprise    1205
anger       1109
sadness      683
contempt     271
fear         268
Name: emotion, dtype: int64

In [44]:
# Map MELD's emotion names onto this notebook's label constants. With the current
# constant values only 'disgust' -> contempt_label changes anything; 'joy' has no
# entry and is left untouched.
meld_label_map = {
    'neutral': neutral_label,
    'surprise': surprise_label,
    'anger': anger_label,
    'sadness': sadness_label,
    'disgust': contempt_label,
    'fear': fear_label,
}

# Assign the result back instead of calling replace(..., inplace=True) on
# `meld_df.emotion`: an in-place method on a column accessor is chained assignment,
# which warns on pandas 2.x and leaves the frame unchanged under copy-on-write.
meld_df['emotion'] = meld_df['emotion'].replace(meld_label_map)

# meld_crop_df was sliced from meld_df before this cell runs, so relabeling meld_df
# alone would leave 'disgust' in the exported CSVs on a top-to-bottom run.
# Relabel it as well; the mapping is idempotent, so re-running is harmless.
meld_crop_df = meld_crop_df.assign(emotion=meld_crop_df['emotion'].replace(meld_label_map))

In [45]:
# Write the 'joy'-free frame, before the per-class cap, without the index column.
# NOTE(review): the file is named 'meld_df.csv' but the frame written is
# meld_crop_df, not meld_df; confirm this is intended.
meld_crop_df.to_csv(path_or_buf='meld_df.csv', index=False)

In [46]:
# Cap each emotion class at its first 1000 rows (in current row order), then renumber the index.
rows_by_emotion = meld_crop_df.groupby('emotion')
meld_crop_df = rows_by_emotion.head(1000).reset_index(drop=True)

In [47]:
# Class balance after capping each emotion at 1000 rows.
meld_crop_df['emotion'].value_counts()


neutral     1000
surprise    1000
anger       1000
sadness      683
contempt     271
fear         268
Name: emotion, dtype: int64

In [48]:
# Write the capped subset to disk without the index column.
meld_crop_df.to_csv(path_or_buf='meld_crop_df.csv', index=False)