# Data Preprocessing

## 0. Preprocess Plan 
1. Preprocess `SEP-28k_labels.csv` file
 - Create new column including unique audio id (Show-EpId-ClipId)
 - Create isStutter columns
 - Encode Interjection, NaturalPause, Music columns 
 - Drop rows in which Unsure, PoorAudioQuality, DifficultToUnderstand, noSpeech is not 0
 - Drop unused columns
2. Preprocess audio files
 - Use MCFF
3. Combine above two datasets into one csv file 

### Your file structure should look like below :
&#x2757; Make sure you put `clips` folder outside of Smooth-Talk-Squad folder

In [1]:
'''
├── clips
    ├── HeStutters_0_0.wav
└── Smooth-Talk-Squad
    ├── raw_data
        ├── SEP-28k_labels.csv
    └── notebooks
        ├── data-processing.ipynb
''';

In [2]:
! pip install pandas
! pip install numpy
! pip install tqdm
! pip install librosa


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython3.10 -m pip install --upgrade pip[0m

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.2.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32

## 1. Preprocess `SEP-28k_labels.csv` file

In [3]:
import numpy as np
import pandas as pd

In [4]:
data_sep28k_labels = pd.read_csv('../raw_data/SEP-28k_labels.csv')

In [5]:
data_sep28k_labels.head(5)

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech
0,HeStutters,0,0,31900320,31948320,0,0,0,0,0,0,0,0,3,1,0,0
1,HeStutters,0,1,31977120,32025120,0,0,0,0,0,0,0,0,3,1,0,0
2,HeStutters,0,2,34809760,34857760,0,0,0,0,0,0,0,0,3,0,0,0
3,HeStutters,0,3,35200640,35248640,0,0,1,0,0,0,0,0,2,0,0,0
4,HeStutters,0,4,35721920,35769920,0,0,0,0,0,0,0,0,3,0,0,0


### Look into dataset

In [6]:
data_sep28k_labels.shape

(28177, 17)

In [7]:
data_sep28k_labels.columns

Index(['Show', 'EpId', 'ClipId', 'Start', 'Stop', 'Unsure', 'PoorAudioQuality',
       'Prolongation', 'Block', 'SoundRep', 'WordRep', 'DifficultToUnderstand',
       'Interjection', 'NoStutteredWords', 'NaturalPause', 'Music',
       'NoSpeech'],
      dtype='object')

In [8]:
data_sep28k_labels[["Unsure"]].value_counts()

Unsure
0         27409
1           744
2            24
Name: count, dtype: int64

In [9]:
data_sep28k_labels[["PoorAudioQuality"]].value_counts()

PoorAudioQuality
0                   25267
1                    2328
2                     481
3                     101
Name: count, dtype: int64

In [10]:
data_sep28k_labels[["DifficultToUnderstand"]].value_counts()

DifficultToUnderstand
0                        23305
1                         3834
2                          884
3                          154
Name: count, dtype: int64

In [11]:
data_sep28k_labels[["NoSpeech"]].value_counts()

NoSpeech
0           27202
1             667
2             178
3             130
Name: count, dtype: int64

In [12]:
data_sep28k_labels[["Prolongation"]].value_counts()

Prolongation
0               19631
1                5734
2                2022
3                 790
Name: count, dtype: int64

### Create unique audio id column 'Name'

In [13]:
data_sep28k_labels['Name'] = data_sep28k_labels[data_sep28k_labels.columns[0:3]].apply(
    lambda x: '_'.join(x.dropna().astype(str)),
    axis=1
)
data_sep28k_labels.head(5)

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Name
0,HeStutters,0,0,31900320,31948320,0,0,0,0,0,0,0,0,3,1,0,0,HeStutters_0_0
1,HeStutters,0,1,31977120,32025120,0,0,0,0,0,0,0,0,3,1,0,0,HeStutters_0_1
2,HeStutters,0,2,34809760,34857760,0,0,0,0,0,0,0,0,3,0,0,0,HeStutters_0_2
3,HeStutters,0,3,35200640,35248640,0,0,1,0,0,0,0,0,2,0,0,0,HeStutters_0_3
4,HeStutters,0,4,35721920,35769920,0,0,0,0,0,0,0,0,3,0,0,0,HeStutters_0_4


### Create isStutter columns

1. 'isStutter_by_all_reviewers' column : NoStutteredWords is 1 or 2 or 3 (1 or more reviewers think it is stuttered) 
2. 'isStutter_by_2_more_reviewers' column : NoStutteredWords is 2 or 3 (at least 2 reviewers think it is stuttered) 

In [14]:
data_sep28k_labels['isStutter_by_all_reviewers'] = ""
data_sep28k_labels['isStutter_by_2_more_reviewers'] = ""

In [15]:
data_sep28k_labels.loc[data_sep28k_labels["NoStutteredWords"] <= 1.0, "isStutter_by_2_more_reviewers"] = 0
data_sep28k_labels.loc[data_sep28k_labels["NoStutteredWords"] >= 2.0, "isStutter_by_2_more_reviewers"] = 1
data_sep28k_labels[["isStutter_by_2_more_reviewers"]].value_counts()

isStutter_by_2_more_reviewers
1                                16046
0                                12131
Name: count, dtype: int64

In [16]:
# data_sep28k_labels.head(20)

In [17]:
data_sep28k_labels.loc[data_sep28k_labels["NoStutteredWords"] == 0, "isStutter_by_all_reviewers"] = 0
data_sep28k_labels.loc[data_sep28k_labels["NoStutteredWords"] > 0, "isStutter_by_all_reviewers"] = 1
data_sep28k_labels[["isStutter_by_all_reviewers"]].value_counts()

isStutter_by_all_reviewers
1                             21372
0                              6805
Name: count, dtype: int64

In [18]:
data_sep28k_labels.head(5)

Unnamed: 0,Show,EpId,ClipId,Start,Stop,Unsure,PoorAudioQuality,Prolongation,Block,SoundRep,WordRep,DifficultToUnderstand,Interjection,NoStutteredWords,NaturalPause,Music,NoSpeech,Name,isStutter_by_all_reviewers,isStutter_by_2_more_reviewers
0,HeStutters,0,0,31900320,31948320,0,0,0,0,0,0,0,0,3,1,0,0,HeStutters_0_0,1,1
1,HeStutters,0,1,31977120,32025120,0,0,0,0,0,0,0,0,3,1,0,0,HeStutters_0_1,1,1
2,HeStutters,0,2,34809760,34857760,0,0,0,0,0,0,0,0,3,0,0,0,HeStutters_0_2,1,1
3,HeStutters,0,3,35200640,35248640,0,0,1,0,0,0,0,0,2,0,0,0,HeStutters_0_3,1,1
4,HeStutters,0,4,35721920,35769920,0,0,0,0,0,0,0,0,3,0,0,0,HeStutters_0_4,1,1


### Encode Interjection, NaturalPause, Music columns

In [19]:
data_sep28k_labels.shape

(28177, 20)

In [20]:
from sklearn.preprocessing import Binarizer
column_names = ["Music", "Interjection", "NaturalPause"]
column_names_after_encoded = ["MusicEncoded", "InterjectionEncoded", "NaturalPauseEncoded"]
temp_data = data_sep28k_labels[column_names]
binarizer = Binarizer(threshold=0.0).fit(temp_data)
temp_data_binarizered = pd.DataFrame(binarizer.transform(temp_data), columns=column_names_after_encoded)
temp_data_binarizered.shape

(28177, 3)

In [21]:
data_sep28k_labels = pd.concat([data_sep28k_labels, temp_data_binarizered], axis=1)

In [22]:
data_sep28k_labels[["Music"]].value_counts() # For comparison purpose

Music
0        27781
3          199
2           99
1           98
Name: count, dtype: int64

In [23]:
data_sep28k_labels[["MusicEncoded"]].value_counts() # For comparison purpose

MusicEncoded
0               27781
1                 396
Name: count, dtype: int64

In [24]:
data_sep28k_labels.drop(column_names, axis=1,inplace=True)

### Drop rows in which Unsure, PoorAudioQuality, DifficultToUnderstand, NoSpeech is not 0

In [25]:
data_sep28k_labels.shape

(28177, 20)

In [26]:
data_sep28k_labels = data_sep28k_labels[data_sep28k_labels["Unsure"] == 0]
data_sep28k_labels = data_sep28k_labels[data_sep28k_labels["PoorAudioQuality"] == 0]
data_sep28k_labels = data_sep28k_labels[data_sep28k_labels["DifficultToUnderstand"] == 0]
data_sep28k_labels = data_sep28k_labels[data_sep28k_labels["NoSpeech"] == 0]

data_sep28k_labels.shape

(21090, 20)

### Drop unused columns

In [27]:
data_sep28k_labels.drop(['Prolongation', 'Block', 'SoundRep', 'WordRep', 'NoStutteredWords'], axis=1,inplace=True)
data_sep28k_labels.drop(['Unsure', 'PoorAudioQuality', 'DifficultToUnderstand', 'NoSpeech'], axis=1, inplace=True)
data_sep28k_labels.drop(['Show', 'EpId', 'ClipId', 'Start', 'Stop'], axis=1, inplace=True)


In [28]:
data_sep28k_labels.shape

(21090, 6)

In [29]:
data_sep28k_labels.head(5)

Unnamed: 0,Name,isStutter_by_all_reviewers,isStutter_by_2_more_reviewers,MusicEncoded,InterjectionEncoded,NaturalPauseEncoded
0,HeStutters_0_0,1,1,0,0,1
1,HeStutters_0_1,1,1,0,0,1
2,HeStutters_0_2,1,1,0,0,0
3,HeStutters_0_3,1,1,0,0,0
4,HeStutters_0_4,1,1,0,0,0


## 2. Audio File Data Process

In [30]:
import librosa
import numpy as np
from tqdm.notebook import tqdm
import os

In [31]:
features={}

In [32]:
directory = os.path.join(os.pardir,os.pardir, 'clips') # clips folder is not in Smooth-Talk-Squad folder
'''
├── clips
    ├── HeStutters_0_0.wav
└── Smooth-Talk-Squad
    ├── raw_data
        ├── SEP-28k_labels.csv
    └── notebooks
        ├── data-processing.ipynb
'''
ignore_list = []
for filename in tqdm(os.listdir(directory)):
    filename = filename[:-4] # Remove file extension (.wav in our situation)
    print(filename)
    if ignore_list.count(filename) == 0:
        audio, sample_rate = librosa.load(os.path.join(directory, filename) + '.wav', res_type='kaiser_fast', sr=None)
        mfccs = np.mean(librosa.feature.mfcc(y=audio, sr=sample_rate, n_mfcc=40).T,axis=0)
        features[filename] = mfccs

  0%|          | 0/1 [00:00<?, ?it/s]

HeStutters_0_0


In [33]:
features

{'HeStutters_0_0': array([-3.9217270e+02,  6.3350628e+01,  4.8332834e+00,  6.9779840e+00,
         2.1696117e+00, -3.5743895e+00, -7.0913873e+00, -1.9783952e+00,
        -8.9845819e+00, -4.7242465e+00, -3.3452766e+00, -6.5738153e+00,
        -7.0268269e+00, -7.4578005e-01, -5.3784409e+00, -2.5886452e+00,
        -3.6963844e+00, -2.4798405e+00, -5.2713332e+00, -1.9026902e+00,
        -4.3867741e+00, -3.5050061e+00, -4.2800155e+00, -3.9538312e+00,
        -2.0943983e+00, -2.7127795e+00, -3.4109273e+00, -3.4360785e+00,
        -3.9724562e+00, -3.7955062e+00, -6.4136357e+00, -4.3933554e+00,
        -6.8520637e+00, -1.3455036e+00, -2.4144375e+00, -1.3912545e-01,
        -3.2510958e+00, -1.0767114e+00, -3.1128201e+00, -6.9624156e-01],
       dtype=float32)}

In [34]:
df_pro = pd.DataFrame.from_dict(features)
df_pro=df_pro.transpose()
df_pro = df_pro.reset_index()
df_pro

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,HeStutters_0_0,-392.172699,63.350628,4.833283,6.977984,2.169612,-3.574389,-7.091387,-1.978395,-8.984582,...,-6.413636,-4.393355,-6.852064,-1.345504,-2.414438,-0.139125,-3.251096,-1.076711,-3.11282,-0.696242


## 3. Combine MCFF Datasets with Label Datasets

In [35]:
df_pro

Unnamed: 0,index,0,1,2,3,4,5,6,7,8,...,30,31,32,33,34,35,36,37,38,39
0,HeStutters_0_0,-392.172699,63.350628,4.833283,6.977984,2.169612,-3.574389,-7.091387,-1.978395,-8.984582,...,-6.413636,-4.393355,-6.852064,-1.345504,-2.414438,-0.139125,-3.251096,-1.076711,-3.11282,-0.696242


In [36]:
df_pro.shape

(1, 41)

In [37]:
data_sep28k_labels.shape

(21090, 6)

In [38]:
data = pd.merge(data_sep28k_labels, df_pro, left_on='Name', right_on='index', how='left')
data.drop(['Name'], axis=1,inplace=True)
data.shape

(21090, 46)

In [39]:
data.head(5)

Unnamed: 0,isStutter_by_all_reviewers,isStutter_by_2_more_reviewers,MusicEncoded,InterjectionEncoded,NaturalPauseEncoded,index,0,1,2,3,...,30,31,32,33,34,35,36,37,38,39
0,1,1,0,0,1,HeStutters_0_0,-392.172699,63.350628,4.833283,6.977984,...,-6.413636,-4.393355,-6.852064,-1.345504,-2.414438,-0.139125,-3.251096,-1.076711,-3.11282,-0.696242
1,1,1,0,0,1,,,,,,...,,,,,,,,,,
2,1,1,0,0,0,,,,,,...,,,,,,,,,,
3,1,1,0,0,0,,,,,,...,,,,,,,,,,
4,1,1,0,0,0,,,,,,...,,,,,,,,,,


### Output Final CSV File 

In [40]:
raw_data_directory = os.path.join(os.pardir, 'raw_data')
data.to_csv(os.path.join(raw_data_directory, 'final_data.csv'))