# Import Libraries

In [None]:
#General
import numpy as np
import pandas as pd
import itertools
from tqdm import tqdm

# System
import os, fnmatch

In [None]:
# Root directory of the project. Defaults to the Google Drive folder this
# notebook was written against; set the ML_FINAL_PROJECT_ROOT environment
# variable to point the notebook at a copy of the data stored elsewhere.
root = os.environ.get("ML_FINAL_PROJECT_ROOT", "/content/drive/MyDrive/ML_Final_Project")

# Load Csv File

In [None]:
# Read the raw voice metadata table from the project folder and display it.
voice_csv_path = f'{root}/dataV2.csv'
csv_data = pd.read_csv(voice_csv_path)
csv_data

Unnamed: 0,emotionID,textID,sex,age,voice id
0,1,1,m,21,15997
1,1,2,m,21,16001
2,1,3,m,21,16005
3,1,4,m,21,16009
4,1,5,m,21,16013
...,...,...,...,...,...
16860,4,6,f,54,10563
16861,4,7,f,54,10567
16862,4,8,f,54,10571
16863,4,9,f,54,10575


# Transform the CSV into a more standard DataFrame


In [None]:
# Select the five metadata columns in their target order and give them
# snake_case names (raw CSV header -> notebook column name).
column_names = {
    'voice id': 'voice_id',
    'emotionID': 'emotion_id',
    'textID': 'text_id',
    'sex': 'gender',
    'age': 'age',
}
df = csv_data[list(column_names)].rename(columns=column_names)
df

Unnamed: 0,voice_id,emotion_id,text_id,gender,age
0,15997,1,1,m,21
1,16001,1,2,m,21
2,16005,1,3,m,21
3,16009,1,4,m,21
4,16013,1,5,m,21
...,...,...,...,...,...
16860,10563,4,6,f,54
16861,10567,4,7,f,54
16862,10571,4,8,f,54
16863,10575,4,9,f,54


# Check for Unique and Null Values in Columns

In [None]:
# One summary row per column of df: its dtype, how many distinct values it
# holds and how many entries are null. Rendered with a colour gradient.
dtypes = pd.DataFrame({
    "Data Type": df.dtypes,
    "Unique Values": df.nunique(),
    "Null Values": df.isna().sum(),
})
dtypes.style.background_gradient(cmap='Set3', axis=0)

Unnamed: 0,Data Type,Unique Values,Null Values
voice_id,int64,16808,0
emotion_id,int64,10,0
text_id,int64,11,0
gender,object,6,0
age,object,59,0


In [None]:
# For each column of interest, print its distinct values followed by how
# often each value occurs.
columns_to_inspect = ['emotion_id', 'text_id', 'gender', 'age']
for col in columns_to_inspect:
  column_values = df[col]
  print('\n=======================================\n')
  print('Unique Values: ', column_values.unique())
  print('# of Unique Values\n', column_values.value_counts())



Unique Values:  [ 1  2  3  4 10  5  6  7  8  9]
# of Unique Values
 4     4193
1     4191
3     4191
2     4170
10      20
5       20
6       20
7       20
8       20
9       20
Name: emotion_id, dtype: int64


Unique Values:  [ 1  2  3  4  5  6  7  8  9 10  0]
# of Unique Values
 3     1744
4     1727
1     1723
2     1704
6     1663
5     1660
8     1660
7     1659
9     1658
10    1651
0       16
Name: text_id, dtype: int64


Unique Values:  ['m' 'f' 'M' 'F' 'w' 'f ']
# of Unique Values
 m     8235
f     7585
M      503
F      382
w       80
f       80
Name: gender, dtype: int64


Unique Values:  ['21' '24' '23' '41' '38' '33' '29' '25' '46' '50' '57' '22' '19' '48'
 '44' '30' '26' '56' '20' '55' '10' '27' '51' '61' '63' '58' '59' '18'
 '31' '60' '40' '28' '34' '42' '52' '49' '32' '36' '53' '35' '30-' '30+'
 '54' '65' '45' '16' '39' '14' '70' '43' '47' '62' '13' '15' '37' '8' '64'
 '66' '67']
# of Unique Values
 24     1839
23     1211
25     1117
20      799
26      790
21      7

# Fix Columns

In [None]:
# Encoding for the gender column: 1 = male, 0 = female. Labels are stripped
# and lower-cased before lookup, so the 'm' and 'f' entries also cover the raw
# variants 'M', 'F' and 'f ' (the extra keys are kept for backward compatibility).
gender = {'m': 1, 'M': 1, 'f': 0, 'F': 0, 'f ': 0}

# Normalise stray whitespace and letter case once, up front.
gender_labels = df['gender'].str.strip().str.lower()

# Row filter, based on the value counts printed earlier in this notebook:
#   * emotion_id 1-4 only (ids 5-10 occur just 20 times each)
#   * text_id >= 1 (drops the rows recorded with text_id 0)
#   * ages '30-' and '30+' are not plain numbers
#   * gender 'w' has no entry in the encoding above
keep_rows = (
    (df['emotion_id'] <= 4)
    & (df['text_id'] >= 1)
    & ~df['age'].isin(['30-', '30+'])
    & (gender_labels != 'w')
)

# .copy() gives an independent frame, so the column assignments below act on
# df_filtered itself and never on a slice of df.
df_filtered = df[keep_rows].copy()

# Encode gender with a vectorised lookup; fail loudly on any label that is
# neither 'm' nor 'f' after normalisation instead of storing NaN.
kept_labels = gender_labels[keep_rows]
gender_codes = kept_labels.map(gender)
unmapped = gender_codes.isna()
if unmapped.any():
  unexpected = sorted(kept_labels[unmapped].astype(str).unique())
  raise ValueError(f'Unexpected gender labels: {unexpected}')
df_filtered['gender'] = gender_codes.astype('int64')

# With the non-numeric entries removed, age can be stored as an integer
# instead of a string (raises if any other non-numeric value is present).
df_filtered['age'] = df_filtered['age'].astype('int64')

for col in ['emotion_id', 'text_id', 'gender', 'age']:
  print('\n=======================================\n')
  print('Unique Values: ', df_filtered[col].unique())
  print('# of Unique Values\n', df_filtered[col].value_counts())



Unique Values:  [1 2 3 4]
# of Unique Values
 2    4138
4    4129
1    4127
3    4127
Name: emotion_id, dtype: int64


Unique Values:  [ 1  2  3  4  5  6  7  8  9 10]
# of Unique Values
 3     1694
1     1673
4     1673
2     1654
6     1643
5     1640
8     1640
7     1639
9     1638
10    1627
Name: text_id, dtype: int64


Unique Values:  [1 0]
# of Unique Values
 1    8618
0    7903
Name: gender, dtype: int64


Unique Values:  ['21' '24' '23' '41' '38' '33' '29' '25' '46' '50' '57' '22' '19' '48'
 '44' '30' '26' '56' '20' '55' '10' '27' '51' '61' '63' '58' '59' '18'
 '31' '60' '40' '28' '34' '42' '52' '49' '32' '36' '53' '35' '54' '65'
 '45' '16' '39' '14' '70' '43' '47' '62' '13' '15' '37' '8' '64' '66' '67']
# of Unique Values
 24    1815
23    1211
25    1117
20     799
26     790
21     721
22     583
31     495
19     479
27     441
35     402
56     402
29     402
33     377
50     357
32     337
28     324
45     322
51     322
55     318
58     282
44     242
49     242
48

# Delete duplicated rows based on voice_id

In [None]:
# When a voice_id occurs more than once, keep only its last row.
is_earlier_duplicate = df_filtered.duplicated(subset=['voice_id'], keep="last")
df_filtered = df_filtered[~is_earlier_duplicate]

print(f'{df_filtered.shape[0]} rows remaining after deleting duplicates')

16464 rows remaining after deleting duplicates


# Create Dictionary of (voice_id, voice_file_path) 

In [None]:
# Map every voice_id to the path of its .wav recording under {root}/Voice,
# or to False when that file does not exist, so missing recordings can be
# identified afterwards.
voice_dict = {}

# The loop variable is deliberately not called `id`: at notebook scope that
# would shadow the builtin id() for every later cell.
for voice_id in tqdm(list(df_filtered['voice_id'])):
  file_path = f'{root}/Voice/{voice_id}.wav'

  # Keep the key even when the file is absent; False marks it as missing
  voice_dict[voice_id] = file_path if os.path.isfile(file_path) else False

len(voice_dict)

100%|██████████| 16464/16464 [00:09<00:00, 1779.57it/s]


16464

# Find Files that are missing in dataset

In [None]:
# voice_ids whose entry in voice_dict is the False marker, i.e. whose
# recording was not found on disk.
missing_files = {
    voice_id
    for voice_id, path_or_flag in voice_dict.items()
    if path_or_flag is False
}
missing_files

{91,
 1895,
 1923,
 1924,
 1928,
 1931,
 1932,
 1935,
 1936,
 1939,
 1940,
 1943,
 1944,
 1947,
 1948,
 1951,
 1952,
 1955,
 1956,
 1959,
 1960,
 1978,
 2052,
 2422}

# Remove missing files from dataframe

In [None]:
# Keep only the rows whose voice_id is not in the set of missing recordings.
has_recording = ~df_filtered['voice_id'].isin(missing_files)
df_filtered = df_filtered[has_recording]
print(f'{df_filtered.shape[0]} rows remaining after deleting rows with missing files')

16440 rows remaining after deleting rows with missing files


# Add file paths of each voice to corresponding row in dataframe


In [None]:
# Build a voice_id -> path lookup that leaves out the False (missing file)
# markers, then attach each row's recording path as a new column.
existing_paths = {
    voice_id: path
    for voice_id, path in voice_dict.items()
    if path is not False
}
df_filtered['file_path'] = df_filtered['voice_id'].map(existing_paths)
df_filtered

Unnamed: 0,voice_id,emotion_id,text_id,gender,age,file_path
0,15997,1,1,1,21,/content/drive/MyDrive/ML_Final_Project/Voice/...
1,16001,1,2,1,21,/content/drive/MyDrive/ML_Final_Project/Voice/...
2,16005,1,3,1,21,/content/drive/MyDrive/ML_Final_Project/Voice/...
3,16009,1,4,1,21,/content/drive/MyDrive/ML_Final_Project/Voice/...
4,16013,1,5,1,21,/content/drive/MyDrive/ML_Final_Project/Voice/...
...,...,...,...,...,...,...
16860,10563,4,6,0,54,/content/drive/MyDrive/ML_Final_Project/Voice/...
16861,10567,4,7,0,54,/content/drive/MyDrive/ML_Final_Project/Voice/...
16862,10571,4,8,0,54,/content/drive/MyDrive/ML_Final_Project/Voice/...
16863,10575,4,9,0,54,/content/drive/MyDrive/ML_Final_Project/Voice/...


# Save csv 

In [None]:
# Write the cleaned table to the project folder, without the index column.
clean_csv_path = f'{root}/clean_data.csv'
df_filtered.to_csv(clean_csv_path, index=False)

In [None]:
# Sanity check: print the first five stored recording paths with their position.
for position, wav_path in enumerate(df_filtered['file_path'].iloc[:5]):
  print(position, wav_path)

0 /content/drive/MyDrive/ML_Final_Project/Voice/15997.wav
1 /content/drive/MyDrive/ML_Final_Project/Voice/16001.wav
2 /content/drive/MyDrive/ML_Final_Project/Voice/16005.wav
3 /content/drive/MyDrive/ML_Final_Project/Voice/16009.wav
4 /content/drive/MyDrive/ML_Final_Project/Voice/16013.wav
