### Imports and Config

In [36]:
# Data analysis
import pandas as pd
# Fileworks
import pickle
pickle.HIGHEST_PROTOCOL = 4  # for compability reasons

# ML preprocessing
from sklearn.preprocessing import OrdinalEncoder, MinMaxScaler

#### Dataset Pathways

In [37]:
DATA_PATH = 'dataset/'
SAMPLES_PATH = 'samples/'

TSV_TRAIN = 'train.tsv'
TSV_DEV = 'dev.tsv'
TSV_TEST = 'test.tsv'

### Read Data

In [38]:
df_train = pd.read_csv(DATA_PATH + TSV_TRAIN, sep='\t')
df_dev = pd.read_csv(DATA_PATH + TSV_DEV, sep='\t')
df_test = pd.read_csv(DATA_PATH + TSV_TEST, sep='\t')

In [39]:
d_tuples = [(df_train, TSV_TRAIN), 
            (df_dev, TSV_DEV), 
            (df_test, TSV_TEST)]

In [40]:
for df, tsv_name in d_tuples:
    print(f'There are {df.shape[0]} rows and {df.shape[1]} columns in {tsv_name}')

There are 564337 rows and 10 columns in train.tsv
There are 16164 rows and 10 columns in dev.tsv
There are 16164 rows and 10 columns in test.tsv


### Separate Accented Data

First, we'll leave only samples with labeled accents:

In [41]:
for df, tsv_name in d_tuples:
    df.query('accent == accent', inplace=True)
    print(f'There are {df.shape[0]} rows with accents {tsv_name}')

There are 317182 rows with accents train.tsv
There are 1874 rows with accents dev.tsv
There are 1519 rows with accents test.tsv


Then, check what fraction is each dataset relative to the total amount of data:

In [42]:
full_size = sum([len(df) for df in [t[0] for t in d_tuples]])
print(f'Accented dataset full size: {full_size} rows\n')

for df, tsv_name in d_tuples:
    print(f'{tsv_name} is {(df.shape[0] / full_size * 100):.2f}% of full dataset')

Accented dataset full size: 320575 rows

train.tsv is 98.94% of full dataset
dev.tsv is 0.58% of full dataset
test.tsv is 0.47% of full dataset


..which is obviously too little. So, let's concatenate all data into a single dataframe:

In [43]:
df_full = pd.concat([df for df in [t[0] for t in d_tuples]])
print(f'Created full accented dataset with {df_full.shape[0]} rows and {df_full.shape[1]} columns')

Created full accented dataset with 320575 rows and 10 columns


### Explore df_full

Now, let's explore the resulting dataframe

In [44]:
df_full.head()

Unnamed: 0,client_id,path,sentence,up_votes,down_votes,age,gender,accent,locale,segment
24,cb5bd9ad996218619531511ae2600aa61055005f910e7c...,common_voice_en_1027059.mp3,Little things please little minds,2,0,twenties,male,us,en,
72,cc6516333444de42e6d5a07afe5f65085d09df0f45c3c2...,common_voice_en_21788001.mp3,"Unfortunately, Adam overloads the computer, fr...",2,0,thirties,male,us,en,
73,cc6516333444de42e6d5a07afe5f65085d09df0f45c3c2...,common_voice_en_21788002.mp3,Nepal Loktantrik Primary School is one of the ...,2,0,thirties,male,us,en,
74,cc6516333444de42e6d5a07afe5f65085d09df0f45c3c2...,common_voice_en_21788003.mp3,"In the same article, Gardner denied that he co...",2,0,thirties,male,us,en,
75,cc6516333444de42e6d5a07afe5f65085d09df0f45c3c2...,common_voice_en_21788004.mp3,"During her literary career, she has written po...",2,1,thirties,male,us,en,


In [45]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 320575 entries, 24 to 16134
Data columns (total 10 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   client_id   320575 non-null  object
 1   path        320575 non-null  object
 2   sentence    320575 non-null  object
 3   up_votes    320575 non-null  int64 
 4   down_votes  320575 non-null  int64 
 5   age         315295 non-null  object
 6   gender      315807 non-null  object
 7   accent      320575 non-null  object
 8   locale      320575 non-null  object
 9   segment     1 non-null       object
dtypes: int64(2), object(8)
memory usage: 26.9+ MB


In [46]:
df_full.nunique()

client_id       7336
path          320575
sentence      320575
up_votes          16
down_votes         7
age                9
gender             3
accent            17
locale             1
segment            1
dtype: int64

- 7336 unique speakers
- Each spoken sentence is unique
- 9 types of age
- 3 types of gender
- 17 unique English accents

#### Encode client_id

Let's encode `client_id` feature for convenience

In [47]:
ordinal_encoder = OrdinalEncoder(dtype=int)
df_full['client_id'] = ordinal_encoder.fit_transform(df_full[['client_id'][:]])
df_full['client_id'] = df_full['client_id'].apply(lambda x: f'id_{x}')

df_full['client_id'].head()

24    id_5830
72    id_5849
73    id_5849
74    id_5849
75    id_5849
Name: client_id, dtype: object

#### Votes Disparity

Now we'll engineer a new feature called  `'votes_disparity_rate'`:

In [48]:
def disparity_rate(df: pd.DataFrame) -> pd.DataFrame:
    '''Creates votes_disparity_rate feature, which may denote some useful information'''
    make_disparity_rate = lambda u, d: ((u - d) / max(u, d)) * (np.log(u - d) + 1)
    df['votes_disparity_rate'] = df.apply(
        lambda r: make_disparity_rate(r['up_votes'], r['down_votes']), 
        axis=1
    )
    vdr_array = df['votes_disparity_rate'].values.reshape(-1, 1)
    df['votes_disparity_rate'] = MinMaxScaler().fit_transform(vdr_array)
    return df

df_full = disparity_rate(df_full)

df_full['votes_disparity_rate'].head()

24    0.398656
72    0.398656
73    0.398656
74    0.398656
75    0.087053
Name: votes_disparity_rate, dtype: float64

### Clean Data

Remove an absolutely empty attribute `'segment'` and unnecessary `'locale'` (1 unique value):

In [49]:
df_full = df_full[df_full['accent'].notna()]
df_full = df_full.drop(['segment', 'locale'], axis=1)

Let's check each accent quantities:

In [50]:
df_full['accent'].value_counts()

us                171217
england            46836
indian             33637
australia          24230
canada             21077
scotland            6204
ireland             3938
newzealand          3378
african             3188
philippines         2295
singapore           2001
hongkong            1256
malaysia             560
wales                312
other                276
bermuda              165
southatlandtic         5
Name: accent, dtype: int64

Remove accents with counts less than 1000

In [51]:
acc_counts = df_full['accent'].value_counts()
accents_list = acc_counts[acc_counts >= 1000].index

In [52]:
df_full = df_full[df_full['accent'].isin(accents_list)]

df_full['accent'].value_counts()

us             171217
england         46836
indian          33637
australia       24230
canada          21077
scotland         6204
ireland          3938
newzealand       3378
african          3188
philippines      2295
singapore        2001
hongkong         1256
Name: accent, dtype: int64

In [53]:
df_full.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 319257 entries, 24 to 16134
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   client_id             319257 non-null  object 
 1   path                  319257 non-null  object 
 2   sentence              319257 non-null  object 
 3   up_votes              319257 non-null  int64  
 4   down_votes            319257 non-null  int64  
 5   age                   313977 non-null  object 
 6   gender                314496 non-null  object 
 7   accent                319257 non-null  object 
 8   votes_disparity_rate  319257 non-null  float64
dtypes: float64(1), int64(2), object(6)
memory usage: 24.4+ MB


In [54]:
df_full.nunique()

client_id                 7160
path                    319257
sentence                319257
up_votes                    16
down_votes                   7
age                          9
gender                       3
accent                      12
votes_disparity_rate        46
dtype: int64

In [55]:
print('Dataframe now consists of:')
print(f'{df_full["accent"].value_counts().count()} unique accents')
print(f'{len(df_full)} rows, {len(df_full.columns)} columns')

Dataframe now consists of:
12 unique accents
319257 rows, 9 columns


### Generate New Metainfo Storage

In [52]:
new_meta_name = 'source_df.h5'

In [53]:
df_full.to_hdf(DATA_PATH + new_meta_name, 'source_df', mode='w', index=False)

shape = f'{df_full.shape[0]} rows, {df_full.shape[1]} columns'
print(f'Successfully written accented samples dataset to: {new_meta_name} ({shape})')

Successfully written accented samples dataset to: source_df.h5 (319257 rows, 9 columns)


your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed,key->block0_values] [items->Index(['client_id', 'path', 'sentence', 'age', 'gender', 'accent'], dtype='object')]

  pytables.to_hdf(


### Delete Unnecessary Files

In [37]:
def filescount(dir_name):
    '''Counts files in the dir'''
    return len([f for f in os.listdir(dir_name) 
                if os.path.isfile(os.path.join(dir_name, f))])

In [139]:
def remove_junk(csv_file):
    '''Removes unnecessary junk'''
    print(f'Source CSV file: {csv_file}')
    
    # make list of proper files with accents
    df = pd.read_csv(DATA_PATH + csv_file)
    ok_list = df['filename'].tolist()

    # make array with all files present in the target directory
    directory = ntpath.split(ok_list[0])[0]
    target_dir = DATA_PATH + directory
    total_count = filescount(target_dir)
    files = target_dir.walkfiles('*.mp3')
    
    # delete all files with no accents
    print(f'Target dir: {target_dir} (contains {total_count} files in total)')
    print('Removing files with no accent label..')
    
    del_count = 0
    for f in files:
        filename = directory + '/' + ntpath.basename(f)
        if filename not in ok_list:
            del_count += 1
            f.remove()
    
    print(f'Total files removed: {del_count}')
    print(f'Files remaining: {total_count - del_count}')

In [140]:
# remove_junk(acc_suffix(CSVFILE_TRAIN))
# remove_junk(acc_suffix(CSVFILE_DEV))
remove_junk(acc_suffix(CSVFILE_TEST))

Source CSV file: cv-valid-test-accents.csv
Target dir: dataset/cv-valid-test contains 3995 files in total
Removing files with no accent label..
Total files removed: 2657
Files remaining: 1338
