In [21]:
import os
import scipy.io.wavfile as spwav
import numpy as np
import pandas as pd

def min_max_avg(ary: np.array) -> tuple:
    """
    Summarise an array by its smallest value, its largest value and its mean.

    Args:
        ary (np.array): array to summarise

    Returns:
        tuple: the three statistics, ordered as (min, max, avg)
    """
    lowest = np.min(ary)
    highest = np.max(ary)
    mean = np.mean(ary)
    return lowest, highest, mean


def create_dict_dataset(path:str, is_eval:bool=False) -> tuple:
    """
    Read every .wav file found in a directory and collect the recordings
    in a dictionary of the following form:
    id: {label, ary, min_max_avg, time}

    File names are expected to be ``<id>_<label>.wav`` for the development
    set and ``<id>.wav`` for the evaluation set (which has no label, so the
    'label' key is omitted). Entries that do not end in ``.wav`` are ignored.

    Args:
        path (str): directory containing the recordings, with or without
        a trailing separator
        is_eval (bool, optional): boolean flag to set to True
        if the dataset you want to read is the evaluation one.
        Defaults to False.

    Returns:
        tuple: dataset in dictionary form, list of the arrays' shapes
        (one shape tuple per recording), list of arrays. The three
        collections follow the same (sorted file name) order. All three are
        empty when the directory holds no .wav file.
    """
    # Sorted so the result does not depend on the OS listing order; filtered so
    # stray files (e.g. hidden OS metadata) do not break the wav reader.
    samples = sorted(name for name in os.listdir(path) if name.lower().endswith('.wav'))
    d = dict()
    lengths = []
    arys = []
    for sample in samples:
        # os.path.join works whether or not `path` ends with a separator.
        # The sampling rate is taken from each file rather than from the first
        # one, so 'time' stays correct if the recordings have different rates.
        frequency, ary = spwav.read(os.path.join(path, sample))
        time_length = ary.shape
        arys.append(ary)
        lengths.append(time_length)
        d_temp = {'ary': ary,
                  'min_max_avg': min_max_avg(ary),
                  # duration: samples along the first axis / samples per second
                  'time': round(time_length[0]/frequency, 2)}
        if is_eval:
            sample_id = int(sample.split('.')[0])
        else:
            # strip the extension first, then split "<id>_<label>"
            stem = os.path.splitext(sample)[0]
            id_part, label_part = stem.split('_')[:2]
            sample_id = int(id_part)
            # 'label' goes first to keep the original key (and column) order
            d_temp = {'label': int(label_part), **d_temp}
        d[sample_id] = d_temp
    return d, lengths, arys

def padding(ary: np.array, target_len:int, padding_element:int=0) -> np.array:
    """
    Append `target_len` copies of `padding_element` after the end of an array.

    Note that, despite its name, `target_len` is the number of elements
    that get added, not the length of the result: a 1-D input of length n
    comes back with length n + target_len. Nothing is added in front.

    Args:
        ary (np.array): input array to pad
        target_len (int): how many padding elements to append
        padding_element (int, optional): element used to pad. Defaults to 0.

    Returns:
        np.array: padded array
    """
    trailing_only = (0, target_len)  # (elements before, elements after)
    padded = np.pad(
        ary,
        pad_width=trailing_only,
        mode='constant',
        constant_values=padding_element,
    )
    return padded

def pad_arrays(arys:list, max_length:int, value=0) -> list:
    """
    Bring every array in a list up to the same length by padding its end.

    Args:
        arys (list): list of arrays to pad
        max_length (int): length each array should have after padding
        value (optional): what to pad with. The string 'mean' pads each
        array with that array's own mean; anything else is used as the
        padding value as-is. Defaults to 0.

    Returns:
        list: the padded arrays, in the same order as `arys`
    """
    def _fill_for(a):
        # 'mean' is resolved per array; every other value is shared by all
        return np.mean(a) if value == 'mean' else value

    # `padding` takes the number of elements to append, hence the difference
    return [padding(a, max_length - len(a), _fill_for(a)) for a in arys]

## Reached the point where I need to decide whether or not to include new_mma_list.
# Create a function that builds the dataframe.

def compute_mma_list(arys:list) -> list:
    """
    Apply min_max_avg to every array of a list.

    Args:
        arys (list): list of arrays

    Returns:
        list: one (min, max, avg) tuple per array,
        in the same order as arys
    """
    return [min_max_avg(a) for a in arys]

def create_df(d:dict, arys:list, avg_threshold=-5) -> pd.DataFrame:
    """
    Turn the dataset dictionary into a DataFrame (one row per id) and
    drop the rows whose array mean is too low.

    Args:
        d (dict): dataset in the form id: {..., 'ary', 'min_max_avg', ...},
        as built by create_dict_dataset
        arys (list): arrays that replace the 'ary' column (e.g. the padded
        ones); there must be one per entry of `d`, in the same order in
        which `d` is iterated
        avg_threshold (optional): only rows whose mean is strictly greater
        than this value are kept. Defaults to -5.

    Returns:
        pd.DataFrame: rows sorted by id, with the 'ary' and 'min_max_avg'
        columns recomputed from `arys` and an extra 'avg' column
    """
    mma_list = compute_mma_list(arys)
    df = pd.DataFrame.from_dict(d).T
    # Bracket assignment always writes a column. Attribute assignment
    # (df.ary = ...) only does so when the column already exists and otherwise
    # sets a plain Python attribute on the frame.
    df['ary'] = arys
    df['min_max_avg'] = mma_list
    df = df.sort_index(axis=0)
    # last element of each (min, max, avg) tuple
    df['avg'] = [tup[-1] for tup in df['min_max_avg']]
    # noise removal, we remove arrays with mean value not above the threshold.
    # I've seen that even if I pad with 0, the overall mean is strongly below -5,
    # but I know I should do it in a more general way
    mask = df['avg'] > avg_threshold
    # explicit copy so callers can add columns without SettingWithCopyWarning
    return df[mask].copy()

def simple_moving_average(list_of_arys:list) -> np.array:
    """
    Compute the mean of each array of a collection, independently.

    Args:
        list_of_arys (list): iterable of arrays (it is materialised with
        list(), so e.g. dict views are accepted too)

    Returns:
        np.array: float array with one entry per input array, each being
        the sum of that array divided by its length
    """
    chunks = list(list_of_arys)
    per_chunk_mean = [np.sum(chunk) / len(chunk) for chunk in chunks]
    # explicit float dtype so an empty input still yields a float array
    return np.array(per_chunk_mean, dtype=float)

def cumulative_moving_average(list_of_arys:list) -> np.array:
    """
    Compute a running mean across a sequence of arrays.

    Entry k of the result is the mean of all the elements contained in
    arrays 0..k, updated incrementally from the previous mean.

    Args:
        list_of_arys (list): iterable of arrays

    Returns:
        np.array: one running-mean value per input array
    """
    totals_and_sizes = [(np.sum(chunk), len(chunk)) for chunk in list_of_arys]
    running_mean = 0
    seen = 0  # number of elements already folded into running_mean
    means = []

    for chunk_total, chunk_size in totals_and_sizes:
        # With seen == 0 this reduces to chunk_total / chunk_size, so the
        # first array needs no special case.
        running_mean = (chunk_total + seen*running_mean)/(chunk_size + seen)
        seen += chunk_size
        means.append(running_mean)

    return np.array(means)

def bucket(df:pd.DataFrame, num_buckets:int=100, type_of_feature:str='sma', is_eval:bool=False):
    """
    Summarise every array of `df` with a fixed number of features.

    Each array in the 'ary' column is split into `num_buckets` consecutive
    chunks (np.array_split, so chunk sizes may differ by one) and the
    chunks are reduced to one value each, giving columns
    feature_0 ... feature_{num_buckets-1}.

    Args:
        df (pd.DataFrame): frame with an 'ary' column and, unless
        `is_eval` is True, a 'label' column
        num_buckets (int, optional): number of chunks, i.e. of features.
        Defaults to 100.
        type_of_feature (str, optional): 'sma' reduces the chunks with
        simple_moving_average, 'cma' with cumulative_moving_average.
        Defaults to 'sma'.
        is_eval (bool, optional): set to True for the evaluation set, which
        has no labels. Defaults to False.

    Returns:
        pd.DataFrame: one row per row of `df` (same index), one column per
        feature, plus a 'label' column when `is_eval` is False

    Raises:
        ValueError: if `type_of_feature` is neither 'sma' nor 'cma'
    """
    # Fail early with a clear message; previously an unknown value skipped both
    # branches and surfaced as an UnboundLocalError on `bucket_df`.
    if type_of_feature not in ('sma', 'cma'):
        raise ValueError(
            f"type_of_feature must be 'sma' or 'cma', got {type_of_feature!r}"
        )
    if type_of_feature == 'sma':
        averager = simple_moving_average
    else:
        averager = cumulative_moving_average

    features_by_id = dict()
    for sample_id, array in df['ary'].items():
        # reduce the chunks right away instead of keeping them all in memory
        chunk_values = averager(np.array_split(array, num_buckets))
        features_by_id[sample_id] = {f'feature_{i}': v for i, v in enumerate(chunk_values)}

    # outer keys become columns, so transpose to get one row per sample id
    bucket_df = pd.DataFrame.from_dict(features_by_id).T
    if is_eval:
        return bucket_df

    # aligned on the index, which holds the sample ids in both frames
    bucket_df['label'] = df['label']
    return bucket_df

# Preprocessing

In [97]:
# Settings shared by train and eval so both go through the same processing
NUM_BUCKETS = 1000     # number of features extracted from each recording
AVG_THRESHOLD = -5     # recordings whose mean is not above this are dropped
FEATURE_TYPE = 'sma'   # 'sma' or 'cma', see bucket()

# train dataset
dev, train_lengths, train_arys = create_dict_dataset('./free_spoken_digit/dev/')
dev_max_length = max(train_lengths)[0]

# eval dataset
# NOTE(review): `eval` shadows the builtin; name kept so other cells that may use it still work
eval, eval_lengths, eval_arys = create_dict_dataset('./free_spoken_digit/eval/', is_eval= True)
eval_max_length = max(eval_lengths)[0]

# one common target length for both sets, to have homogeneous padding
max_length = max(dev_max_length, eval_max_length)

#### train ####
train_new_arys = pad_arrays(train_arys, max_length, 0)
train_df = create_df(dev, train_new_arys, avg_threshold=AVG_THRESHOLD)
bucket_train_df = bucket(train_df, NUM_BUCKETS, type_of_feature=FEATURE_TYPE)

#### eval #####
# Padded to the shared max_length (not eval_max_length): otherwise the buckets
# of the two sets cover different portions of the signal.
eval_new_arys = pad_arrays(eval_arys, max_length, 0)
# NOTE(review): the threshold also drops rows from the evaluation set, so those
# ids get no prediction — confirm this is intended.
eval_df = create_df(eval, eval_new_arys, avg_threshold=AVG_THRESHOLD)
bucket_eval_df = bucket(eval_df, NUM_BUCKETS, type_of_feature=FEATURE_TYPE, is_eval=True)



In [98]:
# Preview the first rows of the bucketed training features (indexed by sample id, `label` is the last column)
bucket_train_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999,label
0,6.894737,-7.157895,10.789474,14.210526,22.473684,-38.421053,2.684211,-50.368421,146.421053,-535.368421,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4
1,-0.210526,-0.736842,-1.105263,0.578947,-1.473684,-2.789474,2.0,-0.473684,-1.105263,-0.578947,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7
2,1.789474,-4.315789,1.842105,-8.578947,0.894737,-5.526316,-2.578947,3.421053,-1.842105,0.315789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5
4,0.894737,17.421053,25.578947,-26.105263,-20.157895,-4.631579,-17.578947,1.684211,6.789474,3.789474,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3
6,9.473684,-42.947368,65.157895,-21.736842,-38.631579,165.526316,-298.631579,158.421053,83.578947,-193.052632,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1


In [99]:
# Preview the first rows of the bucketed evaluation features (indexed by sample id, no `label` column)
bucket_eval_df.head()

Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_990,feature_991,feature_992,feature_993,feature_994,feature_995,feature_996,feature_997,feature_998,feature_999
1,9.736842,-48.631579,50.368421,24.789474,-130.736842,154.052632,-31.157895,-123.105263,150.578947,-2.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-1.526316,-10.210526,-16.947368,137.947368,-97.210526,16.157895,-28.736842,-7.526316,-10.0,21.315789,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6.105263,6.631579,-13.842105,1.368421,14.526316,-7.736842,-24.315789,32.526316,-0.473684,-31.263158,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.421053,-1.421053,2.157895,-3.842105,-0.368421,-1.578947,-1.157895,-4.578947,3.894737,-0.631579,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,9.526316,-6.947368,-15.947368,25.947368,-5.473684,-20.526316,150.157895,79.736842,-262.894737,114.736842,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Classifiers
We will use the macro F1 score as the metric, i.e. the unweighted mean of the F1 scores computed for each class. For a single class, the F1 score is:
$$F1_{score} = 2\cdot\frac{Precision\cdot Recall}{Precision + Recall}$$
where
$$Precision = \frac{TP}{TP + FP}$$
$$Recall = \frac{TP}{TP + FN}$$

The worst value of the F1 score is 0, while the best one is 1.

In [126]:
from sklearn.model_selection import train_test_split, ParameterGrid
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

SEED = 0
# Global seed kept for estimators that are created without a random_state
np.random.seed(SEED) #to make experiments reproducible

# Select features and target by name rather than by column position
features = bucket_train_df.drop(columns='label')
labels = bucket_train_df['label'].astype('int')

# random_state makes the split independent of whatever consumed the global
# RNG before this cell ran (cells here were executed out of order).
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=.2, random_state=SEED)


In [101]:
# Seeded explicitly: tree construction draws random numbers, so without a
# random_state the score depends on the order in which cells were executed.
clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
preds = clf.predict(X_test)
f1_score(y_test, preds, average='macro')

0.20931129986500788

In [147]:
# Seeded explicitly: bootstrap sampling and feature sub-sampling are random,
# so without a random_state the score changes from run to run.
rf = RandomForestClassifier(n_estimators=100, max_features='sqrt', max_depth=30, random_state=0).fit(X_train, y_train)
preds = rf.predict(X_test)
f1_score(y_test, preds, average='macro')

0.39850737721475016

In [119]:
# k-nearest neighbours on the raw (unscaled) bucket features
knn = KNeighborsClassifier(n_neighbors=7)
knn.fit(X_train, y_train)
preds = knn.predict(X_test)
f1_score(y_test, preds, average='macro')

0.15383813941560903

In [127]:
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler().fit(X_train)
#X_train = scaler.transform(X_train)
# Gaussian naive Bayes on the raw (unscaled) bucket features
gnb = GaussianNB()
gnb.fit(X_train, y_train)
preds = gnb.predict(X_test)
f1_score(y_test, preds, average='macro')

0.1543745991823065

In [137]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import NuSVC

# `degree` only applies to the polynomial kernel; with the default RBF kernel
# it is ignored, so it is dropped here. The fitted model is unchanged.
# NOTE(review): if a degree-1 polynomial kernel was intended, use NuSVC(kernel='poly', degree=1).
svm_clf = make_pipeline(StandardScaler(), NuSVC())
svm_clf.fit(X_train, y_train)
preds = svm_clf.predict(X_test)
f1_score(y_test, preds, average='macro')


0.15765029059521102

In [143]:
# Neural Network, MLP
from sklearn.neural_network import MLPClassifier
# Seeded explicitly: weight initialisation and batch shuffling are random,
# so without a random_state the score changes from run to run.
mlp = MLPClassifier(max_iter=300, random_state=0).fit(X_train, y_train)
preds = mlp.predict(X_test)
f1_score(y_test, preds, average='macro')

0.12737773985155904

Indeed, the results are not satisfactory. We should consider a different kind of processing, for example working in the frequency domain.

In [None]:
from scipy.io import wavfile
from scipy import signal
# NOTE(review): placeholder for the frequency-domain approach — spectrogram() is
# called here without any input. TODO: pass the audio samples and their
# sampling rate (presumably the `x` and `fs` arguments; confirm against the scipy docs).
signal.spectrogram()