In [15]:
import os
import numpy as np
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report
import pickle

In [2]:
formants_folder = "all_formants_observed_formants_first_5_formants"
def load_formants_from_file(file_path):
    return pd.read_csv(file_path)


In [3]:
full_data = pd.DataFrame()
for formant_file in os.listdir(formants_folder):
    full_data = full_data.append(load_formants_from_file(os.path.join(formants_folder, formant_file)), ignore_index=True)



In [4]:
phoneme_remap = {
 'sil': 'sil',
 'g': 'non_vowel',
 'a': 'a',
 'f': 'non_vowel',
 's': 'non_vowel',
 'n': 'non_vowel',
 'r': 'non_vowel',
 'j': 'non_vowel',
 'R': 'non_vowel',
 'o': 'o',
 'b': 'non_vowel',
 'i': 'i',
 'k': 'non_vowel',
 'u': 'u',
 'd': 'non_vowel',
 'e': 'e',
 'l': 'non_vowel',
 'c': 'non_vowel',
 'm': 'non_vowel',
 't': 'non_vowel',
 'p': 'non_vowel',
 'y': 'non_vowel',
 'C': 'non_vowel',
 'N': 'non_vowel',
 '': 'non_vowel',
 'S': 'non_vowel'
}

full_data["phoneme"].replace(phoneme_remap, inplace=True)


In [5]:
sil_frames = full_data[full_data["phoneme"] == "sil"]

full_data = full_data.drop(sil_frames.index)


In [6]:
full_data['phoneme']
all_phonemes = [str(s) for s in full_data['phoneme'].unique()]


In [7]:
phonemes_grouped = full_data.groupby('phoneme')

phonemes_grouped.groups


{'a': [1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 3061, 3062, 3063, 3064, 3065, 3066, 3067, 3068, 3069, 3070, 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, ...], 'e': [159, 160, 161, 162, 163, 164, 165, 166, 167, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 962, 963, 964, 965, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 979, 980, 981, 982, 983, 984, 985, 986, 987, 988, 989, 990, 991, 992, 993, 994, 995, 996, 997, 998, 1302, 1303, 1304, 1305, 1306, 1307, 1308,

In [8]:
phoneme_count = full_data['phoneme'].value_counts()
percentage_to_extract_test = 0.1
fixed_extra_to_test = 5

values_to_extract = (phoneme_count * percentage_to_extract_test + fixed_extra_to_test).round()
values_to_extract = values_to_extract.apply(np.int64)



In [9]:
train_set = full_data.dropna(subset=["phoneme"])
train_set[["i"]] = train_set[["i"]].fillna(value=-300)
test_set = pd.DataFrame()
for index, value in values_to_extract.items():
    sub_data_frame = train_set[train_set["phoneme"] == index]
    test_to_remove = sub_data_frame.sample(value)
    test_set = test_set.append(test_to_remove)
    train_set = train_set.drop(test_to_remove.index)

In [10]:
FORMANTS_TO_EXTRACT = 5
formant_names = [f"f_{i+1}" for i in range(FORMANTS_TO_EXTRACT)]

formant_names.append("i")
print(formant_names)

['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'i']


In [11]:
train_data = train_set[formant_names].fillna(train_set[formant_names].mean())


In [12]:
tree_classifier = tree.DecisionTreeClassifier(max_depth=6)
tree_classifier.fit(train_data, train_set['phoneme'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
test_data = test_set[formant_names].fillna(train_set[formant_names].mean())
predicted = tree_classifier.predict(test_data[formant_names])
score = accuracy_score(test_set["phoneme"], predicted)
score

0.6962879640044994

In [17]:
report = classification_report(test_set["phoneme"], predicted, labels=all_phonemes, digits=4)
print(report)

              precision    recall  f1-score   support

   non_vowel     0.6892    0.8641    0.7668       390
           e     0.6029    0.4767    0.5325        86
           o     0.6301    0.6389    0.6345       144
           u     0.6842    0.3714    0.4815        35
           a     0.8456    0.7159    0.7754       176
           i     0.5556    0.1724    0.2632        58
         nan     0.0000    0.0000    0.0000         0

   micro avg     0.6963    0.6963    0.6963       889
   macro avg     0.5725    0.4628    0.4934       889
weighted avg     0.6933    0.6963    0.6803       889



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results: 0.8136915077989602

Formants | Accuracy
-|-
With Sil| 0.8136915077989602
Without Sil | 0.7064116985376828

In [16]:
base_name = "no_sil_non_vowels"

dt_target_names = [str(s) for s in full_data['phoneme'].unique()]
tree.export_graphviz(
    tree_classifier,
    out_file=f'{base_name}.dot',
    feature_names=formant_names,
    class_names=all_phonemes,
    filled=True
)

print(tree.export_text(tree_classifier))

|--- feature_0 <= 575.82
|   |--- feature_1 <= 1162.31
|   |   |--- feature_0 <= 346.57
|   |   |   |--- feature_0 <= 328.44
|   |   |   |   |--- feature_0 <= 293.73
|   |   |   |   |   |--- feature_4 <= 3082.82
|   |   |   |   |   |   |--- class: a
|   |   |   |   |   |--- feature_4 >  3082.82
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |   |--- feature_0 >  293.73
|   |   |   |   |   |--- feature_2 <= 2219.23
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |   |   |--- feature_2 >  2219.23
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |--- feature_0 >  328.44
|   |   |   |   |--- feature_4 <= 3319.26
|   |   |   |   |   |--- feature_1 <= 1105.81
|   |   |   |   |   |   |--- class: i
|   |   |   |   |   |--- feature_1 >  1105.81
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |   |--- feature_4 >  3319.26
|   |   |   |   |   |--- feature_1 <= 952.51
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |   |   |--- feature_1 >  952.51


In [17]:

graph = pydotplus.graph_from_dot_file(f'{base_name}.dot')
print(type(graph))
with open(f"{base_name}.png", "wb+") as f:
    f.write(graph.create_png())

<class 'pydotplus.graphviz.Dot'>


In [18]:


with open(f"{base_name}.pickle", "wb+") as pickle_file:
    pickle.dump(tree_classifier, pickle_file)