In [29]:
import os
import numpy as np
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report

In [30]:
formants_folder = "all_formants_observed_formants_first_5_formants"
def load_formants_from_file(file_path):
    return pd.read_csv(file_path)


In [31]:
full_data = pd.DataFrame()
for formant_file in os.listdir(formants_folder):
    full_data = full_data.append(load_formants_from_file(os.path.join(formants_folder, formant_file)), ignore_index=True)


In [32]:
sil_frames = full_data[full_data["phoneme"] == "sil"]

full_data = full_data.drop(sil_frames.index)


In [33]:
full_data['phoneme']
all_phonemes = [str(s) for s in full_data['phoneme'].unique()]


In [34]:
phonemes_grouped = full_data.groupby('phoneme')

phonemes_grouped.groups


{'C': [7818, 7819, 7820, 7821, 7822, 7823, 7824, 7825, 7826, 7827, 7828, 7829, 7830, 7831, 7832, 7833, 7834, 9555, 9556, 9557, 9558, 9559, 9560, 9561, 9562, 9563, 9564, 9565, 9566, 9567, 9568, 9569, 9570, 9571, 9572, 9573, 9574, 9575, 9576, 9577, 9578, 9579], 'R': [148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 1685, 1686, 1687, 1688, 1689, 1690, 1691, 1692, 1693, 1694, 1695, 1696, 2752, 2753, 2754, 2755, 2756, 2757, 2758, 2759, 3345, 3346, 3347, 3348, 3349, 3350, 3351, 3352, 3353, 3354, 3687, 3688, 3689, 3690, 3691, 3692, 3693, 3694, 3695, 3696, 3697, 3698, 4019, 4020, 4021, 4022, 4023, 4024, 4025, 4026, 4027, 4028, 4029, 4030, 4031, 4032, 4033, 4034, 4035, 4036, 4037, 4038, 4039, 4040, 4041, 4042, 4043, 4044, 5270, 5271, 5272, 5273, 5274, 5275, 5276, 5277, 5278, 5279, 5280, 5281, 5282, 5283, 5284, 5285, 5286, 5287, 5288, 5289, 5290, ...], 'S': [22063, 22064, 22065, 22066, 22067, 22068, 22069, 22070, 22071, 22072, 22073, 22074, 22075, 22076, 22077], 'a': [1007, 1008, 1009, 101

In [35]:
phoneme_count = full_data['phoneme'].value_counts()
percentage_to_extract_test = 0.1
fixed_extra_to_test = 5

values_to_extract = (phoneme_count * percentage_to_extract_test + fixed_extra_to_test).round()
values_to_extract = values_to_extract.apply(np.int64)




In [36]:
train_set = full_data.dropna(subset=["phoneme"])
train_set[["i"]] = train_set[["i"]].fillna(value=-300)
test_set = pd.DataFrame()
for index, value in values_to_extract.items():
    sub_data_frame = train_set[train_set["phoneme"] == index]
    test_to_remove = sub_data_frame.sample(value)
    test_set = test_set.append(test_to_remove)
    train_set = train_set.drop(test_to_remove.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [37]:
FORMANTS_TO_EXTRACT = 5
formant_names = [f"f_{i+1}" for i in range(FORMANTS_TO_EXTRACT)]

formant_names.append("i")
print(formant_names)

['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'i']


In [38]:
train_data = train_set[formant_names].fillna(train_set[formant_names].mean())


In [39]:
tree_classifier = tree.DecisionTreeClassifier(max_depth=6)
tree_classifier.fit(train_data, train_set['phoneme'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [40]:
test_data = test_set[formant_names].fillna(train_set[formant_names].mean())
predicted = tree_classifier.predict(test_data[formant_names])
score = accuracy_score(test_set["phoneme"], predicted)
score

0.4234326824254882

In [42]:
report = classification_report(test_set["phoneme"], predicted, labels=all_phonemes, digits=4)
print(report)

              precision    recall  f1-score   support

           R     0.0000    0.0000    0.0000        29
           e     0.4118    0.6512    0.5045        86
           l     0.2105    0.5581    0.3057        43
           o     0.4762    0.6944    0.5650       144
           j     0.4286    0.1000    0.1622        30
           u     1.0000    0.0857    0.1579        35
           n     0.1818    0.0678    0.0988        59
           s     0.3519    0.5352    0.4246        71
           p     0.0000    0.0000    0.0000        18
           r     0.0000    0.0000    0.0000        17
           a     0.5480    0.7784    0.6432       176
           b     0.3810    0.5714    0.4571        28
           t     0.1250    0.0333    0.0526        30
           k     0.0000    0.0000    0.0000        28
           d     0.0000    0.0000    0.0000        21
           i     0.5476    0.3966    0.4600        58
           g     0.0000    0.0000    0.0000        13
           y     0.0000    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Results

Formants | Accuracy
-|-
5| 0.8047658862876255
5 No intensity | 0.6960702341137124
4| 0.7566889632107023
4 No intensity | 0.6588628762541806
3| 0.7508361204013378
3 No intensity | 0.6040969899665551
2| 0.6981605351170569
2 No intensity | 0.5493311036789298

In [14]:
dt_target_names = [str(s) for s in full_data['phoneme'].unique()]
tree.export_graphviz(
    tree_classifier,
    out_file='pruned_tree.dot',
    feature_names=formant_names,
    class_names=all_phonemes,
    filled=True
)

In [15]:

graph = pydotplus.graph_from_dot_file('pruned_tree.dot')
print(type(graph))
with open("pruned_tree.png", "wb+") as f:
    f.write(graph.create_png())

<class 'pydotplus.graphviz.Dot'>
