In [1]:
import os
import numpy as np
import pandas as pd
from sklearn import tree
import pydotplus
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report
import pickle

In [2]:
formants_folder = "all_formants_observed_formants_first_5_formants"
def load_formants_from_file(file_path):
    return pd.read_csv(file_path)


In [3]:
full_data = pd.DataFrame()
for formant_file in os.listdir(formants_folder):
    full_data = full_data.append(load_formants_from_file(os.path.join(formants_folder, formant_file)), ignore_index=True)



In [4]:
phoneme_remap = {
 'sil': 'sil',
 'g': 'non_vowel',
 'a': 'a',
 'f': 'non_vowel',
 's': 'non_vowel',
 'n': 'non_vowel',
 'r': 'non_vowel',
 'j': 'non_vowel',
 'R': 'non_vowel',
 'o': 'non_vowel',
 'b': 'non_vowel',
 'i': 'i',
 'k': 'non_vowel',
 'u': 'u',
 'd': 'non_vowel',
 'e': 'non_vowel',
 'l': 'non_vowel',
 'c': 'non_vowel',
 'm': 'non_vowel',
 't': 'non_vowel',
 'p': 'non_vowel',
 'y': 'non_vowel',
 'C': 'non_vowel',
 'N': 'non_vowel',
 '': 'non_vowel',
 'S': 'non_vowel'
}

full_data["phoneme"].replace(phoneme_remap, inplace=True)


In [5]:
sil_frames = full_data[full_data["phoneme"] == "sil"]

full_data = full_data.drop(sil_frames.index)


In [6]:
full_data['phoneme']
all_phonemes = [str(s) for s in full_data['phoneme'].unique()]


In [7]:
phonemes_grouped = full_data.groupby('phoneme')

phonemes_grouped.groups


{'a': [1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015, 1016, 1017, 1018, 1019, 1020, 1021, 1022, 1023, 1024, 1201, 1202, 1203, 1204, 1205, 1206, 1207, 1208, 1209, 1210, 1211, 1212, 1213, 1214, 1215, 1216, 1217, 1218, 1219, 1220, 1221, 1222, 1223, 1224, 1225, 1226, 1227, 1648, 1649, 1650, 1651, 1652, 1653, 1654, 1655, 1656, 1657, 1658, 1659, 1660, 1661, 1662, 2885, 2886, 2887, 2888, 2889, 2890, 2891, 2892, 2893, 2894, 2895, 2896, 2897, 2898, 2899, 2900, 2901, 3061, 3062, 3063, 3064, 3065, 3066, 3067, 3068, 3069, 3070, 3071, 3072, 3073, 3074, 3075, 3076, 3077, 3078, 3079, 3080, 3081, 3082, 3083, ...], 'i': [2137, 2138, 2139, 2140, 2141, 2142, 2143, 2144, 2145, 2146, 2147, 2148, 2149, 2150, 2151, 2152, 2376, 2377, 2378, 2379, 2380, 2381, 2382, 2383, 2384, 2385, 2386, 2387, 2388, 2389, 2390, 2391, 2392, 2393, 2394, 2395, 2396, 2397, 2398, 2399, 2400, 2401, 2402, 2403, 2404, 2405, 2406, 2407, 2408, 2409, 2410, 2411, 2412, 3110, 3111, 3112, 3113, 3114, 3115, 3116, 3117, 3118, 3119, 312

In [8]:
phoneme_count = full_data['phoneme'].value_counts()
percentage_to_extract_test = 0.1
fixed_extra_to_test = 5

values_to_extract = (phoneme_count * percentage_to_extract_test + fixed_extra_to_test).round()
values_to_extract = values_to_extract.apply(np.int64)



In [9]:
train_set = full_data.dropna(subset=["phoneme"])
train_set[["i"]] = train_set[["i"]].fillna(value=-300)
test_set = pd.DataFrame()
for index, value in values_to_extract.items():
    sub_data_frame = train_set[train_set["phoneme"] == index]
    test_to_remove = sub_data_frame.sample(value)
    test_set = test_set.append(test_to_remove)
    train_set = train_set.drop(test_to_remove.index)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [10]:
FORMANTS_TO_EXTRACT = 5
formant_names = [f"f_{i+1}" for i in range(FORMANTS_TO_EXTRACT)]

formant_names.append("i")
print(formant_names)

['f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'i']


In [11]:
train_data = train_set[formant_names].fillna(train_set[formant_names].mean())


In [12]:
tree_classifier = tree.DecisionTreeClassifier(max_depth=6)
tree_classifier.fit(train_data, train_set['phoneme'])

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=6, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [13]:
test_data = test_set[formant_names].fillna(train_set[formant_names].mean())
predicted = tree_classifier.predict(test_data[formant_names])
score = accuracy_score(test_set["phoneme"], predicted)
score

0.8120728929384966

In [14]:
report = classification_report(test_set["phoneme"], predicted, labels=all_phonemes, digits=4)
print(report)

              precision    recall  f1-score   support

   non_vowel     0.8239    0.9294    0.8735       609
           u     1.0000    0.2000    0.3333        35
           a     0.7500    0.7330    0.7414       176
           i     0.9167    0.1897    0.3143        58
         nan     0.0000    0.0000    0.0000         0

   micro avg     0.8121    0.8121    0.8121       878
   macro avg     0.6981    0.4104    0.4525       878
weighted avg     0.8222    0.8121    0.7885       878



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
base_name = "external_vowels"

dt_target_names = [str(s) for s in full_data['phoneme'].unique()]
tree.export_graphviz(
    tree_classifier,
    out_file=f'{base_name}.dot',
    feature_names=formant_names,
    class_names=all_phonemes,
    filled=True
)

print(tree.export_text(tree_classifier))

|--- feature_0 <= 596.74
|   |--- feature_1 <= 1987.01
|   |   |--- feature_1 <= 725.28
|   |   |   |--- feature_5 <= 72.14
|   |   |   |   |--- feature_0 <= 413.94
|   |   |   |   |   |--- feature_4 <= 3262.24
|   |   |   |   |   |   |--- class: i
|   |   |   |   |   |--- feature_4 >  3262.24
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |   |--- feature_0 >  413.94
|   |   |   |   |   |--- feature_0 <= 485.19
|   |   |   |   |   |   |--- class: u
|   |   |   |   |   |--- feature_0 >  485.19
|   |   |   |   |   |   |--- class: non_vowel
|   |   |   |--- feature_5 >  72.14
|   |   |   |   |--- feature_4 <= 3391.41
|   |   |   |   |   |--- class: non_vowel
|   |   |   |   |--- feature_4 >  3391.41
|   |   |   |   |   |--- feature_2 <= 2041.65
|   |   |   |   |   |   |--- class: u
|   |   |   |   |   |--- feature_2 >  2041.65
|   |   |   |   |   |   |--- class: non_vowel
|   |   |--- feature_1 >  725.28
|   |   |   |--- feature_0 <= 459.62
|   |   |   |   |--- feature_5 <= 75

In [16]:

graph = pydotplus.graph_from_dot_file(f'{base_name}.dot')
print(type(graph))
with open(f"{base_name}.png", "wb+") as f:
    f.write(graph.create_png())

<class 'pydotplus.graphviz.Dot'>


In [18]:


with open(f"{base_name}.pickle", "wb+") as pickle_file:
    pickle.dump(tree_classifier, pickle_file)

In [17]:
with open(f"{base_name}.pickle", "wb+") as pickle_file:
    pickle.dump(tree_classifier, pickle_file)