In [2]:
import os
from python_speech_features import mfcc, delta
from python_speech_features import logfbank
import scipy.io.wavfile as wav
from scipy.spatial.distance import euclidean
from fastdtw import fastdtw
import numpy as np
from pydub import AudioSegment

In [3]:
TEMPLATE_DIR = os.path.join(os.getcwd(), 'template')
TESTING_DIR = os.path.join(os.getcwd(), 'testing')

VOWEL_LIST = ['a', 'e', 'i', 'o', 'u']
DATA_DICT = dict()

for vowel in VOWEL_LIST:
  temp_dict = {
    'template' : os.path.join(TEMPLATE_DIR, vowel),
    'testing' : os.path.join(TESTING_DIR, vowel)
  }
  DATA_DICT[vowel] = temp_dict

# List all the data into a single dictionary
for k, v in DATA_DICT.items():
  print(k, v)

a {'template': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\template\\a', 'testing': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\testing\\a'}
e {'template': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\template\\e', 'testing': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\testing\\e'}
i {'template': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\template\\i', 'testing': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\testing\\i'}
o {'template': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\template\\o', 'testing': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\testing\\o'}
u {'template': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\template\\u', 'testing': 'd:\\Folder_Kuliah_Cadangan\\Sems_7\\Speech\\speech-dtw\\testing\\u'}


In [4]:
# Extract si mfcc dari file path
def extract_mfcc(file_path):
  rate,signal = wav.read(file_path)
  mfcc_features = mfcc(signal, rate, numcep=13)
  mfcc_delta = delta(mfcc_features, 2)
  mfcc_delta_delta = delta(mfcc_delta, 2)
  return np.hstack((mfcc_features, mfcc_delta, mfcc_delta_delta))

In [5]:
# Hitung jarak dari 1 test ke 1 template
def calculate_dtw(template, test):
  distance, _ = fastdtw(template, test, dist=euclidean)
  return distance

In [42]:
# Cocokin 1 data test ke templates
def speech_recognition(test, templates):
  distances = []
  for vowel, template in templates.items():
    distance = calculate_dtw(template, test)
    distances.append((vowel, distance))
  # print(distances)
  recognized_vowel = min(distances, key=lambda x: x[1])[0]
  return recognized_vowel

In [13]:
# Convert .m4a to .wav
for k, v in DATA_DICT.items():
  print(f"[PROCESS] Converting {k}")
  template_files = os.listdir(v['template'])
  testing_files = os.listdir(v['testing'])

  adjusted_template_files = [os.path.join(v['template'], file) for file in template_files]
  adjusted_testing_files = [os.path.join(v['testing'], file) for file in testing_files]

  for file in adjusted_template_files:
    m4a_audio = AudioSegment.from_file(file, format="m4a")
    out_file = file.replace(".m4a", ".wav")
    m4a_audio.export(out_file, format="wav")

  for file in adjusted_testing_files:
    m4a_audio = AudioSegment.from_file(file, format="m4a")
    out_file = file.replace(".m4a", ".wav")
    m4a_audio.export(out_file, format="wav")

[PROCESS] Converting a
[PROCESS] Converting e
[PROCESS] Converting i
[PROCESS] Converting o
[PROCESS] Converting u


In [34]:
# PROCESS TEMPLATES
template1 = dict()
template2 = dict()
template3 = dict()
for k, v in DATA_DICT.items():
  wav_template_files = list(filter(lambda x: '.wav' in x, os.listdir(v['template'])))
  adjusted_wav_template_files = [os.path.join(v['template'], file) for file in wav_template_files]

  for i in range (len(adjusted_wav_template_files)):
    template_file = adjusted_wav_template_files[i]
    template_file_res = extract_mfcc(template_file)
    
    if (i == 0):
      print(f"[PROCESSING] {i} | Template {template_file}")
      template1[k] = template_file_res
    elif (i == 1):
      print(f"[PROCESSING] {i} | Template {template_file}")
      template2[k] = template_file_res
    else: # i == 2
      print(f"[PROCESSING] {i} | Template {template_file}")
      template3[k] = template_file_res

# for k, v in template1.items():
#   print(k,v)
# for k, v in template2.items():
#   print(k,v)
# for k, v in template3.items():
#   print(k,v)
  





[PROCESSING] 0 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\a\a-(template).wav
[PROCESSING] 1 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\a\A-Template.wav
[PROCESSING] 2 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\a\a.wav
[PROCESSING] 0 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\e\e-(template).wav
[PROCESSING] 1 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\e\E-Template.wav
[PROCESSING] 2 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\e\e.wav
[PROCESSING] 0 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\i\I(template).wav




[PROCESSING] 1 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\i\I-Template.wav
[PROCESSING] 2 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\i\i.wav
[PROCESSING] 0 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\o\o(template).wav
[PROCESSING] 1 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\o\O-Template.wav
[PROCESSING] 2 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\o\o.wav
[PROCESSING] 0 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\u\u-(template).wav
[PROCESSING] 1 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\u\U-Template.wav
[PROCESSING] 2 | Template d:\Folder_Kuliah_Cadangan\Sems_7\Speech\speech-dtw\template\u\u.wav


In [43]:
template_speaker_list = ["Nathan", "Juan", "Nathania"]
summary_template_data = {
  "Nathan" : {
    "a" : [],
    "i" : [],
    "u" : [],
    "e" : [],
    "o" : []
  },
  "Juan"   : {
    "a" : [],
    "i" : [],
    "u" : [],
    "e" : [],
    "o" : []
  },
  "Nathania" : {
    "a" : [],
    "i" : [],
    "u" : [],
    "e" : [],
    "o" : []
  },
}

# PROCESS TESTING
for k, v in DATA_DICT.items():
  print(f"[PROCESS] Evaluating {k}")
  wav_testing_files = list(filter(lambda x: '.wav' in x, os.listdir(v['testing'])))
  adjusted_wav_testing_files = [os.path.join(v['testing'], file) for file in wav_testing_files]

  for i in range(len(adjusted_wav_testing_files)):
    testing_file = adjusted_wav_testing_files[i]
    testing_file_res = extract_mfcc(testing_file) 
    # Template 1
    recognized_vowel_1 = speech_recognition(testing_file_res, template1)
    # print(f"""Template Vowel: {k}  Speaker {template_speaker_list[0]} \n\t Detected Vowel: {recognized_vowel_1} Speaker {template_speaker_list[i]} """)

    # Template 2
    recognized_vowel_2 = speech_recognition(testing_file_res, template2)
    # print(f"""Template Vowel: {k}  Speaker {template_speaker_list[1]} \n\t Detected Vowel: {recognized_vowel_2} Speaker {template_speaker_list[i]} """)

    # Template 3
    recognized_vowel_3 = speech_recognition(testing_file_res, template3)
    # print(f"""Template Vowel: {k}  Speaker {template_speaker_list[2]} \n\t Detected Vowel: {recognized_vowel_3} Speaker {template_speaker_list[i]} """)


    summary_template_data[template_speaker_list[0]][k].append({
      "Detected Vowel" : recognized_vowel_1,
      "Speaker" : template_speaker_list[i]
    })
    summary_template_data[template_speaker_list[1]][k].append({
      "Detected Vowel" : recognized_vowel_2,
      "Speaker" : template_speaker_list[i]
    })
    summary_template_data[template_speaker_list[2]][k].append({
      "Detected Vowel" : recognized_vowel_3,
      "Speaker" : template_speaker_list[i]
    })


for k, v in summary_template_data.items():
  print(k)
  for k_in, v_in in v.items():
    print(k_in)
    for data in v_in:
      print(data)



[PROCESS] Evaluating a




[PROCESS] Evaluating e




[PROCESS] Evaluating i




[PROCESS] Evaluating o




[PROCESS] Evaluating u




Nathan
a
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'o', 'Speaker': 'Juan'}
{'Detected Vowel': 'e', 'Speaker': 'Nathania'}
i
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'e', 'Speaker': 'Juan'}
{'Detected Vowel': 'e', 'Speaker': 'Nathania'}
u
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'e', 'Speaker': 'Juan'}
{'Detected Vowel': 'e', 'Speaker': 'Nathania'}
e
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'e', 'Speaker': 'Juan'}
{'Detected Vowel': 'e', 'Speaker': 'Nathania'}
o
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'o', 'Speaker': 'Juan'}
{'Detected Vowel': 'e', 'Speaker': 'Nathania'}
Juan
a
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'a', 'Speaker': 'Juan'}
{'Detected Vowel': 'o', 'Speaker': 'Nathania'}
i
{'Detected Vowel': 'o', 'Speaker': 'Nathan'}
{'Detected Vowel': 'i', 'Speaker': 'Juan'}
{'Detected Vowel': 'o', 'Speaker': 'Nathania'}
u
{'Detected Vowel': 'o', 'Sp

In [53]:
summary_final_data = {
  "Nathan" : {},
  "Juan" : {},
  "Nathania" : {}
}

for speaker_template, template_data in summary_template_data.items():
  template_sum_accuracy = 0
  for vowel_template, testing_data_list in template_data.items():
    count = 0
    correct_count = 0
    for data in testing_data_list:
      count += 1
      if (data['Detected Vowel'] == vowel_template):
        correct_count += 1

    vowel_key_name = f"accuracy_{vowel_template}"
    vowel_accuracy = correct_count / count * 100
    summary_final_data[speaker_template][vowel_key_name] = f"{vowel_accuracy:.2f}"

    template_sum_accuracy += vowel_accuracy

  template_accuracy = template_sum_accuracy / 5
  summary_final_data[speaker_template]['template_accuracy'] = f"{template_accuracy:.2f}"


for speaker, accuracy_dict in summary_final_data.items():
  print(speaker)
  for k, v in accuracy_dict.items():
    print(k, v)
  print()
    

Nathan
accuracy_a 0.00
accuracy_i 0.00
accuracy_u 0.00
accuracy_e 66.67
accuracy_o 66.67
template_accuracy 26.67

Juan
accuracy_a 33.33
accuracy_i 33.33
accuracy_u 0.00
accuracy_e 33.33
accuracy_o 100.00
template_accuracy 40.00

Nathania
accuracy_a 33.33
accuracy_i 33.33
accuracy_u 33.33
accuracy_e 33.33
accuracy_o 33.33
template_accuracy 33.33

