<a href="https://colab.research.google.com/github/pietrodileo/Python_for_MD_thesis/blob/main/ExtractFormantsParselmouth_Iterative.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install the Parselmouth library to run Praat scripts from Python

In [None]:
!pip install -U praat-parselmouth

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting praat-parselmouth
  Downloading praat_parselmouth-0.4.2-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (10.7 MB)
[K     |████████████████████████████████| 10.7 MB 15.9 MB/s 
Installing collected packages: praat-parselmouth
Successfully installed praat-parselmouth-0.4.2


In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import parselmouth 
from parselmouth import praat

import glob
import os.path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from pathlib import Path


# Define local functions

Extract F0 from audio data

In [None]:
def f0Extract(snd, f0min=75, f0max=300):
  """Extract the F0 contour and the periodic point process of a sound.

  Parameters
  ----------
  snd : parselmouth.Sound
      Sound object to analyse.
  f0min : float, optional
      Pitch floor in Hz passed to Praat (default 75, the value that was
      previously hard-coded).
  f0max : float, optional
      Pitch ceiling in Hz passed to Praat (default 300, the value that was
      previously hard-coded).

  Returns
  -------
  f0 : array
      The 'frequency' field of the Pitch object's ``selected_array``.
  f0times : array
      The values returned by ``pitch.xs()`` for the same Pitch object.
  pointProcess : Praat object
      Result of "To PointProcess (periodic, cc)" computed on ``snd`` with the
      same floor and ceiling.
  """
  # Create a Praat Pitch object; the first argument (time step) is passed as 0.0.
  pitch = praat.call(snd, "To Pitch", 0.0, f0min, f0max)
  # NOTE(review): unvoiced frames are presumably reported as 0 Hz here rather
  # than NaN - confirm, since downstream statistics would then include zeros.
  f0 = pitch.selected_array['frequency']
  f0times = pitch.xs()
  pointProcess = praat.call(snd, "To PointProcess (periodic, cc)", f0min, f0max)
  return f0, f0times, pointProcess

Extract the formants

In [None]:
def formantExtraction(snd, pointProcess=None):
  """Sample formants F1..F5 of a sound at the instants of a point process.

  Parameters
  ----------
  snd : parselmouth.Sound
      Sound object to analyse.
  pointProcess : Praat PointProcess object, optional
      Instants at which the formants are sampled. When omitted, it is built
      from ``snd`` with "To PointProcess (periodic, cc)" using a 75-300 Hz
      pitch range, so the function no longer depends on a module-level
      ``pointProcess`` variable having been set by an earlier call.

  Returns
  -------
  f1_list, f2_list, f3_list, f4_list, f5_list : list
      One value per point, as returned by "Get value at time" for formant
      numbers 1 to 5 ('Hertz', 'Linear').
  times : list
      Time of each point, as returned by "Get time from index".
  """
  formNum = 5
  formants = praat.call(snd, "To Formant (burg)", 0.0025, formNum, 5000, 0.025, 50)
  if pointProcess is None:
    # Derive the sampling instants from this very sound instead of reading a
    # global that may be undefined or left over from a previous file.
    pointProcess = praat.call(snd, "To PointProcess (periodic, cc)", 75, 300)
  numPoints = praat.call(pointProcess, "Get number of points")

  times = []
  # One list per formant: tracks[0] holds F1, ..., tracks[4] holds F5.
  tracks = [[] for _ in range(formNum)]
  # The point-process queries are called with indices starting at 1.
  for point in range(1, numPoints + 1):
    t = praat.call(pointProcess, "Get time from index", point)
    times.append(t)
    for formantNumber, track in enumerate(tracks, start=1):
      value = praat.call(formants, "Get value at time", formantNumber, t, 'Hertz', 'Linear')
      track.append(value)

  f1_list, f2_list, f3_list, f4_list, f5_list = tracks
  return f1_list,f2_list,f3_list,f4_list,f5_list,times

Extract Features 

In [None]:
def FeatureExtraction(f_list,times,doPlot):
  """Compute eight summary statistics of a contour sampled over time.

  Parameters
  ----------
  f_list : sequence of float
      Contour values (may contain NaN).
  times : sequence of float
      Sampling instants, one per value in ``f_list``; used as the regressor
      of the linear fit.
  doPlot : int
      When equal to 1, print the fitted intercept/coefficient and show a
      scatter plot of the data with the fitted line.

  Returns
  -------
  pandas.DataFrame
      A single-row frame with columns 'mean', 'std.dev', 'perc95', 'perc75',
      'perc25', 'perc5', 'rangePerc' (95th minus 5th percentile) and
      'slopeLinFit' (slope of the linear regression of values on times).
      Every column is NaN when the input is empty or entirely NaN.
  """
  columns = ['mean','std.dev','perc95','perc75','perc25','perc5','rangePerc','slopeLinFit']
  dataf = np.asarray(f_list, dtype=float)
  xval = np.array(times).reshape(-1, 1)

  # Nothing usable (all NaN, or empty input): return a row of NaNs.
  if np.isnan(dataf).all():
    return pd.DataFrame(np.full((1, len(columns)), np.nan), columns=columns)

  # Missing samples are imputed with the mean of the valid ones so that the
  # regression below can be fitted on the full time axis.
  if np.isnan(dataf).any():
    dataf = np.nan_to_num(dataf,nan=np.nanmean(dataf))

  media = np.nanmean(dataf)
  stddev = np.nanstd(dataf)
  perc95 = np.nanpercentile(dataf,95)
  perc75 = np.nanpercentile(dataf,75)
  perc25 = np.nanpercentile(dataf,25)
  perc5 = np.nanpercentile(dataf,5)
  rangePerc = perc95 - perc5

  lm = LinearRegression()
  lm.fit(xval, dataf)
  slope = lm.coef_

  if doPlot == 1:
    print(lm.intercept_)
    print(lm.coef_)
    # Define the line
    line = lm.predict(xval)
    # Plot outputs
    plt.scatter(xval, dataf, color="black")
    plt.plot(xval, line, color="blue", linewidth=3)
    plt.xticks(())
    plt.yticks(())
    plt.show()

  # Keep the slope as a float: converting it with int() truncated the value
  # (e.g. 0.5 became 0) and discarded the feature's information.
  featuresExtracted = [media, stddev, perc95, perc75, perc25, perc5, rangePerc, float(slope[0])]
  featuresDf = pd.DataFrame([featuresExtracted], columns=columns)
  return featuresDf

Teager-Kaiser Energy Operator (compute instantaneous energy)

In [None]:
# Teager-Kaiser Energy Operator Calculation
def TKEO_calc(data_list):
  """Return the Teager-Kaiser energy x[n]*x[n] - x[n-1]*x[n+1] of a sequence.

  The output has the same length as the input. The first three and the last
  three samples are left at zero; inputs with fewer than seven samples
  therefore yield an all-zero array.
  """
  samples = np.array(data_list)
  count = len(samples)
  energy = np.zeros(count)
  # Lags are (0, 0, 1, -1): the sum of their absolute values (2) plus one
  # gives the first computed index, and the same margin is kept at the end.
  first = 3
  stop = count - 3
  if stop > first:
    centre = samples[first:stop]
    previous = samples[first - 1:stop - 1]
    following = samples[first + 1:stop + 1]
    energy[first:stop] = centre * centre - previous * following
  return energy

Create Dataframe

In [None]:
# input = f1_list, x = times, task = 'F1'
def DataframeOrganization(input, x, task, doPlot):
  """Build one row of features for a contour and for its TKEO transform.

  FeatureExtraction is applied to ``input`` and then to TKEO_calc(input),
  both against the time axis ``x``. Column names of the first set are
  prefixed with ``task + '_'`` and those of the second with
  ``task + '__TKEO_'``; the two single-row frames are joined side by side.
  """
  plainFeatures = FeatureExtraction(input, x, doPlot)
  baseNames = list(plainFeatures.columns)
  plainFeatures.columns = [task + '_' + label for label in baseNames]

  energyFeatures = FeatureExtraction(TKEO_calc(input), x, doPlot)
  # NOTE(review): this prefix produces a doubled underscore (e.g.
  # 'F1__TKEO_mean'); kept as is because the names end up in the output file.
  energyFeatures.columns = [task + '_' + '_TKEO_' + label for label in baseNames]

  return pd.concat([plainFeatures, energyFeatures], axis=1, join="inner")

# Extract the following features from F0 and from each formant:


1.   Mean
2.   Std.Dev
3.   Range 95th percentile - 5th percentile
4.   95th percentile
5.   75th percentile
6.   25th percentile
7.   5th percentile
8.   Angular coefficient of linear regression 
9.   TKEO mean
10.  TKEO std.dev
11.  TKEO range 95th percentile - 5th percentile
12.  TKEO 95th percentile
13.  TKEO 75th percentile
14.  TKEO 25th percentile
15.  TKEO 5th percentile
16.  TKEO Angular coefficient of linear regression 



In [None]:
# Build one row of features per .wav file found in the "audio" folder and
# stack the rows into FinalResult.
FinalResult = pd.DataFrame([])
doPlot = 0
perFileResults = []
# The pattern "**" means all subdirectories recursively,
# with "*.wav" meaning all files with any name ending in ".wav" -> glob("**/*.wav")
# glob returns files in arbitrary order; sorting makes the row order reproducible.
for wave_file in sorted(glob.glob("audio/*.wav")):
    print("Processing {}...".format(wave_file))
    snd = parselmouth.Sound(wave_file)
    # Keep f0Extract first: it rebinds the module-level pointProcess for the
    # current file, which code reading that global relies on.
    f0, f0times, pointProcess  = f0Extract(snd)
    f1_list,f2_list,f3_list,f4_list,f5_list,times = formantExtraction(snd)
    tracks = [
        ('F0', f0, f0times),
        ('F1', f1_list, times),
        ('F2', f2_list, times),
        ('F3', f3_list, times),
        ('F4', f4_list, times),
        ('F5', f5_list, times),
    ]
    featureFrames = [DataframeOrganization(values, instants, label, doPlot)
                     for label, values, instants in tracks]
    # File name up to its first dot, independent of the folder depth and of
    # the path separator used by the platform.
    istance = Path(wave_file).name.split('.')[0]
    nameDf = pd.DataFrame([istance],columns=['Name'])
    Result = pd.concat([nameDf] + featureFrames, axis=1, join="inner")
    perFileResults.append(Result)

# Concatenate once instead of growing the frame inside the loop; with no
# input files FinalResult stays an empty DataFrame.
if perFileResults:
    FinalResult = pd.concat(perFileResults)

Processing audio/HC_TesiPDL_VowelE_0265_E1_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0092_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0043_E_3_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0052_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0044_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0053_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0049_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0093_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0048_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0091_E_3_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0094_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0265_E_2_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0045_E_3_FineCut_LowCut12kHz.wav...
Processing audio/HC_TesiPDL_VowelE_0043_E_2_FineCut_LowCut12kHz.wav...
Process

Save output

In [None]:
# Destination folder on the mounted Drive, then the full path of the workbook.
Output = '/content/drive/MyDrive/TesiMagistrale/'
outputName = os.path.join(Output, "FinalResult_new2.xlsx")
# Write the collected features to a single Excel sheet.
FinalResult.to_excel(outputName, sheet_name='Sheet_name_1')