In [2]:
import re
import pandas as pd

def extract_vowel_from_filename(filename):
    match = re.search(r'[a-zA-Z]', filename)
    if match:
        return match.group().lower()
    return None

def process_folder(input_folder, output_csv, sampling_rate=16000):
    data = []
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            samples = read_samples_from_txt(file_path)
            
            lpcas = extract_plp_features(samples, sampling_rate=sampling_rate)
            if lpcas is None:
                continue

            w, h = compute_lpc_filter_and_response(lpcas)

            formant_1_w, formant_2_w = find_formants_across_frames(w, h)

            # Convert to Hertz
            formant_1_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_1_w]
            formant_2_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_2_w]

            vowel = extract_vowel_from_filename(filename)
            if vowel is None:
                print(f"Could not extract vowel from {filename}")
                continue

            # Prepare data for CSV
            for f1, f2 in zip(formant_1_Hz, formant_2_Hz):
                if f1 is not None and f2 is not None:
                    data.append([f1, f2, vowel])
    
    # Save data to CSV
    df = pd.DataFrame(data, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Vowel'])
    df.to_csv(output_csv, index=False)
    print(f"Features saved to {output_csv}")

# Example usage
input_folder = r"Vowel_recordings"
output_csv = r"vowel_formant_features.csv"
process_folder(input_folder, output_csv)


Features saved to vowel_formant_features.csv


In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load the data
df = pd.read_csv(output_csv)

# Encode vowel labels
le = LabelEncoder()
df['Vowel_Encoded'] = le.fit_transform(df['Vowel'])

# Split the data into training and test sets
X = df[['Formant_1_Hz', 'Formant_2_Hz']]
y = df['Vowel_Encoded']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the Random Forest Regressor
rfr = RandomForestRegressor(n_estimators=100, random_state=42)
rfr.fit(X_train, y_train)

# Evaluate the model
score = rfr.score(X_test, y_test)
print(f"Model score: {score:.4f}")


Model score: 0.4432


In [25]:
import re
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import numpy as np
import math
import os
from scipy.signal import freqz
import peakutils
from sidekit.frontend.features import plp

def read_samples_from_txt(file_path):
    samples = []
    with open(file_path, 'r') as f:
        for line in f:
            samples.extend([float(value) for value in line.strip().split()])
    return np.array(samples)

def extract_plp_features(samples, sampling_rate=16000, rasta=False, plp_order=10):
    try:
        lpcas = plp(samples, fs=sampling_rate, rasta=rasta, plp_order=plp_order)
        return lpcas
    except Exception as e:
        print(f"Error extracting PLP features: {e}")
        return None

def compute_lpc_filter_and_response(lpc_coeffs):
    if lpc_coeffs is None:
        raise ValueError("LPC coefficients are not provided.")
    
    if isinstance(lpc_coeffs, (list, np.ndarray)):
        lpc_coeffs = np.array(lpc_coeffs)
        if lpc_coeffs.ndim != 2:
            raise ValueError("LPC coefficients must be a 2D array (frames x order).")
    
    num_frames, num_order = lpc_coeffs.shape
    all_w = []
    all_h = []
    
    for i in range(num_frames):
        lpc_filter = np.concatenate(([1], -lpc_coeffs[i]))
        w, h = freqz(lpc_filter)
        all_w.append(w)
        all_h.append(h)
    
    all_w = np.array(all_w)
    all_h = np.array(all_h)
    
    return all_w, all_h

def find_formants_across_frames(w, h):
    formant_1_w = []
    formant_2_w = []
    formant_3_w = []
    
    for i in range(w.shape[0]):
        h_db = 20 * np.log10(np.abs(h[i]))
        peak_indices = peakutils.indexes(h_db)

        if peak_indices.size > 2:
            formant_1_w.append(w[i][peak_indices[0]])
            formant_2_w.append(w[i][peak_indices[1]])
            formant_3_w.append(w[i][peak_indices[2]])
        else:
            formant_1_w.append(None)
            formant_2_w.append(None)
            formant_3_w.append(None)
    
    return formant_1_w, formant_2_w, formant_3_w

def extract_vowel_from_filename(filename):
    match = re.search(r'[a-zA-Z]', filename)
    if match:
        return match.group().lower()
    return None

def process_folder(input_folder, output_csv, sampling_rate=16000):
    data = []
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            samples = read_samples_from_txt(file_path)
            
            lpcas = extract_plp_features(samples, sampling_rate=sampling_rate)
            if lpcas is None:
                continue

            w, h = compute_lpc_filter_and_response(lpcas)

            formant_1_w, formant_2_w, formant_3_w = find_formants_across_frames(w, h)

            formant_1_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_1_w]
            formant_2_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_2_w]
            formant_3_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_3_w]

            vowel = extract_vowel_from_filename(filename)
            if vowel is None:
                print(f"Could not extract vowel from {filename}")
                continue

            for f1, f2, f3 in zip(formant_1_Hz, formant_2_Hz, formant_3_Hz):
                if f1 is not None and f2 is not None and f3 is not None:
                    data.append([f1, f2, f3, vowel])
    
    df = pd.DataFrame(data, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz', 'Vowel'])
    df.to_csv(output_csv, index=False)
    print(f"Features saved to {output_csv}")

def train_random_forest(output_csv):
    df = pd.read_csv(output_csv)
    le = LabelEncoder()
    df['Vowel_Encoded'] = le.fit_transform(df['Vowel'])
    X = df[['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz']]
    y = df['Vowel_Encoded']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    # Using the given hyperparameters directly
    best_params = {
        'max_depth': 20,
        'max_features': 'auto',
        'min_samples_leaf': 1,
        'min_samples_split': 5,
        'n_estimators': 100
    }

    rfc = RandomForestClassifier(random_state=42, **best_params)
    rfc.fit(X_train, y_train)
    
    # Evaluate model performance
    train_accuracy = rfc.score(X_train, y_train)
    test_accuracy = rfc.score(X_test, y_test)
    print(f"Train accuracy: {train_accuracy:.2f}")
    print(f"Test accuracy: {test_accuracy:.2f}")

    # Cross-validation
    cv_scores = cross_val_score(rfc, X, y, cv=5)
    print(f"Cross-validation accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    return rfc, le

def classify_files(input_folder, model, le, sampling_rate=16000, output_csv="predicted_vowels.csv"):
    results = []

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            samples = read_samples_from_txt(file_path)
            
            lpcas = extract_plp_features(samples, sampling_rate=sampling_rate)
            if lpcas is None:
                continue

            w, h = compute_lpc_filter_and_response(lpcas)

            formant_1_w, formant_2_w, formant_3_w = find_formants_across_frames(w, h)

            formant_1_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_1_w]
            formant_2_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_2_w]
            formant_3_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_3_w]

            frame_features = []
            for f1, f2, f3 in zip(formant_1_Hz, formant_2_Hz, formant_3_Hz):
                if f1 is not None and f2 is not None and f3 is not None:
                    frame_features.append([f1, f2, f3])

            if not frame_features:
                continue

            frame_features_df = pd.DataFrame(frame_features, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz'])
            predicted_vowels = model.predict(frame_features_df)
            predicted_vowels = le.inverse_transform(predicted_vowels.astype(int))

            predicted_vowel, count = np.unique(predicted_vowels, return_counts=True)
            classified_vowel = predicted_vowel[np.argmax(count)]

            actual_vowel = extract_vowel_from_filename(filename)
            results.append((filename, actual_vowel, classified_vowel))

    df_results = pd.DataFrame(results, columns=["Filename", "Actual Vowel", "Predicted Vowel"])
    df_results.to_csv(output_csv, index=False)
    print(f"Predicted vowels saved to {output_csv}")

    correct_predictions = df_results[df_results['Actual Vowel'] == df_results['Predicted Vowel']].shape[0]
    total_predictions = df_results.shape[0]
    accuracy = (correct_predictions / total_predictions) * 100
    print(f"Accuracy: {accuracy:.2f}%")

# Example usage
input_folder = r"Vowel_recordings"
output_csv = r"vowel_formant_features.csv"
process_folder(input_folder, output_csv)

rfc, le = train_random_forest(output_csv)
classify_files(input_folder, rfc, le)


Features saved to vowel_formant_features.csv
Train accuracy: 0.95
Test accuracy: 0.74
Cross-validation accuracy: 0.72 (+/- 0.03)
Predicted vowels saved to predicted_vowels.csv
Accuracy: 100.00%


In [46]:
import re
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import math
import os
from scipy.signal import freqz
import peakutils
from sidekit.frontend.features import plp
import pickle

def read_samples_from_txt(file_path):
    samples = []
    with open(file_path, 'r') as f:
        for line in f:
            samples.extend([float(value) for value in line.strip().split()])
    return np.array(samples)

def extract_plp_features(samples, sampling_rate=16000, rasta=False, plp_order=10):
    try:
        lpcas = plp(samples, fs=sampling_rate, rasta=rasta, plp_order=plp_order)
        return lpcas
    except Exception as e:
        print(f"Error extracting PLP features: {e}")
        return None

def compute_lpc_filter_and_response(lpc_coeffs):
    if lpc_coeffs is None:
        raise ValueError("LPC coefficients are not provided.")
    
    if isinstance(lpc_coeffs, (list, np.ndarray)):
        lpc_coeffs = np.array(lpc_coeffs)
        if lpc_coeffs.ndim != 2:
            raise ValueError("LPC coefficients must be a 2D array (frames x order).")
    
    num_frames, num_order = lpc_coeffs.shape
    all_w = []
    all_h = []
    
    for i in range(num_frames):
        lpc_filter = np.concatenate(([1], -lpc_coeffs[i]))
        w, h = freqz(lpc_filter)
        all_w.append(w)
        all_h.append(h)
    
    all_w = np.array(all_w)
    all_h = np.array(all_h)
    
    return all_w, all_h

def find_formants_across_frames(w, h):
    formant_1_w = []
    formant_2_w = []
    formant_3_w = []
    
    for i in range(w.shape[0]):
        h_db = 20 * np.log10(np.abs(h[i]))
        peak_indices = peakutils.indexes(h_db)

        if peak_indices.size > 2:
            formant_1_w.append(w[i][peak_indices[0]])
            formant_2_w.append(w[i][peak_indices[1]])
            formant_3_w.append(w[i][peak_indices[2]])
        else:
            formant_1_w.append(None)
            formant_2_w.append(None)
            formant_3_w.append(None)
    
    return formant_1_w, formant_2_w, formant_3_w

def extract_vowel_from_filename(filename):
    match = re.search(r'[a-zA-Z]', filename)
    if match:
        return match.group().lower()
    return None

def process_folder(input_folder, output_csv, sampling_rate=16000):
    data = []
    
    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            samples = read_samples_from_txt(file_path)
            
            lpcas = extract_plp_features(samples, sampling_rate=sampling_rate)
            if lpcas is None:
                continue

            w, h = compute_lpc_filter_and_response(lpcas)

            formant_1_w, formant_2_w, formant_3_w = find_formants_across_frames(w, h)

            formant_1_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_1_w]
            formant_2_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_2_w]
            formant_3_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_3_w]

            vowel = extract_vowel_from_filename(filename)
            if vowel is None:
                print(f"Could not extract vowel from {filename}")
                continue

            for f1, f2, f3 in zip(formant_1_Hz, formant_2_Hz, formant_3_Hz):
                if f1 is not None and f2 is not None and f3 is not None:
                    data.append([f1, f2, f3, vowel])
    
    df = pd.DataFrame(data, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz', 'Vowel'])
    df.to_csv(output_csv, index=False)
    print(f"Features saved to {output_csv}")

def train_random_forest(output_csv):
    df = pd.read_csv(output_csv)
    le = LabelEncoder()
    df['Vowel_Encoded'] = le.fit_transform(df['Vowel'])
    X = df[['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz']]
    y = df['Vowel_Encoded']
    
    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Using the given hyperparameters directly
    best_params = {
        'max_depth': 5,
        'max_features': 'auto',
        'min_samples_leaf': 1,
        'min_samples_split': 5,
        'n_estimators': 100
    }

    rfc = RandomForestClassifier(random_state=42, **best_params)
    rfc.fit(X_train, y_train)
    
    # Evaluate model performance
    train_accuracy = rfc.score(X_train, y_train)
    test_accuracy = rfc.score(X_test, y_test)
    print(f"Train accuracy: {train_accuracy:.2f}")
    print(f"Test accuracy: {test_accuracy:.2f}")

    # Cross-validation
    cv_scores = cross_val_score(rfc, X, y, cv=5)
    print(f"Cross-validation accuracy: {cv_scores.mean():.2f} (+/- {cv_scores.std():.2f})")

    # Save the model and label encoder
    with open('vowel_classifier_model.pkl', 'wb') as f:
        pickle.dump((rfc, le), f)

    return rfc, le

def classify_files(input_folder, model, le, sampling_rate=16000, output_csv="predicted_vowels.csv"):
    results = []

    for filename in os.listdir(input_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(input_folder, filename)
            samples = read_samples_from_txt(file_path)
            
            lpcas = extract_plp_features(samples, sampling_rate=sampling_rate)
            if lpcas is None:
                continue

            w, h = compute_lpc_filter_and_response(lpcas)

            formant_1_w, formant_2_w, formant_3_w = find_formants_across_frames(w, h)

            formant_1_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_1_w]
            formant_2_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_2_w]
            formant_3_Hz = [(x * sampling_rate) / (2 * math.pi) if x is not None else None for x in formant_3_w]

            frame_features = []
            for f1, f2, f3 in zip(formant_1_Hz, formant_2_Hz, formant_3_Hz):
                if f1 is not None and f2 is not None and f3 is not None:
                    frame_features.append([f1, f2, f3])

            if not frame_features:
                continue

            frame_features_df = pd.DataFrame(frame_features, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz'])
            predicted_vowels = model.predict(frame_features_df)
            predicted_vowels = le.inverse_transform(predicted_vowels.astype(int))

            predicted_vowel, count = np.unique(predicted_vowels, return_counts=True)
            classified_vowel = predicted_vowel[np.argmax(count)]

            actual_vowel = extract_vowel_from_filename(filename)
            results.append((filename, actual_vowel, classified_vowel))

    df_results = pd.DataFrame(results, columns=["Filename", "Actual Vowel", "Predicted Vowel"])
    df_results.to_csv(output_csv, index=False)
    print(f"Predicted vowels saved to {output_csv}")

    correct_predictions = df_results[df_results['Actual Vowel'] == df_results['Predicted Vowel']].shape[0]
    total_predictions = df_results.shape[0]
    accuracy = (correct_predictions / total_predictions) * 100
    print(f"Accuracy: {accuracy:.2f}%")

# Example usage
input_folder = r"Vowel_recordings"
output_csv = r"vowel_formant_features.csv"
process_folder(input_folder, output_csv)

rfc, le = train_random_forest(output_csv)
classify_files(input_folder, rfc, le)


Features saved to vowel_formant_features.csv
Train accuracy: 0.71
Test accuracy: 0.70
Cross-validation accuracy: 0.69 (+/- 0.01)
Predicted vowels saved to predicted_vowels.csv
Accuracy: 99.33%


In [10]:
import sounddevice as sd
import numpy as np
import pandas as pd
import pickle
from scipy.signal import freqz
import peakutils
from sidekit.frontend.features import plp



# Load the model and label encoder
with open('vowel_classifier_model.pkl', 'rb') as f:
    rfc, le = pickle.load(f)

def record_audio(duration, fs):
    print("Recording...")
    audio = sd.rec(int(duration * fs), samplerate=fs, channels=1, dtype='float64')
    sd.wait()
    print("Recording complete")
    audio = audio.flatten()
    return audio

def trim_audio(audio, fs, threshold):
    max_amplitude = np.max(np.abs(audio))
    amplitude_threshold = threshold * max_amplitude
    audio[np.abs(audio) < amplitude_threshold] = 0
    start_idx = np.argmax(audio > amplitude_threshold)
    end_idx = len(audio) - np.argmax(audio[::-1] > amplitude_threshold)
    return audio[start_idx:end_idx]

def predict_vowel(audio, fs, model, le):
    lpcas = extract_plp_features(audio, sampling_rate=fs)
    if lpcas is None:
        return None

    w, h = compute_lpc_filter_and_response(lpcas)
    formant_1_w, formant_2_w, formant_3_w = find_formants_across_frames(w, h)

    formant_1_Hz = [(x * fs) / (2 * np.pi) if x is not None else None for x in formant_1_w]
    formant_2_Hz = [(x * fs) / (2 * np.pi) if x is not None else None for x in formant_2_w]
    formant_3_Hz = [(x * fs) / (2 * np.pi) if x is not None else None for x in formant_3_w]

    frame_features = []
    for f1, f2, f3 in zip(formant_1_Hz, formant_2_Hz, formant_3_Hz):
        if f1 is not None and f2 is not None and f3 is not None:
            frame_features.append([f1, f2, f3])

    if not frame_features:
        return None

    frame_features_df = pd.DataFrame(frame_features, columns=['Formant_1_Hz', 'Formant_2_Hz', 'Formant_3_Hz'])
    predicted_vowels = model.predict(frame_features_df)
    predicted_vowels = le.inverse_transform(predicted_vowels.astype(int))

    predicted_vowel, count = np.unique(predicted_vowels, return_counts=True)
    return predicted_vowel[np.argmax(count)]

# Parameters
duration = 5  # seconds
fs = 16000  # sampling rate
threshold = 0.01

# Record and trim audio
audio = record_audio(duration, fs)
trimmed_audio = trim_audio(audio, fs, threshold)

# Predict vowel

predicted_vowel = predict_vowel(trimmed_audio, fs, rfc, le)
print(f"Predicted Vowel: {predicted_vowel}")


ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject

In [11]:
!pip uninstall pandas
!pip install pandas
