# Simple demonstration and exploration

In [17]:
!pip install torchaudio transformers librosa pydub jiwer



In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
import torchaudio
import librosa
import numpy as np

def load_audio(file_path, sr=16000):
    """Load an audio file and return the waveform and sample rate."""
    waveform, sample_rate = librosa.load(file_path, sr=sr)
    return waveform, sample_rate

# testing the function on an audio
audio_path = "data/l2arctic_release_v5.0/ABA/ABA/wav/arctic_a0001.wav"
waveform, sample_rate = load_audio(audio_path)
print(f"Loaded audio with sample rate: {sample_rate}")

Loaded audio with sample rate: 16000


In [3]:
import os
def get_speaker_files(root_folder, speaker):
    """Retrieve all wav and transcript file paths for a given speaker, handling nested folder structure."""
    speaker_folder = os.path.join(root_folder, speaker, speaker)  # Adjusted path

    wav_dir = os.path.join(speaker_folder, "wav")
    transcript_dir = os.path.join(speaker_folder, "transcript")

    if not os.path.exists(wav_dir) or not os.path.exists(transcript_dir):
        print(f"Missing folders in {speaker_folder}")
        return None

    audio_files = sorted([os.path.join(wav_dir, f) for f in os.listdir(wav_dir) if f.endswith('.wav')])
    transcript_files = sorted([os.path.join(transcript_dir, f) for f in os.listdir(transcript_dir) if f.endswith('.txt')])

    return list(zip(audio_files, transcript_files))

# Example usage
root_folder = "data/l2arctic_release_v5.0"
speaker = "ABA"  # Example speaker
paired_files = get_speaker_files(root_folder, speaker)

if paired_files:
    print(f"Found {len(paired_files)} audio-transcript pairs for speaker {speaker}.")

Found 1129 audio-transcript pairs for speaker ABA.


In [4]:
def load_transcript(transcript_path):
    """Load transcript from a text file."""
    with open(transcript_path, "r", encoding="utf-8") as f:
        return f.read().strip()

# testing 
sample_audio, sample_transcript = paired_files[0]
print(f"Audio file: {sample_audio}")
print(f"Transcript: {load_transcript(sample_transcript)}")

Audio file: data/l2arctic_release_v5.0/ABA/ABA/wav/arctic_a0001.wav
Transcript: Author of the danger trail Philip Steels etc


Let us now load the data for all the different speakers.

In [5]:
def get_all_speakers_data(root_folder):
    """Retrieve all audio-text pairs from all speakers with nested folder structure."""
    speakers = [sp for sp in os.listdir(root_folder) if os.path.isdir(os.path.join(root_folder, sp))]
    all_data = []

    for speaker in speakers:
        speaker_data = get_speaker_files(root_folder, speaker)
        if speaker_data:
            all_data.extend(speaker_data)

    return all_data

# Example usage
dataset_files = get_all_speakers_data(root_folder)
print(f"Total dataset size: {len(dataset_files)} audio-transcript pairs.")

Total dataset size: 26889 audio-transcript pairs.


The data is loaded

## Automatic Transcription Using Whisper

In [6]:
!pip install  tf-keras transformers



In [7]:
from transformers import pipeline

# Load Whisper model
whisper_asr = pipeline("automatic-speech-recognition", model="openai/whisper-small")

def transcribe_audio(file_path):
    """Transcribe an audio file using Whisper, handling long-form audio."""
    transcription = whisper_asr(file_path, return_timestamps=True)["text"]
    return transcription


# Example usage (using the first audio file)
sample_audio, sample_transcript_path = dataset_files[0]
whisper_transcription = transcribe_audio(sample_audio)

print("Ground Truth:", open(sample_transcript_path).read().strip())
print("Whisper Transcription:", whisper_transcription)

Device set to use mps:0
Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English.This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`.


Ground Truth: Author of the danger trail Philip Steels etc
Whisper Transcription:  author of the Danger Trail, Philip Steele's, etc.


In [8]:
from jiwer import wer

# Define ground truth and Whisper transcription
ground_truth_text = "Author of the danger trail Philip Steels etc"
whisper_text = "author of the Danger Trail, Philip Steele's etc."


## Extracting Fluency Features

### Speech rate

In [9]:
import librosa

def compute_speech_rate(audio_path, transcript):
    """Compute words per second from an audio file."""
    waveform, sr = librosa.load(audio_path, sr=16000)

    # Compute duration of the audio
    duration = librosa.get_duration(y=waveform, sr=sr)

    # Count words in the transcript
    num_words = len(transcript.split())

    # Speech Rate: Words per Second
    speech_rate = num_words / duration if duration > 0 else 0

    return speech_rate

# test
sample_speech_rate = compute_speech_rate(sample_audio, whisper_text)
print(f"Speech Rate: {sample_speech_rate:.2f} words/sec")

Speech Rate: 2.55 words/sec


Native speakers typically have a speech rate of 2.5 - 4.0 words/sec.
Lower rates (~1.0 words/sec) might indicate lower fluency (A1-A2).

### Detecting Pauses in Speech

In [10]:
def count_pauses(audio_path, threshold_db=20):
    """Detect number of pauses based on silence in the audio."""
    waveform, sr = librosa.load(audio_path, sr=16000)
    
    # Identify silent segments
    intervals = librosa.effects.split(waveform, top_db=threshold_db)
    
    # Number of pauses = silent gaps between words
    num_pauses = len(intervals) - 1

    return num_pauses

num_pauses = count_pauses(sample_audio)
print(f"Number of Pauses: {num_pauses}")

Number of Pauses: 3


### Word Error Rate (Pronunciation Accuracy)

In [11]:
def compute_wer(ground_truth, predicted_text):
    """Compute Word Error Rate (WER) between ASR and ground truth text."""
    return wer(ground_truth.lower(), predicted_text.lower())

wer_score = compute_wer(ground_truth_text, whisper_text)
print(f"Word Error Rate: {wer_score:.2f}")

Word Error Rate: 0.38


- WER < 0.2 → Clear pronunciation, native-like fluency.
- WER > 0.4 → Many errors, possible pronunciation difficulties.

### Measuring Lexical Richness (Vocabulary Diversity)

This is not relevant on one audio only (too short)

In [12]:
from collections import Counter

def lexical_richness(transcript):
    """Compute lexical diversity using Type-Token Ratio (TTR)."""
    words = transcript.split()
    unique_words = set(words)

    ttr = len(unique_words) / len(words) if len(words) > 0 else 0
    return ttr

ttr_score = lexical_richness(whisper_text)
print(f"Lexical Diversity Score (TTR): {ttr_score:.2f}")

Lexical Diversity Score (TTR): 1.00


### Syllables per Second (Prosody Feature)

In [13]:
!pip install textstat



In [14]:
import textstat

def syllables_per_second(transcript, duration):
    """Estimate syllables per second as a fluency indicator."""
    num_syllables = textstat.syllable_count(transcript)
    return num_syllables / duration if duration > 0 else 0

duration = librosa.get_duration(filename=sample_audio)
sps = syllables_per_second(whisper_text, duration)
print(f"Syllables per Second: {sps:.2f}")

Syllables per Second: 4.78


- SPS > 3.0 → Native-level fluency (C1-C2).
- SPS < 2.0 → Slower speech, possible hesitation (A1-B1).

### Assign CEFR/TOEFL Levels

In [15]:
def classify_fluency(speech_rate, num_pauses, wer_score, ttr_score, sps):
    """Classify fluency into CEFR levels based on extracted features."""
    
    if speech_rate > 2.5 and num_pauses < 5 and wer_score < 0.15 and ttr_score > 0.6 and sps > 3.0:
        return "C2"
    elif speech_rate > 2.0 and num_pauses < 7 and wer_score < 0.20 and ttr_score > 0.5 and sps > 2.7:
        return "C1"
    elif speech_rate > 1.5 and num_pauses < 10 and wer_score < 0.30 and ttr_score > 0.45 and sps > 2.4:
        return "B2"
    elif speech_rate > 1.2 and num_pauses < 15 and wer_score < 0.40 and ttr_score > 0.4 and sps > 2.2:
        return "B1"
    elif speech_rate > 1.0 and num_pauses < 20 and wer_score < 0.50 and ttr_score > 0.35 and sps > 2.0:
        return "A2"
    else:
        return "A1"

# test
fluency_level = classify_fluency(sample_speech_rate, num_pauses, wer_score, ttr_score, sps)
print(f"Fluency Level: {fluency_level}")

Fluency Level: B1


### Applying to the whole data set

In [16]:
def evaluate_all_speakers(dataset_files):
    """Evaluate fluency for all speakers in the dataset."""
    results = []

    for audio_path, transcript_path in dataset_files:
        ground_truth_text = open(transcript_path).read().strip()
        whisper_text = transcribe_audio(audio_path)

        # Extract features
        speech_rate = compute_speech_rate(audio_path, whisper_text)
        num_pauses = count_pauses(audio_path)
        wer_score = compute_wer(ground_truth_text, whisper_text)
        ttr_score = lexical_richness(whisper_text)
        sps = syllables_per_second(whisper_text, librosa.get_duration(filename=audio_path))

        # Predict fluency level
        fluency_level = classify_fluency(speech_rate, num_pauses, wer_score, ttr_score, sps)

        results.append({
            "audio": audio_path,
            "transcription": whisper_text,
            "speech_rate": speech_rate,
            "num_pauses": num_pauses,
            "wer_score": wer_score,
            "ttr_score": ttr_score,
            "sps": sps,
            "fluency_level": fluency_level
        })

    return results

# Run evaluation on first 10 samples to test the function first
fluency_results = evaluate_all_speakers(dataset_files[:10])

for res in fluency_results:
    print(res)

{'audio': 'data/l2arctic_release_v5.0/ZHAA/ZHAA/wav/arctic_a0001.wav', 'transcription': " author of the Danger Trail, Philip Steele's, etc.", 'speech_rate': 2.5477707006369426, 'num_pauses': 3, 'wer_score': 0.375, 'ttr_score': 1.0, 'sps': 4.777070063694267, 'fluency_level': 'B1'}
{'audio': 'data/l2arctic_release_v5.0/ZHAA/ZHAA/wav/arctic_a0002.wav', 'transcription': ' Now, at this particular case, Tom apologized white more.', 'speech_rate': 2.307692307692308, 'num_pauses': 1, 'wer_score': 0.5, 'ttr_score': 1.0, 'sps': 3.8461538461538463, 'fluency_level': 'A1'}
{'audio': 'data/l2arctic_release_v5.0/ZHAA/ZHAA/wav/arctic_a0003.wav', 'transcription': ' for the 20th time that evening the two men shook hands.', 'speech_rate': 3.081232492997199, 'num_pauses': 2, 'wer_score': 0.18181818181818182, 'ttr_score': 0.9090909090909091, 'sps': 3.361344537815126, 'fluency_level': 'C1'}
{'audio': 'data/l2arctic_release_v5.0/ZHAA/ZHAA/wav/arctic_a0004.wav', 'transcription': " Lord, but I'm glad to see yo

In [31]:
fluency_results_df = evaluate_all_speakers(dataset_files)

Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.
Whisper did not predict an ending timestamp, which can happen if audio is cut off in the middle of a word. Also make sure WhisperTimeStampLogitsProcessor was used during generation.


KeyboardInterrupt: 