# EDA on a Speech Dataset Used for TTS Applications

### Load Metadata

In [1]:
import pandas as pd
import os
from tqdm import tqdm

def load_libritts_metadata(data_dir='LibriTTS/train-clean-100'):
    """Collect one metadata row per transcript line found under ``data_dir``.

    The directory is walked as ``<data_dir>/<speaker>/<chapter>/`` and each
    chapter is expected to hold ``<speaker>_<chapter>.trans.txt`` whose
    non-blank lines read ``<file_id> <text>``. Chapters without that file and
    entries that are not directories are skipped.

    Args:
        data_dir: Root folder of the dataset split to scan.

    Returns:
        pandas.DataFrame with the columns ``speaker_id``, ``chapter_id``,
        ``file_id``, ``text`` and ``audio_path``. The columns are present even
        when no transcript line was found, so column access never fails on an
        empty scan. ``audio_path`` is built from the file id and is not
        checked for existence.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    columns = ["speaker_id", "chapter_id", "file_id", "text", "audio_path"]
    metadata = []

    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order reproducible between runs and machines.
    for speaker in tqdm(sorted(os.listdir(data_dir))):
        speaker_dir = os.path.join(data_dir, speaker)
        if not os.path.isdir(speaker_dir):
            continue

        for chapter in sorted(os.listdir(speaker_dir)):
            chapter_dir = os.path.join(speaker_dir, chapter)
            if not os.path.isdir(chapter_dir):
                continue

            transcript_file = os.path.join(chapter_dir, f'{speaker}_{chapter}.trans.txt')
            if not os.path.exists(transcript_file):
                continue

            # Explicit encoding: the platform default may not be UTF-8.
            with open(transcript_file, 'r', encoding='utf-8') as f:
                for line in f:
                    line = line.strip()

                    if not line:
                        continue

                    # partition() never raises: a line that carries only an id
                    # yields an empty transcript instead of a ValueError.
                    file_id, _, text = line.partition(' ')
                    audio_path = os.path.join(chapter_dir, f'{file_id}.wav')
                    metadata.append({
                        "speaker_id": speaker,
                        "chapter_id": chapter,
                        "file_id": file_id,
                        "text": text,
                        "audio_path": audio_path
                    })

    return pd.DataFrame(metadata, columns=columns)

# Scan the default train-clean-100 split and preview the first rows.
df = load_libritts_metadata(data_dir='LibriTTS/train-clean-100')
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'LibriTTS/train-clean-100'

## Speaker Distribution Analysis

**Number of unique speakers**

In [None]:
# Count the distinct speaker IDs present in the metadata table.
n_speakers = df['speaker_id'].nunique()
print("Total speaker: ", n_speakers)

Gender Distribution (if metadata available)

In [None]:
# If gender info is in speaker IDs (e.g., 'M' or 'F' prefix).
# IDs without such a prefix (NOTE(review): LibriTTS speaker folders appear to
# be plain numbers - confirm) are labelled 'Unknown' instead of NaN; otherwise
# value_counts() comes back empty and the bar chart cannot be drawn.
df['gender'] = (
    df['speaker_id']
    .str[0]
    .map({'M': 'Male', 'F': 'Female'})
    .fillna('Unknown')
)
df['gender'].value_counts().plot(kind='bar')

Recordings per Speaker

In [None]:
# Number of recordings contributed by each speaker.
speaker_counts = df['speaker_id'].value_counts()
# Only the last expression of a cell is echoed, so the summary has to be
# printed explicitly or it is silently discarded.
print(speaker_counts.describe())  # mean, min, max
ax = speaker_counts.hist(bins=50)  # Check imbalance
ax.set(
    xlabel='Recordings per speaker',
    ylabel='Number of speakers',
    title='Recordings per speaker',
);

**Insights**:
- Is the dataset dominated by a few speakers?
- Are genders balanced?
- Are some speakers underrepresented?

## Text Analysis

Sentence Length (Word & Character Count)

In [None]:
# Transcript length per utterance: whitespace-separated words and raw characters.
transcripts = df['text']
df['word_count'] = transcripts.map(lambda sentence: len(sentence.split()))
df['char_count'] = transcripts.map(len)
length_columns = ['word_count', 'char_count']
df[length_columns].describe()

Vocabulary & Phoneme Coverage

In [None]:
from collections import Counter
import nltk
nltk.download('punkt')

# Vocabulary: lower-cased, whitespace-delimited tokens gathered from every
# transcript (punctuation stays attached to its word).
all_words = [
    token
    for sentence in df['text']
    for token in sentence.lower().split()
]
word_counts = Counter(all_words)
top_words = word_counts.most_common(20)
print("Top 20 words:", top_words)

# Phoneme analysis (requires G2P model)
!pip install g2p-en
from g2p_en import G2p
g2p = G2p()

def text_to_phonemes(text):
    """Return the G2P output for ``text`` as one space-separated string.

    Relies on the module-level ``g2p`` callable, which is expected to return a
    sequence of string symbols for the given text.
    """
    phoneme_tokens = g2p(text)
    return ' '.join(phoneme_tokens)

# Transcribe every utterance, then tally the phoneme tokens over the whole table.
df['phonemes'] = df['text'].map(text_to_phonemes)
all_phonemes = [
    symbol
    for sequence in df['phonemes']
    for symbol in sequence.split()
]
phoneme_counts = Counter(all_phonemes)
top_phonemes = phoneme_counts.most_common(20)
print("Top 20 phonemes:", top_phonemes)

**Insights**:
- Are there very short/long sentences?
- Are rare words or phonemes missing?
- Does the dataset cover diverse linguistic patterns?

## Audio Analysis

Audio Duration Distribution

In [None]:
import librosa

def get_duration(audio_path):
    """Return the duration of ``audio_path`` as reported by librosa, or None.

    Best-effort helper for scanning many files: any ordinary failure (missing
    or unreadable file, decode error) yields ``None`` so the caller can find
    bad rows afterwards. KeyboardInterrupt and SystemExit are NOT swallowed,
    so a long-running scan can still be interrupted.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        The value returned by ``librosa.get_duration``, or ``None`` on failure.
    """
    try:
        try:
            # Current librosa releases name this keyword ``path``.
            return librosa.get_duration(path=audio_path)
        except TypeError:
            # Older releases only understand the deprecated ``filename``
            # keyword and reject ``path`` with a TypeError.
            return librosa.get_duration(filename=audio_path)
    except Exception:
        return None

# One duration per clip; rows whose file could not be read hold None.
df['duration'] = df['audio_path'].apply(get_duration)
# Only the last expression of a cell is echoed, so the summary has to be
# printed explicitly or it is silently discarded.
print(df['duration'].describe())  # Check min, max, mean
ax = df['duration'].hist(bins=100)  # Visualize
ax.set(
    xlabel='Duration',
    ylabel='Number of clips',
    title='Audio duration distribution',
);

Sample Rate & Channel Check

In [None]:
def get_audio_info(audio_path):
    """Return ``(sample_rate, n_samples)`` for ``audio_path``, or ``(None, None)``.

    The file is decoded with ``librosa.load(..., sr=None)``; the second element
    is the length of the first axis of the returned array. Ordinary failures
    (missing, unreadable or corrupt file) yield ``(None, None)``.
    KeyboardInterrupt and SystemExit are NOT swallowed, so a scan over many
    files can still be interrupted.

    Args:
        audio_path: Path of the audio file to inspect.

    Returns:
        Tuple ``(sr, y.shape[0])`` on success, ``(None, None)`` on failure.
    """
    try:
        y, sr = librosa.load(audio_path, sr=None)
        return sr, y.shape[0]
    except Exception:
        return None, None

# Keep only the first element (the sampling rate) of each get_audio_info result.
sample_rates = df['audio_path'].map(lambda path: get_audio_info(path)[0])
df['sample_rate'] = sample_rates
# Should be a single value across the corpus (LibriTTS is published at 24000 Hz).
df['sample_rate'].value_counts()

**Insights**:
- Are there extremely short/long clips?
- Is the sample rate consistent?
- Are there silent or corrupted files?

## Speaker-Text-Audio Correlation

Do some speakers have longer/shorter sentences?

In [None]:
# Average transcript length (in words) per speaker, shortest first.
mean_words_per_speaker = df.groupby('speaker_id')['word_count'].mean()
mean_words_per_speaker.sort_values()

Do certain phonemes appear more with certain speakers?

In [None]:
# Example: Check if nasal sounds (/m/, /n/, /ng/) vary by speaker.
# df['phonemes'] holds space-separated G2P symbols. g2p_en emits upper-case
# ARPAbet, so counting the lower-case letters 'm' and 'n' always gives 0;
# whole tokens are matched against the nasal symbols instead.
NASAL_PHONEMES = {'M', 'N', 'NG'}
df['nasal_count'] = df['phonemes'].apply(
    lambda x: sum(token in NASAL_PHONEMES for token in x.split())
)
df.groupby('speaker_id')['nasal_count'].mean().sort_values()

**Insights**:
- Are some speakers overrepresented in certain linguistic patterns?
- Are there dialectal variations?

## Automated EDA Tools
`ydata-profiling` (formerly `pandas-profiling`): Quick overview of distributions.

In [None]:
from ydata_profiling import ProfileReport
# Build a profiling report over the metadata table, including every column
# that earlier cells added to df.
profile = ProfileReport(df, title="Profiling Report")
# Show the report inside the notebook (presumably as interactive widgets - confirm).
profile.to_widgets()
# Alternative kept for reference: write the report to an HTML file instead.
# profile.to_file("libritts_eda.html")