# Count the number of speakers and the number of utterances per speaker (parsed from the file names)

In [None]:
import os
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter

# Configuration: directory whose file names are scanned for speaker IDs.
# Written as a raw string so the backslashes in the Windows path stay literal.
folder_path = r'E:\speech_data\train_raw'

def analyze_speech_data(path, csv_path='speaker_stats.csv',
                        plot_path='distribution_plot.png'):
    """Report how many utterance files each speaker has in a directory.

    The speaker ID of a file is the part of its name before the first
    underscore. Regular files without an underscore in their name, and
    anything that is not a regular file (e.g. subdirectories), are ignored.

    Side effects: writes a per-speaker CSV, prints a short summary, saves a
    histogram of utterances-per-speaker as an image, and shows the figure.

    Args:
        path: Directory to scan (not recursive).
        csv_path: Where to write the per-speaker CSV report.
        plot_path: Where to save the histogram image.

    Raises:
        OSError: If ``path`` cannot be listed (e.g. it does not exist).
    """
    # os.scandir returns the entry type together with the name, which usually
    # avoids a separate stat call per file; that matters on large corpora.
    with os.scandir(path) as entries:
        stats = Counter(
            entry.name.split('_', 1)[0]
            for entry in entries
            if '_' in entry.name and entry.is_file()
        )

    # Most utterances first; ties are broken by speaker ID so that the CSV
    # does not depend on the order in which the OS lists the directory.
    rows = sorted(stats.items(), key=lambda item: (-item[1], item[0]))
    df = pd.DataFrame(rows, columns=['Speaker_ID', 'Utterance_Count'])
    # Keep the count column numeric even when no file matched.
    df['Utterance_Count'] = df['Utterance_Count'].astype('int64')

    # Save to CSV
    df.to_csv(csv_path, index=False)
    print(f"CSV report saved: {csv_path}")

    # Print summary
    print(f"Total Speakers: {len(df)}")
    print(f"Total Utterances: {df['Utterance_Count'].sum()}")

    # Plot the distribution of utterance counts across speakers
    plt.figure(figsize=(10, 6))
    plt.hist(df['Utterance_Count'], bins=30, color='skyblue', edgecolor='black')
    plt.title('Utterances per Speaker Distribution')
    plt.xlabel('Number of Utterances')
    plt.ylabel('Number of Speakers')
    plt.grid(axis='y', alpha=0.3)

    # Save before showing: some backends discard the figure once it is shown.
    plt.savefig(plot_path)
    print(f"Plot saved: {plot_path}")
    plt.show()

# Run the analysis on the configured folder; the CSV report and the plot are
# written relative to the current working directory.
analyze_speech_data(folder_path)