# Data Analysis
Prepared by: Hieng-

In [1]:
import os  # imported here because this cell runs before the notebook's imports cell

# Root directory of the km_kh_female dataset.
# The default below is an absolute path that only exists on the original author's
# machine; set the KM_KH_FEMALE_DATA_PATH environment variable to point the notebook
# at another copy of the dataset without editing this cell.
data_path = os.environ.get(
    "KM_KH_FEMALE_DATA_PATH",
    "/Users/maohieng/cadt/master/cadt_tts/phanet/tacotron2_khmer/km_kh_female",
)

In [2]:
import os
import wave

import pandas as pd

In [11]:
def metadata_processing(data_path) -> tuple[pd.DataFrame, list]:
    """Build a per-utterance metadata table from ``line_index.tsv`` and the WAV files.

    Each non-blank line of ``<data_path>/line_index.tsv`` must hold an utterance
    ID and its transcript separated by one or more tab characters. The ID is
    also the WAV file stem (``<data_path>/wavs/<id>.wav``), and its second
    ``_``-separated field is used as the speaker.

    Args:
        data_path: Directory containing ``line_index.tsv`` and a ``wavs`` folder.

    Returns:
        A ``(metadata, not_found_files)`` tuple. ``metadata`` has the columns
        ``filename``, ``text``, ``duration`` (seconds, rounded to 3 decimals),
        ``sampling_rate`` and ``speaker``, with one row per indexed utterance
        whose WAV file exists. ``not_found_files`` lists the WAV file names that
        the index references but that are missing on disk.

    Raises:
        ValueError: If a non-blank index line has no transcript, or its ID has
            no ``_``-separated speaker field.
    """
    columns = ["filename", "text", "duration", "sampling_rate", "speaker"]
    index_path = os.path.join(data_path, "line_index.tsv")
    wav_dir = os.path.join(data_path, "wavs")

    not_found_files = []
    records = []

    # Stream the index instead of loading every line into memory first.
    with open(index_path, 'r', encoding='utf-8') as index_file:
        for line_no, raw_line in enumerate(index_file, start=1):
            line = raw_line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no utterance.
                continue

            # Split on the first tab, then strip the remainder: this accepts both
            # "id<TAB><TAB>text" and "id<TAB>text" layouts.
            utt_id, _, text = line.partition('\t')
            text = text.strip()
            id_fields = utt_id.split('_')
            if not text or len(id_fields) < 2:
                raise ValueError(
                    f"{index_path}:{line_no}: expected '<prefix>_<speaker>_...<TAB>text', "
                    f"got {raw_line!r}"
                )

            filename = utt_id + ".wav"
            speaker = id_fields[1]

            # Duration is derived from the WAV header: frame count / frame rate.
            try:
                with wave.open(os.path.join(wav_dir, filename), 'rb') as wav_file:
                    rate = wav_file.getframerate()
                    duration = wav_file.getnframes() / float(rate)
            except FileNotFoundError:
                # Record the missing file and keep going so one gap does not
                # abort the whole scan.
                not_found_files.append(filename)
                continue

            records.append({"filename": filename,
                            "text": text,
                            "duration": round(duration, 3),
                            "sampling_rate": rate,
                            "speaker": speaker})

    # Passing the column list keeps the schema stable even when no rows were found.
    metadata = pd.DataFrame(records, columns=columns)
    return metadata, not_found_files

In [12]:
# Rebuild the metadata table from the raw dataset on every run, then save it to
# "metadata.csv" as a tab-separated file. (An alternative that would load an
# existing "metadata.csv" instead of rebuilding is currently disabled.)
metadata, not_found_files = metadata_processing(data_path)
metadata.to_csv("metadata.csv", sep='\t', encoding='utf-8', index=False)

# Report any WAV files listed in the index that were missing on disk.
if len(not_found_files) > 0:
    print(f"Warning: The following files were not found: {not_found_files}")

In [13]:
# Preview the first rows. Ending the cell with the bare expression lets Jupyter
# render the DataFrame with its rich display instead of plain printed text.
metadata.head()

                  filename                                               text  \
0  khm_0308_0011865648.wav  ស្ពាន កំពង់ ចម្លង អ្នកលឿង នៅ ព្រៃវែង ជា ស្ពាន ...   
1  khm_0308_0032157149.wav  ភ្លើង កំពុង ឆាប ឆេះ ផ្ទះ ប្រជា ពលរដ្ឋ នៅ សង្កា...   
2  khm_0308_0038959268.wav  អ្នក សុំ ទាន ដេក ប្រកាច់ ម្នាក់ ឯង ក្បែរ ខ្លោង...   
3  khm_0308_0054635313.wav  ស្ករ ត្នោត ដែល មាន គុណភាព ល្អ ផលិត នៅ ខេត្ត កំ...   
4  khm_0308_0055735195.wav         ភ្នំបាខែង មាន កម្ពស់ តែ ចិត សិប ម៉ែត្រ សោះ   

   duration  sampling_rate speaker  
0     6.997          48000    0308  
1     4.864          48000    0308  
2     5.888          48000    0308  
3     4.352          48000    0308  
4     3.584          48000    0308  


In [14]:
from collections import Counter

# Tally whitespace-separated tokens across every transcript in the metadata table.
word_count = Counter(
    token
    for sentence in metadata['text']
    for token in sentence.split()
)

# Total tokens, number of distinct tokens, and the ten most frequent ones.
print(f"Word Count: {sum(word_count.values())}")
print(f"Unique Words: {len(word_count)}")
print(f"Most Common Words: {word_count.most_common(10)}")

Word Count: 26179
Unique Words: 4347
Most Common Words: [('ជា', 698), ('នៅ', 658), ('មាន', 497), ('បាន', 438), ('ការ', 255), ('និង', 247), ('ប្រទេស', 244), ('ក្នុង', 239), ('ទៅ', 228), ('មួយ', 212)]


In [7]:
# Character-level vocabulary: every distinct character that occurs in any
# transcript (including the space), sorted by code point.
vocab = sorted({char for sentence in metadata['text'] for char in sentence})
vocab_size = len(vocab)

print(f"Vocabulary Size: {vocab_size}")
print(f"Vocabulary: {vocab}")

Vocabulary Size: 71
Vocabulary: [' ', 'ក', 'ខ', 'គ', 'ឃ', 'ង', 'ច', 'ឆ', 'ជ', 'ឈ', 'ញ', 'ដ', 'ឋ', 'ឌ', 'ឍ', 'ណ', 'ត', 'ថ', 'ទ', 'ធ', 'ន', 'ប', 'ផ', 'ព', 'ភ', 'ម', 'យ', 'រ', 'ល', 'វ', 'ស', 'ហ', 'ឡ', 'អ', 'ឥ', 'ឧ', 'ឪ', 'ឫ', 'ឬ', 'ឭ', 'ឮ', 'ឯ', 'ឱ', 'ា', 'ិ', 'ី', 'ឹ', 'ឺ', 'ុ', 'ូ', 'ួ', 'ើ', 'ឿ', 'ៀ', 'េ', 'ែ', 'ៃ', 'ោ', 'ៅ', 'ំ', 'ះ', 'ៈ', '៉', '៊', '់', '៌', '៍', '៎', '៏', '័', '្']
