In [1]:
import glob

audio_files = glob.glob("tone_perfect/*.mp3")
print("n audio_files:", len(audio_files))

n audio_files: 9840


In [2]:
import pandas as pd

records = []
for audio_file in audio_files:
    label = audio_file.split("/")[-1].split("_")[0]
    tone = int(label[-1])
    pinyin = label[:-1]
    records.append({
        "audio_file": audio_file,
        "tone": tone,
        "pinyin": pinyin,
        "label": label,
    })

df = pd.DataFrame(records)
df.to_csv("labels.csv", sep=",", index=False)
df.head()

Unnamed: 0,audio_file,tone,pinyin,label
0,tone_perfect/sa3_MV1_MP3.mp3,3,sa,sa3
1,tone_perfect/cao2_FV1_MP3.mp3,2,cao,cao2
2,tone_perfect/hong1_MV2_MP3.mp3,1,hong,hong1
3,tone_perfect/zheng3_MV3_MP3.mp3,3,zheng,zheng3
4,tone_perfect/shao4_MV1_MP3.mp3,4,shao,shao4


In [3]:
import json
import numpy as np

tones = sorted(np.int32(df["tone"].unique()).tolist())
with open("tones.json", "w") as f:
    json.dump(tones, f, indent=2)
tones[:10], len(tones)

([1, 2, 3, 4], 4)

In [4]:
import json

pinyins = sorted(df["pinyin"].unique().tolist())
with open("pinyins.json", "w") as f:
    json.dump(pinyins, f, indent=2)
pinyins[:10], len(pinyins)

(['a', 'ai', 'an', 'ang', 'ao', 'ba', 'bai', 'ban', 'bang', 'bao'], 410)

In [5]:
import json

labels = sorted(df["label"].unique().tolist())
with open("labels.json", "w") as f:
    json.dump(labels, f, indent=2)
labels[:10], len(labels)

(['a1', 'a2', 'a3', 'a4', 'ai1', 'ai2', 'ai3', 'ai4', 'an1', 'an2'], 1640)

In [6]:
import itertools
from sklearn.model_selection import train_test_split
from tqdm import tqdm

train_data, test_data = train_test_split(audio_files, test_size=0.2, random_state=42)
print(len(train_data), len(test_data))
train_data = list(itertools.product(train_data, train_data))
test_data = list(itertools.product(test_data, test_data))
print(len(train_data), len(test_data))


7872 1968
61968384 3873024


In [7]:
records = []
for audio_fname1, audio_fname2 in tqdm(train_data):
    label1 = audio_fname1.split("/")[-1].split("_")[0]
    label2 = audio_fname2.split("/")[-1].split("_")[0]
    is_same = 1 if label1 == label2 else 0
    records.append({
        "audio_fname1": audio_fname1,
        "audio_fname2": audio_fname2,
        "is_same": is_same
    })

df = pd.DataFrame(records)
same_df = df[df["is_same"] == 1]
n_same = len(same_df)
diff_df = df[df["is_same"] == 0]
diff_df = diff_df.sample(n=n_same, random_state=42)
print(len(same_df), len(diff_df))

df = pd.concat([same_df, diff_df])
df.to_csv("annotation_train.csv", sep=",", index=False)
df.head()

100%|██████████| 61968384/61968384 [00:38<00:00, 1601231.53it/s]


In [None]:
records = []
for audio_fname1, audio_fname2 in tqdm(test_data):
    label1 = audio_fname1.split("/")[-1].split("_")[0]
    label2 = audio_fname2.split("/")[-1].split("_")[0]
    is_same = 1 if label1 == label2 else 0
    records.append({
        "audio_fname1": audio_fname1,
        "audio_fname2": audio_fname2,
        "is_same": is_same
    })

df = pd.DataFrame(records)
same_df = df[df["is_same"] == 1]
n_same = len(same_df)
diff_df = df[df["is_same"] == 0]
diff_df = diff_df.sample(n=n_same, random_state=42)
print(len(same_df), len(diff_df))

df = pd.concat([same_df, diff_df])
df.to_csv("annotation_test.csv", sep=",", index=False)
df.head()