In [23]:
import pandas as pd
import soundfile as sf
import os
import re
from pypinyin import lazy_pinyin

In [2]:
# Root directory for the notebook: the current working directory at the time
# this cell runs. Input data is located and the output CSV is written relative to it.
BASE_PATH = os.getcwd()

In [3]:
# Directory holding the corpus; the wav files are read from beneath it.
data_path = os.path.join(BASE_PATH, 'magic_data')

In [20]:
def pinyin_cover(char):
    """Collapse selected pinyin sound pairs into a single merged token.

    The following rewrites are applied to the pinyin string ``char``:

    * ``z`` / ``zh``  -> ``z-zh``
    * ``c`` / ``ch``  -> ``c-ch``
    * ``s`` / ``sh``  -> ``s-sh``
    * ``l``           -> ``n``
    * ``in`` / ``ing`` -> ``in-ing``

    Presumably this merges sounds that are easily confused by speakers or by
    the acoustic model -- TODO confirm the intent with the author.

    Args:
        char: A pinyin syllable (any string is accepted).

    Returns:
        The rewritten string; input without any of the patterns is returned
        unchanged.
    """
    # Order matters: each pair is first normalised to its short form
    # ("zh" -> "z") and only then expanded to the merged token ("z" -> "z-zh"),
    # so both spellings end up identical.
    rewrite_steps = (
        ("zh", "z"), ("z", "z-zh"),
        ("ch", "c"), ("c", "c-ch"),
        ("sh", "s"), ("s", "s-sh"),
        ("l", "n"),
        ("ing", "in"), ("in", "in-ing"),
    )
    for old, new in rewrite_steps:
        char = char.replace(old, new)
    return char

In [4]:
# Walk wav/<split>/<speaker>/ for each split and gather every file name found
# on disk, so the on-disk count can be compared with the metadata files.
real_wav_list = []
for split in ["train", "test", "dev"]:
    split_dir = os.path.join(data_path, 'wav', split)
    for speaker in os.listdir(split_dir):
        real_wav_list.extend(os.listdir(os.path.join(split_dir, speaker)))

In [5]:
# Report how many file names were collected from the wav/ directory tree.
print('total real num of wavs is {}'.format(len(real_wav_list)))

total real num of wavs is 3552


In [24]:
# Gather the transcript lines of all three splits. The first line of every
# *_TRANS.txt file is dropped ([1:]) -- presumably a column header; verify
# against the corpus files.
TRANS_list = []
for data_set in ["train", "test", "dev"]:
    # Build the path from data_path (as the wav-reading cell does) rather than
    # relying on a relative path.
    trans_path = os.path.join(data_path, 'transcript', '{}_TRANS.txt'.format(data_set))
    # The transcripts contain Chinese text, so decode explicitly instead of
    # depending on the platform default encoding (assumes the files are UTF-8
    # -- TODO confirm).
    with open(trans_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()[1:]
        TRANS_list.extend(lines)

# Map wav file name -> transcript with every run of characters that is neither
# a word character nor in the U+4E00..U+9FFF range removed.
wav_txt_dic = {}
for i in TRANS_list:
    # Each line is expected to be: <wav file>\t<speaker>\t<text>
    wav_file, person, txt = i.strip('\n').split('\t')
    # Raw string: '\w' inside a normal string literal is an invalid escape
    # sequence; the re module itself understands \w and \uXXXX.
    wav_txt_dic[wav_file] = re.sub(r'[^\w\u4e00-\u9fff]+', '', txt)

In [7]:
# Number of transcript lines collected across train/test/dev.
print('total files in TRANS {}'.format(len(TRANS_list)))

total files in TRANS 621345


In [25]:
# Gather the lines of the .scp metadata files of all three splits.
scp_list = []
for data_set in ["train", "test", "dev"]:
    # Build the path from data_path, consistent with how the wav files are located.
    scp_path = os.path.join(data_path, 'metadata', '{}.scp'.format(data_set))
    # Explicit encoding so the result does not depend on the platform default
    # (assumes the files are UTF-8/ASCII -- TODO confirm).
    with open(scp_path, 'r', encoding='utf-8') as f:
        lines = f.readlines()
        scp_list.extend(lines)

# Parallel lists, one entry per .scp line: wav file name, the path string stored
# in the output table, and the clip duration in seconds.
wav_file_list = []
path_list = []
durations_list = []
for wav in scp_list:
    # Each line is expected to be: <wav file>\t<path relative to data_path>
    wav_file, path = wav.strip('\n').split('\t')
    wav_file_list.append(wav_file)
    path_list.append('MAGIC_DATA/magic_data/{}'.format(path))
    # Only the frame count and sample rate are needed, so read the file header
    # with sf.info() instead of decoding all samples with sf.read().
    info = sf.info(os.path.join(data_path, path))
    durations_list.append(round(float(info.frames)/float(info.samplerate),3))

In [13]:
# Number of entries read from the .scp metadata files across train/test/dev.
print('total files in scp {}'.format(len(scp_list)))

total files in scp 609552


In [26]:
# Start from an empty frame; its columns are attached in the following cells.
data = pd.DataFrame()

In [27]:
# Attach the three per-utterance lists built from the .scp files as columns
# (wav file name, stored path, duration in seconds), in that order.
data = data.assign(**{
    'wav': wav_file_list,
    'wav_path': path_list,
    'durations': durations_list,
})

In [28]:
# Look up the cleaned transcript of every wav; a wav missing from the
# transcript dictionary raises KeyError.
data['txt'] = data['wav'].apply(wav_txt_dic.__getitem__)
# Convert each transcript to space-separated pinyin, passing every syllable
# through pinyin_cover.
data['pinyin'] = data['txt'].apply(
    lambda sentence: ' '.join(pinyin_cover(syllable) for syllable in lazy_pinyin(sentence))
)

In [29]:
# Display the assembled DataFrame for a visual sanity check.
data

Unnamed: 0,wav,wav_path,txt,pinyin
0,14_3466_20170826171159.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,请语言播放小说,qin-ing yu yan bo fang xiao s-shuo
1,14_3466_20170826171236.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,这里,z-zhe ni
2,14_3466_20170826171323.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,全民唱吧,quan min-ing c-chang ba
3,14_3466_20170826171404.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,请搜索我附近的超市,qin-ing s-shou s-shuo wo fu jin-ing de c-chao ...
4,14_3466_20170826171424.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,帮我搜三生三世十里桃花电影预告片,bang wo s-shou s-shan s-sheng s-shan s-shi s-s...
5,14_3466_20170826171437.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,准备完毕,z-zhun bei wan bi
6,14_3466_20170826171511.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,这个衣服我不知道我合适不合适可以帮我投影吗,z-zhe ge yi fu wo bu z-zhi dao wo he s-shi bu ...
7,14_3466_20170826171524.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,在地图上标出我的位置,z-zhai di tu s-shang biao c-chu wo de wei z-zhi
8,14_3466_20170826171536.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,您是否超速了,nin-ing s-shi fou c-chao s-shu ne
9,14_3466_20170826171549.wav,MAGIC_DATA/magic_data/wav/train/14_3466/14_346...,推荐一个电台,tui jian yi ge dian tai


In [31]:
# Total length of the corpus in hours: the per-file durations are in seconds,
# hence the division by 3600. Series.sum() is used instead of the built-in
# sum(), which would iterate over the Series element by element in Python.
print('total duration is {}'.format(round(float(data.durations.sum())/3600.0, 3)))

In [61]:
# Write the final table next to the notebook, without the row index. The path
# is built with os.path.join, like data_path, rather than a hard-coded '/'.
data.to_csv(os.path.join(BASE_PATH, 'magic_data.csv'), index=False)