In [None]:
!pip install numpy scipy librosa unidecode inflect librosa
!apt-get update
!apt-get install -y libsndfile1

In [None]:
from scipy.io.wavfile import write
import torch
from IPython.display import Audio
import nltk
from nltk.corpus import brown
import os
from tqdm import tqdm
import pandas as pd
import shutil
import json
import gc
import re
from IPython.display import FileLink

In [None]:
# Use the CUDA device when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

In [None]:
# Load the pretrained Tacotron2 model (fp16) from NVIDIA's torch.hub entry point,
# move it to the GPU and switch it to evaluation mode in one chain.
tacotron2 = (
    torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tacotron2', model_math='fp16')
    .to('cuda')
    .eval()
)
# Helper module from the same hub repo; the synthesis functions use its prepare_input_sequence().
utils = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_tts_utils')

In [None]:
# Set the Tacotron2 decoder's step limit to 3000.
# NOTE(review): presumably raised so long sentences are not cut off during synthesis — confirm.
tacotron2.decoder.max_decoder_steps = 3000

In [None]:
# Load the pretrained WaveGlow model (fp16) from NVIDIA's torch.hub entry point.
waveglow = torch.hub.load('NVIDIA/DeepLearningExamples:torchhub', 'nvidia_waveglow', model_math='fp16')
# Strip weight normalisation, then move the result to the GPU.
waveglow = waveglow.remove_weightnorm(waveglow).to('cuda')
waveglow.eval()
print('done')

In [None]:
def tts(text, rate=16000, write_to_file=False, path='audio.wav', return_auido=True):
    """Synthesize speech for ``text`` with the global Tacotron2 and WaveGlow models.

    Args:
        text: Sentence to synthesize.
        rate: Sample rate passed to the WAV writer and to the Audio player.
            NOTE(review): every caller in this notebook passes 22050; the 16000
            default may not match the models' output rate — confirm.
        write_to_file: When true, the samples are written to ``path`` as a WAV file.
        path: Destination used when ``write_to_file`` is true.
        return_auido: When true, an ``IPython.display.Audio`` player is returned.

    Returns:
        An ``Audio`` object when ``return_auido`` is true, otherwise ``None``.
    """
    token_ids, token_lengths = utils.prepare_input_sequence([text])
    with torch.no_grad():
        spectrogram, _, _ = tacotron2.infer(token_ids, token_lengths)
        waveform = waveglow.infer(spectrogram)
    # First (only) item of the batch, moved to host memory as a numpy array.
    samples = waveform[0].data.cpu().numpy()

    if write_to_file:
        write(path, rate, samples)
    return Audio(samples, rate=rate) if return_auido else None

In [None]:
def tts2(text, rate=16000, write_to_file=False, path='audio.wav', return_auido=True, threshold=20):
    """Synthesize ``text`` and reject clips longer than ``threshold`` whole seconds.

    Args:
        text: Sentence to synthesize.
        rate: Sample rate used for the duration check, the WAV writer and the player.
        write_to_file: When true, an accepted clip is written to ``path``.
        path: Destination WAV path; also named in the rejection message.
        return_auido: When true (and nothing was written), return an Audio player.
        threshold: Maximum accepted duration, compared against ``len(samples) // rate``.

    Returns:
        ``False`` when the clip is rejected as too long; ``True`` when it was written
        to disk; otherwise an ``Audio`` object if ``return_auido`` is true, else ``None``.
    """
    token_ids, token_lengths = utils.prepare_input_sequence([text])
    with torch.no_grad():
        spectrogram, _, _ = tacotron2.infer(token_ids, token_lengths)
        waveform = waveglow.infer(spectrogram)
    audio_numpy = waveform[0].data.cpu().numpy()

    too_long = len(audio_numpy) // rate > threshold
    if too_long:
        print(f'not accepted because length is {len(audio_numpy) // rate} for path {path}')
        return False

    if not write_to_file:
        # Nothing is written: hand back a player only when asked to.
        return Audio(audio_numpy, rate=rate) if return_auido else None

    write(path, rate, audio_numpy)
    return True

In [None]:
# Make sure the dataset output directory exists (no error if it is already there).
os.makedirs('dataset', exist_ok=True)

In [None]:
# Fetch the Brown corpus through NLTK's downloader.
nltk.download('brown')

# All sentences of the corpus; each sentence is a sequence of word tokens
# (later cells join them with spaces to build the text to synthesize).
sentences = brown.sents()
len(sentences)

In [None]:
# Longest sentence (in tokens) that is still accepted for synthesis.
MAX_LENGTH_ACCEPTABLE = 47

c = 0                # number of sentences above the limit
max_length = 0       # token count of the longest accepted sentence seen so far
max_sentence = None  # that sentence itself
for item in sentences:
    n_tokens = len(item)
    if n_tokens > MAX_LENGTH_ACCEPTABLE:
        c += 1
        continue
    # Strict comparison keeps the first sentence that reaches the maximum.
    if n_tokens > max_length:
        max_length, max_sentence = n_tokens, item

print(len(max_sentence))
print(len(sentences) - c)

In [None]:
# Show the longest accepted sentence and synthesize it as an audible check.
longest_text = ' '.join(max_sentence)
print(longest_text)
tts(longest_text, rate=22050)

In [None]:
# Split the corpus in one pass: sentences within the token limit versus the rest.
accepted_sentences, unaccepted_sentences = [], []
for item in sentences:
    bucket = accepted_sentences if len(item) <= MAX_LENGTH_ACCEPTABLE else unaccepted_sentences
    bucket.append(item)
print(len(accepted_sentences))

In [None]:
# Order the rejected sentences in place from shortest to longest (by token count),
# then show the token count of the shortest one.
unaccepted_sentences.sort(key=len)
len(unaccepted_sentences[0])

In [None]:
s = ' '.join(unaccepted_sentences[25])
print(len(unaccepted_sentences[25]))
print(s)
# tts() returns a single IPython Audio object (or None), so `a, b = tts(...)` raised a
# TypeError and left `b` undefined. Run the same two-stage inference here so that
# `a` is the player and `b` holds the raw samples used for the duration check below.
sequences, lengths = utils.prepare_input_sequence([s])
with torch.no_grad():
    mel, _, _ = tacotron2.infer(sequences, lengths)
    b = waveglow.infer(mel)[0].data.cpu().numpy()
a = Audio(b, rate=22050)

In [None]:
# Whole-second length of `b` at 22050 samples per second.
# NOTE(review): assumes `b` holds the raw audio samples of the clip synthesized above.
len(b) // 22050

In [None]:
# Chunk size: a tenth of the accepted sentences plus 3.
size = len(accepted_sentences) // 10 + 3

# Cut the accepted sentences into consecutive chunks of `size` (the last may be shorter).
sublists = []
for start in range(0, len(accepted_sentences), size):
    sublists.append(accepted_sentences[start:start + size])
print([len(chunk) for chunk in sublists])

In [None]:
BASIC_PATH = 'dataset'

# meta_data: flat list of every record; localized_meta_data: the same records per part.
meta_data = []
localized_meta_data = {}
c = 0  # running 1-based sentence id across all parts
for part_id, sublist in enumerate(sublists, start=1):
    BASIC_PATH_PART = f'{BASIC_PATH}/part{part_id}/audios'
    local_data = []
    for item in sublist:
        c += 1
        # Join the tokens, then drop the whitespace in front of ? . ! and ".
        sentence = re.sub(r'\s+([?.!"])', r'\1', ' '.join(item))
        path = f'{BASIC_PATH_PART}/audio_{c}.wav'
        record = {'id': c, 'file_path': path, 'text': sentence}
        local_data.append(record)
        # Separate dict object so editing one view does not silently edit the other.
        meta_data.append(dict(record))
    localized_meta_data[f'part{part_id}'] = local_data

In [None]:
BASIC_PATH = 'dataset'
META_DATA_LOCALIZED_PATH = f'{BASIC_PATH}/localized_metadata.json'
# Reload the per-part metadata only when a previous run saved it. On a fresh run the
# file does not exist yet (it is written further down), and reading it unconditionally
# raised FileNotFoundError; in that case the in-memory localized_meta_data is kept.
if os.path.exists(META_DATA_LOCALIZED_PATH):
    with open(META_DATA_LOCALIZED_PATH, 'r') as f:
        localized_meta_data = json.load(f)

In [None]:
META_DATA_GLOBAL_PATH = f'{BASIC_PATH}/global_metadata.json'
# Reload the flat metadata list when a previous run saved it. On a fresh run the file
# does not exist yet, so fall back to a copy of the records built in memory instead of
# failing with FileNotFoundError.
if os.path.exists(META_DATA_GLOBAL_PATH):
    with open(META_DATA_GLOBAL_PATH, 'r') as f:
        global_meta_data = json.load(f)
else:
    global_meta_data = [dict(entry) for entry in meta_data]

In [None]:
# Persist both metadata views as JSON inside the dataset directory:
# the flat list first, then the per-part mapping.
META_DATA_LOCALIZED_PATH = f'{BASIC_PATH}/localized_metadata.json'
for json_path, payload in (
    (f'{BASIC_PATH}/global_metadata.json', meta_data),
    (META_DATA_LOCALIZED_PATH, localized_meta_data),
):
    with open(json_path, 'w') as f:
        json.dump(payload, f)

In [None]:
# Clear previously generated output but keep the metadata JSON files at the top level of
# the dataset directory: create_audios() reads localized_metadata.json from there, and
# removing the whole tree deleted it right after it had been written.
os.makedirs(BASIC_PATH, exist_ok=True)
for entry in os.listdir(BASIC_PATH):
    entry_path = os.path.join(BASIC_PATH, entry)
    if os.path.isdir(entry_path):
        shutil.rmtree(entry_path)

In [None]:
# Show what currently sits in the dataset directory.
os.listdir(BASIC_PATH)

In [None]:
def create_audios(part_id, meta_data_localized_path, start_index=None):
    """Synthesize and save the WAV files for one part of the dataset.

    Args:
        part_id: Number of the part; its records are read from key ``part{part_id}``.
        meta_data_localized_path: Path of the JSON file holding the per-part metadata.
        start_index: Metadata ``id`` to start from, used to resume an interrupted run.
            ``None`` starts at the part's first record. An id beyond the last record
            results in no work being done.

    Raises:
        ValueError: If ``start_index`` is smaller than the first id of the part. The
            resulting negative offset previously indexed from the end of the list and
            silently regenerated the wrong records.
    """
    with open(meta_data_localized_path, 'r') as f:
        part_entries = json.load(f)[f'part{part_id}']

    # makedirs creates the intermediate part directory as well.
    os.makedirs(f'{BASIC_PATH}/part{part_id}/audios', exist_ok=True)

    if not part_entries:
        # Nothing to synthesize for an empty part.
        return

    first_index = part_entries[0]['id']
    if start_index is None:
        start_index = first_index
    # Ids are treated as consecutive within a part, so the id difference is the list offset.
    offset = start_index - first_index
    if offset < 0:
        raise ValueError(
            f'start_index {start_index} is before the first id {first_index} of part{part_id}'
        )

    for i in tqdm(range(offset, len(part_entries))):
        entry = part_entries[i]
        tts(entry['text'], rate=22050, write_to_file=True, path=entry['file_path'], return_auido=False)

        # Every 20th position, release Python garbage and cached CUDA memory.
        if i % 20 == 0:
            gc.collect()
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()

In [None]:
# Generate the audio for part 1, starting at metadata id 133; records with smaller ids
# are not processed by this call.
create_audios(1, META_DATA_LOCALIZED_PATH, start_index=133)

In [None]:
# Pack the contents of the part1 directory into dataset_part1.zip.
shutil.make_archive('dataset_part1', 'zip', f'{BASIC_PATH}/part1')

In [None]:
import tarfile

# Pack the part1 directory into a gzip-compressed tarball; inside the archive the
# directory is stored under the name dataset_part1.
with tarfile.open('dataset_part1.tar.gz', 'w:gz') as archive:
    archive.add(f'{BASIC_PATH}/part1', arcname='dataset_part1')

In [None]:
# Delete the gzip tarball of part 1 from the working directory.
os.remove('dataset_part1.tar.gz')

In [None]:
# Render a link to the part 1 zip archive in the notebook output.
FileLink(r'dataset_part1.zip')

In [None]:
# Generate the audio for part 2, starting at metadata id 8641.
create_audios(2, META_DATA_LOCALIZED_PATH, start_index=8641)

In [None]:
# bad_ids = [(9710, 2), (19729, 4), (20309, 4), (26563, 5), (27331, 5), (27334, 5), (27377, 5), (27679, 6)]

In [None]:
def replace_bad_sentences(ids, start_index_bad_sentences=0):
    """Re-synthesize records with replacement text taken from ``unaccepted_sentences``.

    Each ``(id, part_id)`` pair is given the next sentence from the global
    ``unaccepted_sentences`` list, starting at ``start_index_bad_sentences`` and
    advancing by one per pair whether or not the replacement is accepted. The audio
    is generated with ``tts2`` (threshold 18) and written over the record's existing
    file. When ``tts2`` accepts the clip, the text is updated in both
    ``localized_meta_data`` and ``global_meta_data``; otherwise the pair is printed.

    Args:
        ids: Iterable of ``(id, part_id)`` pairs to replace.
        start_index_bad_sentences: Index into ``unaccepted_sentences`` of the first
            replacement sentence.

    Raises:
        ValueError: If an id is not found at the expected position of its part or of
            the global metadata. All pairs are checked before any audio is generated,
            because a mismatched pair previously produced a negative or wrong offset
            and overwrote an unrelated record and audio file.
    """
    ids = list(ids)  # iterated twice: validation pass, then processing pass

    for _id, part_id in ids:
        part_entries = localized_meta_data[f'part{part_id}']
        # Ids are treated as consecutive within a part, so the difference is the offset.
        final_index = _id - part_entries[0]['id'] if part_entries else -1
        if not 0 <= final_index < len(part_entries) or part_entries[final_index]['id'] != _id:
            raise ValueError(f'id {_id} was not found in part{part_id}')
        # The global list is addressed as 1-based consecutive ids.
        if not 0 < _id <= len(global_meta_data) or global_meta_data[_id - 1]['id'] != _id:
            raise ValueError(f'id {_id} was not found in the global metadata')

    c = start_index_bad_sentences
    for _id, part_id in tqdm(ids):
        part_entries = localized_meta_data[f'part{part_id}']
        entry = part_entries[_id - part_entries[0]['id']]
        replaced_sentence = ' '.join(unaccepted_sentences[c])
        is_ok = tts2(replaced_sentence, rate=22050, write_to_file=True, path=entry['file_path'], return_auido=False, threshold=18)
        if is_ok:
            entry['text'] = replaced_sentence
            global_meta_data[_id - 1]['text'] = replaced_sentence
        else:
            # Report the pair so it can be retried with another sentence.
            print((_id, part_id))
        c += 1
    print(f'c is {c}')

In [None]:
# (id, part) pairs whose audio has to be replaced. The list was only present as a
# commented-out line, so this call raised NameError on a fresh kernel.
bad_ids = [(9710, 2), (19729, 4), (20309, 4), (26563, 5), (27331, 5), (27334, 5), (27377, 5), (27679, 6)]
replace_bad_sentences(bad_ids, 0)

In [None]:
# Replace record 27377 of part 5 again, this time taking the replacement text from
# index 11 of unaccepted_sentences.
bad_ids = [(27377, 5)]
replace_bad_sentences(bad_ids, 11)

In [None]:
# Write the (possibly edited) metadata back to disk: the flat list first, then the
# per-part mapping.
META_DATA_LOCALIZED_PATH = f'{BASIC_PATH}/localized_metadata.json'
for json_path, payload in (
    (f'{BASIC_PATH}/global_metadata.json', global_meta_data),
    (META_DATA_LOCALIZED_PATH, localized_meta_data),
):
    with open(json_path, 'w') as f:
        json.dump(payload, f)

In [None]:
# Pack the contents of the part2 directory into dataset_part2.zip.
shutil.make_archive('dataset_part2', 'zip', f'{BASIC_PATH}/part2')

In [None]:
# FileLink(r'dataset_part2.zip')

In [None]:
# shutil.make_archive('dataset_part4', 'zip', f'{BASIC_PATH}/part4')

In [None]:
# FileLink(r'dataset_part4.zip')

In [None]:
# os.remove('dataset_part2.zip')

In [None]:
# os.remove('dataset_part6.zip')

In [None]:
# shutil.make_archive('dataset_part5', 'zip', f'{BASIC_PATH}/part5')

In [None]:
# FileLink(r'dataset_part5.zip')

In [None]:
# shutil.make_archive('dataset_part6', 'zip', f'{BASIC_PATH}/part6')

In [None]:
# FileLink(r'dataset_part6.zip')

In [None]:
# shutil.rmtree('dataset/part2')
# shutil.rmtree('dataset/part4')
# shutil.rmtree('dataset/part5')

In [None]:
# create_audios(3, META_DATA_LOCALIZED_PATH)

In [None]:
# os.remove('dataset_part6.zip')

In [None]:
# shutil.make_archive('dataset_part3', 'zip', f'{BASIC_PATH}/part3')

In [None]:
# FileLink(r'dataset_part3.zip')

In [None]:
# part_id = 5
# start_index = localized_meta_data[f'part{part_id}'][0]['id']
# final_index = 22080 - start_index 

# print(localized_meta_data[f'part{part_id}'][final_index])
# _id = localized_meta_data[f'part{part_id}'][final_index]['id']
# Audio(filename=f'{BASIC_PATH}/part{part_id}/audios/audio_{_id}.wav')

In [None]:
# create_audios(4, META_DATA_LOCALIZED_PATH, 20310)

In [None]:
# os.remove('dataset_part3.zip')

In [None]:
# create_audios(5, META_DATA_LOCALIZED_PATH, 27378)

In [None]:
# create_audios(6, META_DATA_LOCALIZED_PATH, 27680)

In [None]:
# shutil.rmtree('dataset/part3')

In [None]:
# create_audios(7, META_DATA_LOCALIZED_PATH, 35321)

In [None]:
# create_audios(8, META_DATA_LOCALIZED_PATH)

In [None]:
# create_audios(9, META_DATA_LOCALIZED_PATH)

In [None]:
# Generate the audio for part 10, starting at metadata id 51165.
create_audios(10, META_DATA_LOCALIZED_PATH, 51165)

In [None]:
# Pack the contents of the part7 directory into dataset_part7.zip.
shutil.make_archive('dataset_part7', 'zip', f'{BASIC_PATH}/part7')

In [None]:
# Render a link to the part 7 zip archive in the notebook output.
FileLink(r'dataset_part7.zip')

In [None]:
# Pack the contents of the part8 directory into dataset_part8.zip.
shutil.make_archive('dataset_part8', 'zip', f'{BASIC_PATH}/part8')

In [None]:
# Render a link to the part 8 zip archive in the notebook output.
FileLink(r'dataset_part8.zip')

In [None]:
# Pack the contents of the part10 directory into dataset_part10.zip, then render a
# link to the archive in the notebook output.
shutil.make_archive('dataset_part10', 'zip', f'{BASIC_PATH}/part10')
FileLink(r'dataset_part10.zip')

In [None]:

# Example: unpacking one of the zip archives.
# 'example.zip' and 'target_dir' are placeholder names to be replaced by the reader.
import zipfile
with zipfile.ZipFile('example.zip', 'r') as archive:
    archive.extractall('target_dir')

# Dataset Histogram

In [None]:
import nltk
from nltk.corpus import brown

# Count the sentences of every Brown corpus category.
# NOTE(review): `sentences` is rebound here from the corpus sentences to a list of
# per-category counts; re-running earlier cells after this one will see the counts.
categories = brown.categories()
sentences = []
for cat in categories:
    sentences.append(len(brown.sents(categories=cat)))
print({name: count for name, count in zip(categories, sentences)})


In [None]:
import matplotlib.pyplot as plt

# Bar chart of sentence counts per category, drawn through the explicit Axes interface.
fig, ax = plt.subplots()
ax.bar(categories, sentences)
# Rotate the category names so they do not overlap.
ax.tick_params(axis='x', rotation=90)
ax.set_xlabel('Category')
ax.set_ylabel('Number of sentences')
plt.show()