In [1]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
import tensorflow as tf
from glob import glob
import string
import requests
import shutil
import random
import json
from multiprocessing import Pool

CTC_VOCAB = [''] + list(string.ascii_lowercase + string.digits) + [' ']



2022-06-10 14:54:12.954543: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


In [3]:
def download_file_cloud(url, filename):
    r = requests.get(url, stream=True)
    total_size = int(r.headers['content-length'])
    version = int(r.headers.get('X-Bz-Upload-Timestamp', 0))
    try:
        local_size = os.path.getsize(filename)
        if local_size == total_size:
            print(f'{filename} local size matched with cloud size')
            return version
    except Exception as e:
        print(e)
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    with open(filename, 'wb') as f:
        for data in r.iter_content(chunk_size=1_048_576):
            f.write(data)

In [4]:
def get_dataset(files, directory='tfrecord', overwrite_directory=True):
    os.makedirs(directory, exist_ok=True)
    if overwrite_directory:
        shutil.rmtree(directory)
    files_to_download = []
    for f in files:
        filename = os.path.join(directory, '-'.join(f.split('/')[-2:]))
        files_to_download.append((f, filename))

    pool = Pool(processes=len(files))
    pool.starmap(download_file_cloud, files_to_download)
    pool.close()
    pool.join()
    tfrecords = glob(f'{directory}/*.tfrecord')
    return tfrecords

In [5]:
def preprocess_inputs(example):
    length = tf.cast(tf.shape(example['waveforms'])[0], tf.int32)
    length = tf.expand_dims(length, 0)
    example['waveforms_length'] = length
    example['targets'] = tf.cast(example['targets'], tf.int32)
    example['targets_length'] = tf.cast(example['targets_length'], tf.int32)
    return example


def parse(serialized_example):

    data_fields = {
        'waveforms': tf.compat.v1.VarLenFeature(tf.float32),
        'targets': tf.compat.v1.VarLenFeature(tf.int64),
        'targets_length': tf.compat.v1.VarLenFeature(tf.int64),
        'lang': tf.compat.v1.VarLenFeature(tf.int64),
    }
    features = tf.compat.v1.parse_single_example(
        serialized_example, features=data_fields
    )
    for k in features.keys():
        features[k] = features[k].values

    features = preprocess_inputs(features)

    keys = list(features.keys())
    for k in keys:
        if k not in ['waveforms', 'waveforms_length', 'targets', 'targets_length']:
            features.pop(k, None)

    return features

In [6]:
files = glob('tfrecord-300m-test/*.tfrecord')
files

['tfrecord-300m-test/malay-2-25.tfrecord']

In [7]:
get_dataset(['https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/mandarin/0-35.tfrecord'])

[Errno 2] No such file or directory: 'tfrecord/mandarin-0-35.tfrecord'


['tfrecord/mandarin-0-35.tfrecord']

In [8]:
batch_files = 10

def generate(files, directory):
    while True:
        random.shuffle(files)
        for i in range(0, len(files), batch_files):
            batch = files[i: i + batch_files]
            batch = [b.decode() if isinstance(b, bytes) else b for b in batch]
            directory = directory.decode() if isinstance(directory, bytes) else directory
            r = get_dataset(batch, directory = directory)
            print(r)
            yield r

In [9]:
def get_datasets(files, directory, batch_size=2, shuffle_size=32, num_cpu_threads = 4, thread_count=24):
    def get():
        d = tf.data.Dataset.from_generator(
            generate, tf.string, output_shapes=tf.TensorShape([None]), args=(files,directory),
        )
        d = d.repeat(3)
        d = d.interleave(
            tf.data.TFRecordDataset,
            cycle_length=num_cpu_threads,
            block_length=thread_count)
        d = d.shuffle(buffer_size=100)
        d = d.map(parse, num_parallel_calls=num_cpu_threads)
        d = d.padded_batch(
            batch_size,
            padded_shapes={
                'waveforms': tf.TensorShape([None]),
                'waveforms_length': tf.TensorShape([None]),
                'targets': tf.TensorShape([None]),
                'targets_length': tf.TensorShape([None]),
            },
            padding_values={
                'waveforms': tf.constant(0, dtype=tf.float32),
                'waveforms_length': tf.constant(0, dtype=tf.int32),
                'targets': tf.constant(0, dtype=tf.int32),
                'targets_length': tf.constant(0, dtype=tf.int32),
            },
        )
        return d
    return get

In [10]:
with open('huggingface-3mixed-train-test.json') as fopen:
    dataset = json.load(fopen)['train'][:5]
dataset

['https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/mandarin/0-35.tfrecord',
 'https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/mandarin/0-97.tfrecord',
 'https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/singlish/3-7.tfrecord',
 'https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/singlish/0-17.tfrecord',
 'https://huggingface.co/huseinzol05/STT-Mixed-TFRecord/resolve/main/mandarin/2-103.tfrecord']

In [11]:
d = get_datasets(dataset, 'tfrecord-test')()
d = d.make_one_shot_iterator().get_next()
d

{'targets': <tf.Tensor 'IteratorGetNext:0' shape=(?, ?) dtype=int32>,
 'targets_length': <tf.Tensor 'IteratorGetNext:1' shape=(?, ?) dtype=int32>,
 'waveforms': <tf.Tensor 'IteratorGetNext:2' shape=(?, ?) dtype=float32>,
 'waveforms_length': <tf.Tensor 'IteratorGetNext:3' shape=(?, ?) dtype=int32>}

In [12]:
sess = tf.Session()




2022-06-10 14:54:24.927370: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-06-10 14:54:24.929981: E tensorflow/stream_executor/cuda/cuda_driver.cc:282] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-06-10 14:54:24.930002: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop
2022-06-10 14:54:24.930006: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop
2022-06-10 14:54:24.930049: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: 470.129.6
2022-06-10 14:54:24.930072: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6
2022-06-10 14:54:24.930075: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:310] kernel version seems to match DSO: 470.129.6
2022-06-10 14:54:24.935473: I tensorflow/

In [14]:
sess.run(d)

{'targets': array([[ 6,  1, 14,  7, 37, 25,  9, 37, 19,  8, 15, 21, 37, 12,  9, 21,
         37,  2, 15, 37,  4,  5, 37,  7,  5,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0],
        [24,  9,  5, 37,  7,  5, 37, 19,  8,  9, 37, 24,  9,  1, 14, 37,
         24,  9,  5, 37,  7,  5, 37,  3,  9, 37,  8, 21,  1, 14, 37, 19,
          8,  9, 37, 24,  9,  1, 14, 37, 24,  9,  5, 37, 17, 21, 37, 26,
          9, 37, 14,  9]], dtype=int32),
 'targets_length': array([[25],
        [52]], dtype=int32),
 'waveforms': array([[ 0.        , -0.00655022, -0.00436681, ...,  0.        ,
          0.        ,  0.        ],
        [ 0.00215983, -0.00431965, -0.00431965, ..., -0.00215983,
          0.        ,  0.        ]], dtype=float32),
 'waveforms_length': array([[69632],
        [96939]], dtype=int32)}