In [14]:
import os
import shutil

import librosa
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

import constants as c
from features import get_features
from text import encode

In [30]:
# Base manifest: one (wav path, spoken transcript) pair per recorded command.
train_df = pd.DataFrame(
    data=[
        ('SC/train/1.wav', 'deaf variable get max camel case'),
        ('SC/train/2.wav', 'brackets'),
        ('SC/train/3.wav', 'array one'),
        ('SC/train/4.wav', 'to be minus one enter'),
        ('SC/train/5.wav', 'for eye in range'),
        ('SC/train/6.wav', 'length brackets'),
        ('SC/train/7.wav', 'if array one of eye'),
        ('SC/train/8.wav', 'move right'),
        ('SC/train/9.wav', 'at least'),
        ('SC/train/10.wav', 'variable max value snake case'),
        ('SC/train/11.wav', 'new scope'),
        ('SC/train/12.wav', 'to be array one of eye'),
        ('SC/train/13.wav', 'return exit scope'),
        ('SC/train/14.wav', 'stop listening'),
    ],
    columns=["filename", "text"],
)

# Persist the base manifest next to the audio files.
train_df.to_csv(path_or_buf="SC/sc_train.csv", sep=",", header=True, index=False)

# Later cells iterate over `train` as rows of (path, transcript), so expose
# the manifest as a plain array under that name.
train = train_df.to_numpy()

# Filled by the per-user loop further down with (file name, transcript) rows.
new_train = list()

In [None]:
# Speaker tags; each one is appended to a base recording name to form the
# per-user file name in the caching loop.
users = [
    'moaz',
    'hamada',
    'khaled',
]

In [27]:
# Build the per-user manifest and cache padded model inputs (X) and one-hot
# targets (y, y_lag) on disk as .npy files next to each per-user file name.

# Reset here so that re-running this cell on its own does not append
# duplicate rows to the manifest.
new_train = []

# Loop-invariant: id of the padding token and its one-hot row of shape (1, n_output).
pad_id = encode(c.pad_token)[0]
pad_row = np.reshape(to_categorical(pad_id, num_classes=c.n_output), (1, -1))

for user in users:
    for path, transcript in train:
        # splitext strips only the final extension, so dots elsewhere in the
        # path (e.g. a leading './') no longer truncate the name.
        file_name = os.path.splitext(path)[0] + user + '.wav'
        new_train.append((file_name, transcript))

        # NOTE(review): features are computed from the base recording `path`,
        # not from the per-user `file_name`, so every user gets identical
        # features for a given utterance — confirm this is intended.
        features_file = file_name + '.X.npy'
        if not os.path.exists(features_file):
            X = get_features(librosa.load(path, sr=c.sample_rate)[0])
            X = pad_sequences([X], padding='post', value=c.masking_value, dtype=np.float32, maxlen=c.max_X_seq_len)[0]
            np.save(features_file, X)

        targets_file = file_name + '.y.npy'
        if not os.path.exists(targets_file):
            # Encode "<start> transcript <end>", pad to a fixed length, then one-hot.
            _y = encode(c.start_token + transcript + c.end_token)
            _y = pad_sequences([_y], padding='post', value=pad_id, dtype=np.float32, maxlen=c.max_y_seq_len)[0]
            y = to_categorical(_y, num_classes=c.n_output)

            # y_lag is y shifted one step earlier: drop the first row and
            # append a pad row so both arrays keep the same length.
            y_lag = np.concatenate([y[1:], pad_row], axis=0)

            # Saved as a single array stacking [y, y_lag] along a new first axis.
            np.save(targets_file, np.array([y, y_lag]))

In [None]:
# Replace the base manifest on disk with the per-user manifest collected in `new_train`.
(
    pd.DataFrame(data=new_train, columns=["filename", "text"])
    .to_csv(path_or_buf="SC/sc_train.csv", sep=",", header=True, index=False)
)

In [None]:
# Zip the contents of the SC/ directory into SC.zip in the current working
# directory. `shutil` is imported in the notebook's top import cell.
shutil.make_archive(base_name='SC', format='zip', root_dir='SC')

In [None]:
from google.cloud import storage

# Upload the zipped dataset to Google Cloud Storage.
bucketName = 'cv-dataset'    # destination bucket
bucketFolder = 'cv-dataset'  # prefix ("folder") inside the bucket
localFile = 'SC.zip'         # archive produced by the previous cell
cloudFile = 'SC.zip'         # object name under the prefix

# NOTE(review): the client is created without explicit credentials, so they
# must be resolvable from the environment — confirm for the target runtime.
storage_client = storage.Client()
bucket = storage_client.get_bucket(bucketName)

# Object names are always '/'-separated; os.path.join would insert a
# backslash on Windows and create a differently named object.
blob = bucket.blob(f'{bucketFolder}/{cloudFile}')
blob.upload_from_filename(localFile)