In [1]:
from glob import glob

files = glob('*.parse')
files

['mnli.tsv.parse',
 'dumping-iium.tsv.parse',
 'dumping-parliament.tsv.parse',
 'qa.tsv.parse',
 'chatbot.tsv.parse',
 'dumping-wiki.tsv.parse',
 'dumping-news.tsv.parse',
 'dumping-pdf.tsv.parse',
 'summary.tsv.parse',
 'quora.tsv.parse',
 'snli.tsv.parse',
 'dumping-watpadd.tsv.parse',
 'stemming.tsv.parse',
 'news-title.tsv.parse',
 'synonym.tsv.parse']

In [2]:
from unidecode import unidecode
from tqdm import tqdm
import collections
import tensorflow as tf
maxlen = 1024

def create_int_feature(values):
    feature = tf.train.Feature(
        int64_list = tf.train.Int64List(value = list(values))
    )
    return feature


In [3]:
def get_inputs(x, filename):
    input_ids, input_masks, segment_ids, ys = [], [], [], []
    for i in tqdm(range(len(x))):
        tokens = x[i][0]
        input_id = tokens
        
        segment_id = [0] * len(tokens)
        input_mask = [1] * len(input_id)
        
        input_id = input_id + [0] * (maxlen - len(input_id))
        segment_id = segment_id + [0] * (maxlen - len(segment_id))
        input_mask = input_mask + [0] * (maxlen - len(input_mask))

        input_ids.append(input_id)
        input_masks.append(input_mask)
        segment_ids.append(segment_id)
        
        r = x[i][1]
        r = r + [0] * (maxlen - len(r))
        ys.append(r)
        
    r = tf.python_io.TFRecordWriter(f'{filename}.tfrecord')
    for i in tqdm(range(len(ys))):
        features = collections.OrderedDict()
        features['input_ids'] = create_int_feature(input_ids[i])
        features['input_mask'] = create_int_feature(input_masks[i])
        features['segment_ids'] = create_int_feature(segment_ids[i])
        features['y'] = create_int_feature(ys[i])
        tf_example = tf.train.Example(
            features = tf.train.Features(feature = features)
        )
        r.write(tf_example.SerializeToString())
    r.close()

In [4]:
import json

for file in files:
    print(file)

    with open(file) as fopen:
        data = json.load(fopen)
    
    get_inputs(data, file)

mnli.tsv.parse


100%|██████████| 387577/387577 [00:48<00:00, 7924.53it/s] 
100%|██████████| 387577/387577 [05:47<00:00, 1116.93it/s]


dumping-iium.tsv.parse


100%|██████████| 19477/19477 [00:01<00:00, 14919.61it/s]
100%|██████████| 19477/19477 [00:17<00:00, 1090.20it/s]


dumping-parliament.tsv.parse


100%|██████████| 12812/12812 [00:00<00:00, 13597.24it/s]
100%|██████████| 12812/12812 [00:11<00:00, 1084.96it/s]


qa.tsv.parse


100%|██████████| 98619/98619 [00:10<00:00, 9391.01it/s] 
100%|██████████| 98619/98619 [01:31<00:00, 1077.99it/s]


chatbot.tsv.parse


100%|██████████| 325598/325598 [00:37<00:00, 8583.86it/s] 
100%|██████████| 325598/325598 [04:57<00:00, 1093.41it/s]


dumping-wiki.tsv.parse


100%|██████████| 45764/45764 [00:04<00:00, 9956.19it/s] 
100%|██████████| 45764/45764 [00:42<00:00, 1084.32it/s]


dumping-news.tsv.parse


100%|██████████| 48592/48592 [00:06<00:00, 7894.79it/s] 
100%|██████████| 48592/48592 [00:43<00:00, 1118.30it/s]


dumping-pdf.tsv.parse


100%|██████████| 10101/10101 [00:00<00:00, 12582.01it/s]
100%|██████████| 10101/10101 [00:09<00:00, 1103.25it/s]


summary.tsv.parse


100%|██████████| 107472/107472 [00:11<00:00, 9000.12it/s] 
100%|██████████| 107472/107472 [01:37<00:00, 1098.24it/s]


quora.tsv.parse


100%|██████████| 403831/403831 [00:52<00:00, 7657.49it/s] 
100%|██████████| 403831/403831 [06:05<00:00, 1104.31it/s]


snli.tsv.parse


100%|██████████| 380288/380288 [00:48<00:00, 7761.48it/s] 
100%|██████████| 380288/380288 [05:44<00:00, 1104.69it/s]


dumping-watpadd.tsv.parse


100%|██████████| 24245/24245 [00:01<00:00, 14948.11it/s]
100%|██████████| 24245/24245 [00:21<00:00, 1108.62it/s]


stemming.tsv.parse


100%|██████████| 200000/200000 [00:23<00:00, 8423.83it/s] 
100%|██████████| 200000/200000 [02:59<00:00, 1111.57it/s]


news-title.tsv.parse


100%|██████████| 120410/120410 [00:15<00:00, 7990.70it/s] 
100%|██████████| 120410/120410 [01:47<00:00, 1117.19it/s]


synonym.tsv.parse


100%|██████████| 150000/150000 [00:17<00:00, 8791.39it/s] 
100%|██████████| 150000/150000 [02:17<00:00, 1091.58it/s]


In [2]:
files = glob('*.tfrecord')
files

['dumping-parliament.tsv.parse.tfrecord',
 'mnli.tsv.parse.tfrecord',
 'dumping-iium.tsv.parse.tfrecord',
 'qa.tsv.parse.tfrecord',
 'stemming.tsv.parse.tfrecord',
 'dumping-pdf.tsv.parse.tfrecord',
 'chatbot.tsv.parse.tfrecord',
 'synonym.tsv.parse.tfrecord',
 'dumping-watpadd.tsv.parse.tfrecord',
 'news-title.tsv.parse.tfrecord',
 'dumping-wiki.tsv.parse.tfrecord',
 'quora.tsv.parse.tfrecord',
 'summary.tsv.parse.tfrecord',
 'dumping-news.tsv.parse.tfrecord',
 'snli.tsv.parse.tfrecord']

In [3]:
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'mesolitica-storage.json'

In [4]:
from google.cloud import storage
client = storage.Client()
bucket = client.bucket('mesolitica-general')

In [5]:
for file in files:
    print(file)
    blob = bucket.blob(f'b2b-data/{file}')
    blob.upload_from_filename(file)

dumping-parliament.tsv.parse.tfrecord
mnli.tsv.parse.tfrecord
dumping-iium.tsv.parse.tfrecord
qa.tsv.parse.tfrecord
stemming.tsv.parse.tfrecord
dumping-pdf.tsv.parse.tfrecord
chatbot.tsv.parse.tfrecord
synonym.tsv.parse.tfrecord
dumping-watpadd.tsv.parse.tfrecord
news-title.tsv.parse.tfrecord
dumping-wiki.tsv.parse.tfrecord
quora.tsv.parse.tfrecord
summary.tsv.parse.tfrecord
dumping-news.tsv.parse.tfrecord
snli.tsv.parse.tfrecord


In [7]:
# !pip3 freeze