In [None]:
# Environment setup for the Colab runtime: install dependencies, mount Drive,
# import libraries and make the project-local modules importable.

# Reload imported modules automatically before each cell runs, so edits to the
# project modules on Drive are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Installed at runtime; this has to happen before the `transformers` import below.
!pip install transformers
!pip install sentencepiece

# Mount Google Drive; the module folder and the data paths used in this
# notebook all live under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

import random
import numpy as np
import pandas as pd

import tensorflow as tf
# Enable XLA JIT compilation through the TensorFlow optimizer settings.
tf.config.optimizer.set_jit(True)

from tqdm.auto import tqdm
from transformers import XLMRobertaTokenizer, BartTokenizer, RobertaTokenizer, T5Tokenizer

import os
import yaml
from pathlib import Path

import sys
# The project helper modules are plain .py files stored on Drive, so their
# folder must be on sys.path before the import on the next line can succeed.
sys.path.append('/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/modules')
import pipeline, protobuf_handler, md_code_rep_fc, order_check_fc, multi_rep_fc
# NOTE(review): later cells also reference a `quad_search_fc` module that is not
# imported here — confirm whether it still exists before selecting that option.

print('Libraries Imported')

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.20.1-py3-none-any.whl (4.4 MB)
[K     |████████████████████████████████| 4.4 MB 14.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.9 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 61.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 62.2 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninsta

In [None]:
# Unpack the competition archive stored on Drive into the local disk path.
# `disk_path` is reused by later cells as the root of the extracted data.
disk_path = '/content'
data_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/AI4Code.zip'
pipeline.unzip_files(data_path, disk_path)

Unzipping files:   0%|          | 0/139263 [00:00<?, ?it/s]


 Done unzipping data to disk path.


In [None]:
# Get correct order of documents.
# `train_orders.csv` is read with `id` as the index; the single remaining
# column is squeezed into a Series and each value is split on whitespace.
# `read_csv(..., squeeze=True)` is deprecated since pandas 1.4 and removed in
# pandas 2.0, so the squeeze is done explicitly on the columns axis instead.
data_dir = Path(disk_path)
df_orders = (pd.read_csv(data_dir / 'train_orders.csv', index_col='id')
             .squeeze('columns')
             .str.split())

In [None]:
# Load in excluded IDs and drop them from the data filepath
excluded_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/excluded_ids.yaml'

# Available options: 'similars', 'dupe_md', 'single_code_cell', 'less_than_five', 'less_than_eight', 'less_than_four', 'single_md', 'non_english_ids', 'outliers'
excluded_keys = ['similars']  # can consider similars for md_pointwise
excluded_ids = pipeline.load_yaml_file(excluded_ids_path)
# Keep the 'dupe_md' ids separately: `excluded_ids` is reassigned on the next line.
dupe_ids = excluded_ids['dupe_md']
excluded_ids = pipeline.remove_only_excluded_keys(excluded_ids, excluded_keys)

random_seed = 20
num_tfrecord_files = 96  # number of TFRecord files the data is split into

# Path.glob yields entries in arbitrary, filesystem-dependent order, so a seeded
# shuffle alone is not reproducible across runs or machines. Sorting first gives
# the shuffle a deterministic starting point.
# NOTE(review): this changes which documents land in which index range compared
# with TFRecords written from an unsorted listing.
data_paths = sorted((data_dir / 'train').glob('*.json'))
random.Random(random_seed).shuffle(data_paths)  # Shuffle data paths list
data_paths = pipeline.remove_excluded_id_paths(data_paths, excluded_ids)

print(f"There are {len(data_paths)} files after removing excluded ids.")
print(f"Each TFRecord file will contain {round(len(data_paths)/num_tfrecord_files)} files.")
# One more boundary than files: consecutive boundaries delimit one file each.
print(f"With indexes as {np.linspace(0, len(data_paths), num=num_tfrecord_files + 1, dtype=np.int32)}")

Safe loaded in yaml file.
There are 130271 files after removing excluded ids.
Each TFRecord file will contain 1357 files.
With indexes as [     0   1356   2713   4070   5427   6784   8141   9498  10855  12212
  13569  14926  16283  17640  18997  20354  21711  23068  24425  25782
  27139  28496  29853  31210  32567  33924  35281  36638  37995  39352
  40709  42066  43423  44780  46137  47494  48851  50208  51565  52922
  54279  55636  56993  58350  59707  61064  62421  63778  65135  66492
  67849  69206  70563  71920  73277  74634  75991  77348  78705  80062
  81419  82776  84133  85490  86847  88204  89561  90918  92275  93632
  94989  96346  97703  99060 100417 101774 103131 104488 105845 107202
 108559 109916 111273 112630 113987 115344 116701 118058 119415 120772
 122129 123486 124843 126200 127557 128914 130271]


In [None]:
# Select Feature Creation Type and Transformer Model
# Mean tokens in a markdown cell is 112.53 and median is 50.0
# 11 markdown count per document median and 15 mean

# Mean tokens in a code cell is 43.66 and median is 15.0
# 23 code count per document median and 30 mean

multi_trans_model = 'multi'
quad_trans_model = 'gbert'
folder_name = 'multi_gbert_md_code_rep'
feature_creation_type = 'md_code_rep_fc'
# NOTE(review): the captured outputs of the feature cells show two
# "Encoding Text for Input Ids" passes, which only the 'multi_rep_fc' branch
# performs — confirm which type this notebook is meant to run with.

# Initialize Model Name and Tokenizer
# Each entry is [pretrained checkpoint name, tokenizer class].
trans_options = {'multi': ['xlm-roberta-base', XLMRobertaTokenizer],
                 'cbert': ['microsoft/codebert-base', RobertaTokenizer],
                 'gbert': ['microsoft/graphcodebert-base', RobertaTokenizer],
                 't5': ['Salesforce/codet5-base', RobertaTokenizer]}

# Per feature-creation type: [TFRecord writer, protobuf decoder, tensor parser],
# optionally followed by a list of extra code-split tokens.
# (The list previously ended with `}` instead of `]}`, which is a SyntaxError.)
fc_reqs = {'md_code_rep_fc': [protobuf_handler.write_ensemble_set_to_tfrecords,
                              protobuf_handler.decode_ensemble_protobuf,
                              protobuf_handler.parse_ensemble_tensor_arrays]}

selected_fc_reqs = fc_reqs[feature_creation_type]
write_tensor_arrays_to_tfrecords = selected_fc_reqs[0]
decode_protobuf = selected_fc_reqs[1]
parse_tensor_arrays = selected_fc_reqs[2]
# Only three handlers are registered above, so a fixed `[3]` lookup raises
# IndexError; fall back to "no extra tokens" when the fourth entry is absent.
# NOTE(review): confirm whether this type should register code-split tokens.
code_split_tokens = list(selected_fc_reqs[3]) if len(selected_fc_reqs) > 3 else []

# Tokenizer for the model selected by `multi_trans_model`.
selected_trans = trans_options[multi_trans_model]
multi_trans_name = selected_trans[0]
multi_tokenizer_class = selected_trans[1]
multi_tokenizer = multi_tokenizer_class.from_pretrained(multi_trans_name)

# Tokenizer for the model selected by `quad_trans_model`.
selected_trans = trans_options[quad_trans_model]
quad_trans_name = selected_trans[0]
quad_tokenizer_class = selected_trans[1]
quad_tokenizer = quad_tokenizer_class.from_pretrained(quad_trans_name)

In [None]:
# Register extra tokens with both tokenizers.
# Candidate frequent tokens, currently disabled (list left empty):
#   'dataframe', 'dataset', 'figsize', 'kaggle', 'keras', 'loc', 'plt',
#   'read_csv', 'sklearn', 'sns', 'matplotlib.pyplot',
#   'x_train', 'x_test', 'y_train', 'y_test'
common_custom_tokens = []
unique_tokens = ['[DIVIDER]', '[EMPTY]', '[EMOJI]']

# Order: the unique markers, then the code-split tokens from the configuration
# cell (the author's note mentions "Quadrant 0:" .. "Quadrant 3:" — TODO confirm),
# then the optional common tokens.
custom_tokens = unique_tokens + code_split_tokens + common_custom_tokens

# Same helper applied to each tokenizer in turn: multi first, then quad.
multi_tokenizer, quad_tokenizer = [
  pipeline.add_custom_tokens_to_tokenizer(tokenizer_obj, custom_tokens)
  for tokenizer_obj in (multi_tokenizer, quad_tokenizer)
]

Custom tokens have been added to the tokenizer
Custom tokens have been added to the tokenizer


### Create One Dataset to Test

In [None]:
# Read a small slice of the shuffled file list into memory and preprocess its
# text (formatting and stemming, per the author's note).

start_idx, end_idx = 51002, 51100

doc_ids, cell_metadata, unprocessed_text = pipeline.load_and_parse_data(data_paths, start_idx, end_idx)

# Flatten the per-document metadata mappings into one list of their values.
code_markdown_locs = [loc for doc_cells in cell_metadata for loc in doc_cells.values()]

text = pipeline.preprocess_text(unprocessed_text, code_markdown_locs)

Loading Json Files:   0%|          | 0/98 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/4155 [00:00<?, ?it/s]

In [None]:
# Prepare Features and Labels for the sample slice.
# Every branch must leave `features` and `labels` defined for the cells below;
# an unrecognised type raises instead of silently leaving them unset or stale.
if feature_creation_type == 'quad_search_fc':
  # NOTE(review): `quad_search_fc`, `input_ids` and `tokenizer` are not imported
  # or assigned in the cells visible in this notebook — confirm before use.
  md_quadrant_locs = quad_search_fc.get_quadrant_locs(doc_ids, cell_metadata, df_orders)
  md_code_groupings, labels = quad_search_fc.collect_md_code_groupings(md_quadrant_locs, input_ids, cell_metadata)
  features = quad_search_fc.create_quadrant_features(md_code_groupings, tokenizer)
  labels = quad_search_fc.hot_end_encode_labels(labels)
  features, labels = quad_search_fc.sample_balance_shuffle(features, labels)

elif feature_creation_type == 'md_code_rep_fc':
  # NOTE(review): `input_ids` and `tokenizer` are not assigned in the cells
  # visible in this notebook — confirm where they are expected to come from.
  md_pct_ranks = md_code_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
  md_groupings, md_quad_groupings, labels = md_code_rep_fc.collect_md_code_groupings(input_ids, cell_metadata, md_pct_ranks)
  md_features = md_code_rep_fc.create_features(md_groupings, tokenizer)
  md_quad_features = md_code_rep_fc.create_features(md_quad_groupings, tokenizer)
  features = (md_features[0], md_features[1], md_quad_features[0], md_quad_features[1])

elif feature_creation_type == 'multi_rep_fc':
  # The same text is encoded once per tokenizer; each encoding feeds its own
  # feature pair in the final 4-tuple.
  multi_input_ids = pipeline.encode_text_for_input_ids(text, multi_tokenizer)
  quad_input_ids = pipeline.encode_text_for_input_ids(text, quad_tokenizer)
  md_pct_ranks = multi_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
  md_groupings, md_quad_groupings, labels = multi_rep_fc.collect_md_code_groupings(multi_input_ids,
                                                                                   quad_input_ids,
                                                                                   cell_metadata,
                                                                                   md_pct_ranks,
                                                                                   doc_ids,
                                                                                   dupe_ids)
  md_features = multi_rep_fc.create_features(md_groupings, multi_tokenizer)
  quad_features = multi_rep_fc.create_features(md_quad_groupings, quad_tokenizer)
  features = (md_features[0], md_features[1], quad_features[0], quad_features[1])

# Exact comparison: `in 'order_check_fc'` was a substring test and also
# accepted values such as '' or 'order'.
elif feature_creation_type == 'order_check_fc':
  # NOTE(review): `input_ids` and `tokenizer` are not assigned in the cells
  # visible in this notebook — confirm where they are expected to come from.
  input_ids = order_check_fc.adjust_input_ids_for_ordering(input_ids, cell_metadata)
  orders = order_check_fc.get_orders(doc_ids, cell_metadata, df_orders)
  idx_map, md_locs = order_check_fc.create_idx_map(cell_metadata)
  labels = order_check_fc.create_labels(idx_map, orders)
  idx_map, md_locs, labels = order_check_fc.shuffle_and_adjust_class_imbalance(idx_map, md_locs, labels) # Please address for samples
  mapped_input_ids = order_check_fc.index_input_ids_to_map(input_ids, idx_map)
  features = order_check_fc.add_special_tokens_and_masks(mapped_input_ids,
                                                         md_locs,
                                                         tokenizer)

else:
  raise ValueError(f"Unsupported feature_creation_type: {feature_creation_type!r}")

Encoding Text for Input Ids:   0%|          | 0/4155 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/4155 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/98 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/3198 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/3198 [00:00<?, ?it/s]

In [None]:
# Write the sample features and labels to a TFRecord file named after the
# document index range it covers.
sample_filepath = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/tf_data/{folder_name}/{start_idx}-{end_idx}.tfrecords'
# Make sure the target folder exists so the write does not depend on it having
# been created by hand beforehand.
Path(sample_filepath).parent.mkdir(parents=True, exist_ok=True)
write_tensor_arrays_to_tfrecords(features, labels, sample_filepath)
print(f"Finished dataset for {start_idx}-{end_idx}")

Writing TFRecords to Disk:   0%|          | 0/3198 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 51002-51100


### Double Check Created Features

In [None]:
# Inspect the feature arrays and labels before they are read back from disk.
# A single print call with newline separators emits exactly the same text as
# six consecutive print calls.
print(f'Input Ids:\n{features[0]}\n',
      f'Attention masks:\n{features[1]}\n',
      f'Quad II:\n{features[2]}\n',
      f'QUAD AM:\n{features[3]}\n',
      f'Labels:\n{labels[:10]}\n',
      f'Labels Shapes (Please Ensure 2D): {labels.shape}',
      sep='\n')

Input Ids:
[[     0  10842     25 ...      1      1      1]
 [     0  10842     25 ...      1      1      1]
 [     0  10842     25 ...      1      1      1]
 ...
 [     0  50656 170038 ...      1      1      1]
 [     0    901  83390 ...      1      1      1]
 [     0   5188  27543 ...      1      1      1]]

Attention masks:
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]

Quad II:
[[    0  7939    18 ...     1     1     1]
 [    0  7939    18 ...     1     1     1]
 [    0  7939    18 ...     1     1     1]
 ...
 [    0 25417    48 ...     1     1     1]
 [    0 25417  4333 ...     1     1     1]
 [    0 25417 17772 ...     1     1     1]]

QUAD AM:
[[1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]
 [1 1 1 ... 0 0 0]]

Labels:
[[ 0.03571427]
 [ 0.03571427]
 [ 0.03571427]
 [-0.3214286 ]
 [-0.3214286 ]
 [-0.3214286 ]
 [-0.10714287]
 [-0.10714287]
 [-0.10714287]
 [-

In [None]:
# Manually parse TFRecords: read the sample file back through the decode/parse
# functions and print one batch to check the written data round-trips.
auto = tf.data.AUTOTUNE  # non-experimental name for tf.data.experimental.AUTOTUNE
batch_size = 32
epochs = 2
print(f'Sample Filepath is {sample_filepath}')

sample_dataset = (tf.data.TFRecordDataset(sample_filepath, num_parallel_reads=auto)
                  .map(decode_protobuf, num_parallel_calls=auto)
                  .map(parse_tensor_arrays, num_parallel_calls=auto)
                  .shuffle(len(labels)+1, seed=42)
                  .repeat(epochs)
                  .batch(batch_size, drop_remainder=True, num_parallel_calls=auto)
                  .prefetch(auto))
print(sample_dataset)

for batch_features, batch_labels in sample_dataset.take(1):
  print(f'\nParsed Input Ids:\n{batch_features[0]}\n')
  print(f'\nFirst Input Id:\n{batch_features[0][0]}\n')
  print(f'Parsed Attention Mask:\n{batch_features[1]}\n')
  print(f'\nFirst Attention Mask Id:\n{batch_features[1][0]}\n')

  # Optional: inspect the second feature pair as well.
  # print(f'Parsed Quad Input Ids:\n{batch_features[2]}\n')
  # print(f'Parsed Quad Attention Mask:\n{batch_features[3]}\n')
  print(f'Parsed Labels:\n{batch_labels}\n')
  print(f'\nFirst Label:\n{batch_labels[0]}\n')

# Remove sample file to avoid noise in final TFRecords folder.
# Only a missing file means "already deleted"; any other OS error is reported
# as such instead of being hidden behind that message. Cleanup stays
# best-effort, so the cell still completes.
try:
  os.remove(sample_filepath)
  print('Deleted sample file to avoid noise creation in final TFRecords folder.')
except FileNotFoundError:
  print('Sample file has already been deleted.')
except OSError as err:
  print(f'Could not delete sample file: {err}')

Sample Filepath is /content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/tf_data/multi_gbert_md_code_rep/51002-51100.tfrecords
<PrefetchDataset element_spec=((TensorSpec(shape=(32, 512), dtype=tf.int32, name=None), TensorSpec(shape=(32, 512), dtype=tf.int32, name=None), TensorSpec(shape=(32, 512), dtype=tf.int32, name=None), TensorSpec(shape=(32, 512), dtype=tf.int32, name=None)), TensorSpec(shape=(32, 1), dtype=tf.float32, name=None))>

Parsed Input Ids:
[[     0  17006  11814 ...      1      1      1]
 [     0   1401    765 ...      1      1      1]
 [     0 228006  20763 ...      1      1      1]
 ...
 [     0  56888  50462 ...      1      1      1]
 [     0      6 108369 ...      1      1      1]
 [     0   2367      6 ...      1      1      1]]


First Input Id:
[     0  17006  11814    387  34153      2  92966    214     70   5303
    111   7279  47416      7      4     87    765    914   9319    297
     70  35064   3016    271      8    420  87388    454  19111  474

### Create all TFRecord Datasets

In [None]:
# Create all TFRecord datasets: the shuffled file list is cut into
# `num_tfrecord_files` contiguous index ranges and each range is loaded,
# preprocessed, converted to features and written to its own TFRecord file.

num_tfrecord_files = 96

# Fail before any heavy work if the selected type has no branch in the loop
# below. Without this check an unhandled type (e.g. 'order_check_fc') would
# write whatever `features`/`labels` were left over from an earlier cell into
# every output file.
supported_fc_types = ('quad_search_fc', 'md_code_rep_fc', 'multi_rep_fc')
if feature_creation_type not in supported_fc_types:
  raise ValueError(f"Unsupported feature_creation_type for full dataset creation: {feature_creation_type!r}")

# One more boundary than files: consecutive boundaries delimit one file each.
all_dataset_splits = np.linspace(0, len(data_paths), num=num_tfrecord_files + 1, dtype=np.int32)
avg_samples_per_file = []  # number of samples written per file, averaged at the end

for start_idx, end_idx in zip(all_dataset_splits[:-1], all_dataset_splits[1:]):
  filepath = f'/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/tf_data/{folder_name}/{start_idx}-{end_idx}.tfrecords'
  # Make sure the target folder exists before writing into it.
  Path(filepath).parent.mkdir(parents=True, exist_ok=True)

  doc_ids, cell_metadata, text = pipeline.load_and_parse_data(data_paths, start_idx, end_idx)
  code_markdown_locs = [cell for cells in cell_metadata for cell in cells.values()]

  text = pipeline.preprocess_text(text, code_markdown_locs)
  # NOTE(review): the single-tokenizer encoding below is disabled, yet the
  # 'quad_search_fc' and 'md_code_rep_fc' branches still read `input_ids` and
  # `tokenizer` — confirm where those are expected to come from.
  #input_ids = pipeline.encode_text_for_input_ids(text, tokenizer)

  # Prepare Features and Labels
  if feature_creation_type == 'quad_search_fc':
    # NOTE(review): `quad_search_fc` is not imported in the cells visible in this notebook.
    md_quadrant_locs = quad_search_fc.get_quadrant_locs(doc_ids, cell_metadata, df_orders)
    md_code_groupings, labels = quad_search_fc.collect_md_code_groupings(md_quadrant_locs, input_ids, cell_metadata)
    features = quad_search_fc.create_quadrant_features(md_code_groupings, tokenizer)
    labels = quad_search_fc.hot_end_encode_labels(labels)
    features, labels = quad_search_fc.sample_balance_shuffle(features, labels)

  elif feature_creation_type == 'md_code_rep_fc':
    md_pct_ranks = md_code_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
    md_groupings, md_quad_groupings, labels = md_code_rep_fc.collect_md_code_groupings(input_ids, cell_metadata, md_pct_ranks)
    md_features = md_code_rep_fc.create_features(md_groupings, tokenizer)
    md_quad_features = md_code_rep_fc.create_features(md_quad_groupings, tokenizer)
    features = (md_features[0], md_features[1], md_quad_features[0], md_quad_features[1])

  elif feature_creation_type == 'multi_rep_fc':
    # The same text is encoded once per tokenizer; each encoding feeds its own
    # feature pair in the final 4-tuple.
    multi_input_ids = pipeline.encode_text_for_input_ids(text, multi_tokenizer)
    quad_input_ids = pipeline.encode_text_for_input_ids(text, quad_tokenizer)
    md_pct_ranks = multi_rep_fc.get_md_pct_ranks(doc_ids, cell_metadata, df_orders)
    md_groupings, md_quad_groupings, labels = multi_rep_fc.collect_md_code_groupings(multi_input_ids,
                                                                                     quad_input_ids,
                                                                                     cell_metadata,
                                                                                     md_pct_ranks,
                                                                                     doc_ids,
                                                                                     dupe_ids)
    md_features = multi_rep_fc.create_features(md_groupings, multi_tokenizer)
    quad_features = multi_rep_fc.create_features(md_quad_groupings, quad_tokenizer)
    features = (md_features[0], md_features[1], quad_features[0], quad_features[1])

  # Write Features to Protobufs
  avg_samples_per_file.append(len(features[0]))
  write_tensor_arrays_to_tfrecords(features, labels, filepath)
  print(f"Finished dataset for {start_idx}-{end_idx}\n")
print('All requested datasets have been recoreded as TFRecords!')
print(f'The average samples per file was {sum(avg_samples_per_file)/len(avg_samples_per_file):.2f}')

Loading Json Files:   0%|          | 0/1356 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61152 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61152 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61152 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1356 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47429 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47429 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47429 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 0-1356



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60940 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60940 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60940 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46712 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46712 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46712 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 1356-2713



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60737 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60737 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60737 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45085 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45085 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45085 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 2713-4070



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61739 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61739 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61739 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48668 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48668 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48668 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 4070-5427



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60160 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60160 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60160 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45561 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45561 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45561 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 5427-6784



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47647 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47647 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47647 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 6784-8141



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62649 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62649 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62649 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47882 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47882 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47882 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 8141-9498



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63912 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63912 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63912 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49671 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49671 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49671 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 9498-10855



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62172 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62172 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62172 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47783 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47783 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47783 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 10855-12212



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61432 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61432 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61432 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46283 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46283 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46283 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 12212-13569



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60939 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60939 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60939 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46194 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46194 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46194 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 13569-14926



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62309 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62309 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62309 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48941 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48941 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48941 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 14926-16283



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61795 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61795 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61795 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47305 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47305 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47305 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 16283-17640



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61960 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61960 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61960 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48156 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48156 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48156 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 17640-18997



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51664 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51664 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/51664 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 18997-20354



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62420 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62420 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62420 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48648 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48648 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48648 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 20354-21711



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61232 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61232 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61232 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46721 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46721 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46721 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 21711-23068



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62369 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62369 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62369 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48509 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48509 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48509 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 23068-24425



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61939 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61939 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61939 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48055 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48055 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48055 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 24425-25782



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/59716 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59716 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59716 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46526 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46526 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46526 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 25782-27139



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60622 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60622 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60622 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46387 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46387 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46387 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 27139-28496



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60949 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60949 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60949 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46334 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46334 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46334 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 28496-29853



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61659 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61659 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61659 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47822 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47822 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47822 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 29853-31210



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61818 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61818 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61818 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47622 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47622 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47622 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 31210-32567



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62744 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62744 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62744 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49731 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49731 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49731 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 32567-33924



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62859 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62859 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62859 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49637 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49637 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49637 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 33924-35281



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61327 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61327 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61327 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45896 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45896 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45896 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 35281-36638



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61405 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61405 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61405 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48289 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48289 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48289 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 36638-37995



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63184 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63184 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63184 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50225 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50225 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50225 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 37995-39352



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61519 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61519 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61519 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47077 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47077 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47077 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 39352-40709



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/65070 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/65070 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/65070 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51237 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51237 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/51237 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 40709-42066



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63443 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63443 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63443 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48554 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48554 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48554 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 42066-43423



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63682 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63682 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63682 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/52106 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/52106 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/52106 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 43423-44780



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63141 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63141 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63141 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51096 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/51096 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/51096 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 44780-46137



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61487 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61487 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61487 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47267 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47267 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47267 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 46137-47494



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60959 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60959 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60959 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47119 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47119 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47119 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 47494-48851



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61720 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61720 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61720 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47244 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47244 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47244 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 48851-50208



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/65061 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/65061 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/65061 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50984 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50984 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50984 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 50208-51565



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/64367 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64367 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64367 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50620 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50620 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50620 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 51565-52922



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60702 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60702 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60702 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46660 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46660 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46660 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 52922-54279



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60889 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60889 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60889 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46162 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46162 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46162 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 54279-55636



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60918 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60918 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60918 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47198 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47198 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47198 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 55636-56993



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61308 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61308 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61308 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45573 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45573 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45573 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 56993-58350



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62547 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62547 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62547 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48977 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48977 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48977 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 58350-59707



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61310 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61310 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61310 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47255 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47255 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47255 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 59707-61064



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62616 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62616 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62616 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49088 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49088 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49088 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 61064-62421



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61933 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61933 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61933 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49575 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49575 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49575 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 62421-63778



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61663 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61663 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61663 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48987 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48987 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48987 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 63778-65135



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60443 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60443 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60443 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46982 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46982 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46982 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 65135-66492



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60161 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60161 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60161 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47031 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47031 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47031 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 66492-67849



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60765 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60765 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60765 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45776 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45776 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45776 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 67849-69206



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62452 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62452 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62452 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46546 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46546 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46546 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 69206-70563



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61334 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61334 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61334 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49442 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49442 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49442 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 70563-71920



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62346 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62346 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62346 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48420 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48420 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48420 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 71920-73277



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61566 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61566 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61566 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47714 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47714 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47714 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 73277-74634



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62858 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62858 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62858 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48267 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48267 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48267 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 74634-75991



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61366 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61366 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61366 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47715 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47715 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47715 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 75991-77348



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63618 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50036 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50036 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50036 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 77348-78705



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61691 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61691 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61691 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46926 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46926 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46926 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 78705-80062



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/58731 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58731 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58731 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/43637 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/43637 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/43637 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 80062-81419



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/58918 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58918 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58918 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45766 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45766 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45766 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 81419-82776



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63116 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63116 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63116 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50509 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50509 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50509 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 82776-84133



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61119 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61119 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61119 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46514 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46514 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46514 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 84133-85490



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62381 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62381 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62381 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47281 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47281 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47281 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 85490-86847



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63149 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63149 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63149 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49131 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49131 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49131 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 86847-88204



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/58873 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58873 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/58873 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/44658 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/44658 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/44658 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 88204-89561



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61572 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61572 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61572 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47993 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47993 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47993 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 89561-90918



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61209 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61209 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61209 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47375 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47375 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47375 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 90918-92275



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61384 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61384 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61384 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49353 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49353 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49353 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 92275-93632



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/64211 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64211 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64211 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50432 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50432 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50432 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 93632-94989



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63741 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63741 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63741 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49661 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49661 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49661 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 94989-96346



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63828 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63828 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63828 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50446 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50446 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50446 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 96346-97703



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60723 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60723 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60723 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47865 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47865 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47865 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 97703-99060



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62516 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62516 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62516 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49304 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49304 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49304 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 99060-100417



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61959 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61959 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61959 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48032 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48032 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48032 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 100417-101774



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61955 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61955 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61955 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49265 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/49265 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/49265 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 101774-103131



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60940 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60940 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60940 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47349 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47349 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47349 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 103131-104488



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61018 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61018 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61018 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47572 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47572 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47572 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 104488-105845



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63412 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63412 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63412 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48735 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48735 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48735 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 105845-107202



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60614 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60614 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60614 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46659 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46659 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46659 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 107202-108559



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/59761 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59761 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59761 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46440 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46440 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46440 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 108559-109916



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63698 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63698 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63698 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48959 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48959 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48959 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 109916-111273



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/62249 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62249 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/62249 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46810 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46810 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46810 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 111273-112630



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/59685 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59685 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59685 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/44597 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/44597 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/44597 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 112630-113987



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63898 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63898 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63898 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50180 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50180 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50180 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 113987-115344



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61761 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61761 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61761 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47927 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47927 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47927 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 115344-116701



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/64887 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64887 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/64887 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50855 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50855 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50855 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 116701-118058



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60576 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60576 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60576 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46193 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46193 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46193 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 118058-119415



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61552 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61552 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61552 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45994 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45994 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45994 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 119415-120772



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61771 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61771 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61771 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47901 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47901 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47901 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 120772-122129



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60868 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60868 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60868 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47314 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/47314 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/47314 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 122129-123486



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/60392 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60392 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/60392 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46413 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/46413 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/46413 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 123486-124843



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63633 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63633 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63633 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50677 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/50677 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/50677 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 124843-126200



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/63123 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63123 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/63123 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48205 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48205 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48205 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 126200-127557



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/59734 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59734 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/59734 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45422 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/45422 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/45422 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 127557-128914



Loading Json Files:   0%|          | 0/1357 [00:00<?, ?it/s]

Preprocessing Text:   0%|          | 0/61353 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61353 [00:00<?, ?it/s]

Encoding Text for Input Ids:   0%|          | 0/61353 [00:00<?, ?it/s]

Collecting Markdown Code Groupings:   0%|          | 0/1357 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48502 [00:00<?, ?it/s]

Assembling Input Ids and Attention Masks:   0%|          | 0/48502 [00:00<?, ?it/s]

Writing TFRecords to Disk:   0%|          | 0/48502 [00:00<?, ?it/s]


All examples have been written as tfrecords to disk
Finished dataset for 128914-130271

All requested datasets have been recoreded as TFRecords!
The average samples per file was 47947.32


In [None]:
# Save Test IDS for Post Processing
# Collects the file stems of the data paths belonging to the final dataset
# splits and writes them to a YAML file on Drive under the key 'ids'.
test_ids_path = '/content/drive/MyDrive/Colab Notebooks/ML Projects/Google AI4Code/data/test_ids.yaml'

# Only the first and last of these 17 entries are used below, as the start and
# stop of a slice into data_paths.
# NOTE(review): the slice stop is exclusive, so data_paths[test_id_idxs[-1]]
# itself is not included -- confirm this matches how the splits were built.
test_id_idxs = all_dataset_splits[-17:]

test_ids = {}
test_ids['ids'] = [test_id_idx.stem for test_id_idx in data_paths[test_id_idxs[0]:test_id_idxs[-1]]]

# Open with 'w' rather than 'r+': 'r+' fails with FileNotFoundError when the
# file does not exist yet, and it does not truncate, so writing a shorter id
# list over an older, longer file would leave stale trailing content behind.
try:
  with open(test_ids_path, 'w') as stream:
    yaml.dump(test_ids, stream, default_flow_style=False)
except yaml.YAMLError as error:
  print(error)
else:
  # Report success only when the dump did not raise a YAML error.
  print('Succesfully dumped test ids to yaml file')