# Amazon Fine Foods in-task pre-training dataset creation

In-task pre-training can help boost downstream fine-tuning performance

In [None]:
import glob
import os
import random

import bert
import nltk
import pandas as pd
import tensorflow as tf
import tensorflow_hub as hub
from bert import tokenization, optimization, create_pretraining_data
from sklearn.model_selection import train_test_split

nltk.download('punkt')
# Grab TensorFlow's logger and stop its records from propagating to ancestor
# loggers (presumably to avoid duplicated log lines in the notebook output).
logger = tf.get_logger()
logger.propagate = False

### Dataset settings 
In-task pre-training will reuse the Wikipedia pre-trained weights.

In [None]:
# TF-Hub handle of the BERT module; its tokenization info (vocab file, casing)
# is read from this module when the tokenizer is built.
bert_model_hub = "https://tfhub.dev/google/small_bert/bert_uncased_L-4_H-256_A-4/1"
max_seq_len = 256  # The pre-training task will typically contain 2 segments of text, i.e. 2 * 128 tokens
max_predictions_per_seq = 40  # The MLM task will have at most this many masked tokens per example
# Root folder for all generated files; one sub-folder is created per split.
destination_folder = "datasets/in_task_pretraining/"

# Load the raw dataset
The train/dev/test splits were previously created in the create_finetuning_dataset notebook. The same splits are used here for consistency.

In [None]:
# Load the review text of each pre-made split (`pd` is pandas).
# Only the "Text" column is used for pre-training, so only that column is
# parsed instead of materialising the whole CSV in memory.
train_text = pd.read_csv("datasets/Reviews_train.csv", usecols=["Text"])["Text"]
dev_text = pd.read_csv("datasets/Reviews_dev.csv", usecols=["Text"])["Text"]
test_text = pd.read_csv("datasets/Reviews_test.csv", usecols=["Text"])["Text"]

# Sanity check: report how many reviews each split contains.
print(f"Train dataset length is {len(train_text)} samples")
print(f"Dev dataset length is {len(dev_text)} samples")
print(f"Test dataset length is {len(test_text)} samples")

#  Data Preprocessing
BERT pre-training data should contain one sentence per line, with a blank line between different documents (i.e. reviews). This section converts the raw CSV data into that format.

In [None]:
def write_raw_text_files(text, destination_folder, split, reviews_per_file=50000):
    """Write reviews as plain-text shards in the BERT pre-training input format.

    Each review is split into sentences with NLTK; every sentence goes on its
    own line and a blank line separates consecutive reviews. Shards are written
    to ``<destination_folder>/<split>/file_<k>``, where ``k`` starts at 0.

    Args:
        text: Iterable of review strings.
        destination_folder: Root output folder.
        split: Name of the split; used as the sub-folder name.
        reviews_per_file: Maximum number of reviews stored in a single shard.

    Returns:
        None. If ``text`` is empty, the output folder is created but no shard
        is written.
    """
    output_dir = os.path.join(destination_folder, split)
    os.makedirs(output_dir, exist_ok=True)

    output_file = None
    try:
        # Loop through all the reviews
        for i, review in enumerate(text):

            # Start a new shard every `reviews_per_file` reviews
            if i % reviews_per_file == 0:
                if output_file is not None:
                    output_file.close()
                shard_path = os.path.join(output_dir, f"file_{i // reviews_per_file}")
                # Explicit UTF-8 so the result does not depend on the platform's
                # default encoding.
                output_file = open(shard_path, "w", encoding="utf-8")

            # Write each sentence in the review on a separate line
            for sentence in nltk.tokenize.sent_tokenize(review):
                print(sentence, file=output_file)

            # Use a blank line between reviews
            print(file=output_file)
    finally:
        # Close the last shard even if tokenization or writing failed.
        # `output_file` is still None when `text` was empty.
        if output_file is not None:
            output_file.close()

# Export the train and dev reviews as raw text shards, one sub-folder per split.
for split_name, split_reviews in (("train", train_text), ("dev", dev_text)):
    write_raw_text_files(split_reviews, destination_folder, split_name)

# Get the tokenizer

In [None]:
# Build the tokenizer from the information shipped with the TF-Hub module, so
# that the vocabulary and casing match the checkpoint referenced by
# `bert_model_hub`. A throwaway graph is used so nothing leaks into the default graph.
with tf.Graph().as_default():
    bert_module = hub.Module(bert_model_hub)
    # The "tokenization_info" signature exposes the vocab file and the casing flag.
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
        # Evaluate both entries to get concrete values usable outside the session.
        vocab_file, do_lower_case = sess.run([tokenization_info["vocab_file"], tokenization_info["do_lower_case"]])      
    tokenizer = bert.tokenization.FullTokenizer(vocab_file=vocab_file, do_lower_case=do_lower_case)
    
# Example of tokenization
print("\nEXAMPLE:")
# A short sample in the one-sentence-per-line layout, tokenized below for
# visual inspection.
example=\
"""
I bought this after doing some research.
This food is very good for your cat.
My cats coat is all smooth now. <\ br> He is more active.
This is what cats we designed to eat.
"""
print(tokenizer.tokenize(example))

## Create pretraining data
The bert package contains methods for creating pre-training data. The procedure used here is similar to https://github.com/google-research/bert/blob/master/create_pretraining_data.py, except that we use the checkpoint tokenizer information, rather than creating a new dictionary.
Note that this step is quite slow and expensive.

In [None]:
def create_pretraining_split(split):
    """Create the pre-training TFRecord file for one dataset split.

    Reads the raw text shards ``<destination_folder>/<split>/file*``, builds
    pre-training instances with the module-level ``tokenizer``, and writes them
    to ``<destination_folder>/<split>/mlm_max_seq_len_<max_seq_len>.tfrecord``.

    Args:
        split: Name of the split sub-folder, e.g. ``"train"`` or ``"dev"``.

    Raises:
        FileNotFoundError: If the split folder contains no raw text shards.
    """
    split_dir = os.path.join(destination_folder, split)
    input_pattern = os.path.join(split_dir, "file*")

    # glob returns paths in arbitrary order; sort them so the seeded RNG below
    # sees the documents in the same order on every run.
    input_files = sorted(glob.glob(input_pattern))
    if not input_files:
        # Fail loudly instead of silently writing an empty TFRecord.
        raise FileNotFoundError(f"No input files match {input_pattern!r}")

    output_file = os.path.join(split_dir, f"mlm_max_seq_len_{max_seq_len}.tfrecord")
    print("Using input files: ", input_files)
    print("Using output file: ", output_file)

    # Fixed seed so the generated instances are reproducible.
    rng = random.Random(0)
    instances = create_pretraining_data.create_training_instances(
        input_files, tokenizer, max_seq_len, dupe_factor=1,
        short_seq_prob=0.02, masked_lm_prob=0.15,
        max_predictions_per_seq=max_predictions_per_seq, rng=rng)
    print("Done creating instances")

    create_pretraining_data.write_instance_to_example_files(
        instances, tokenizer, max_seq_len, max_predictions_per_seq, [output_file])
    print("Done writting examples to file.")

# Build the pre-training records, dev split first and then train.
for split_name in ("dev", "train"):
    create_pretraining_split(split_name)