# Amazon Fine Foods dataset creation

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd
import nltk
nltk.download('punkt')
import tensorflow as tf
import tensorflow_hub as hub
import bert
from bert import run_classifier, tokenization, optimization
# Fetch TensorFlow's logger and stop its records from being handed on to the
# handlers of ancestor loggers.
# NOTE(review): presumably done to avoid duplicated log lines in the notebook
# output - confirm.
logger = tf.get_logger()
logger.propagate = False

### Dataset settings 
The featurization depends on the `vocab.txt` of the BERT model. For now, a smaller
4-layer model is used, with a hidden size of 256 and 4 attention heads.

In [None]:
# TF-Hub handle of the BERT encoder whose vocabulary drives the featurization
# (uncased, L-4 / H-256 / A-4 as encoded in the handle).
bert_model_hub = "https://tfhub.dev/google/small_bert/bert_uncased_L-4_H-256_A-4/1"

# Relatively small sequence length; helps keep the compute cost down.
max_seq_len = 128

# Reviews award a score between 1 and 5 (inclusive).
label_list = list(range(1, 6))

### Create dataset splits
This is a relatively large dataset with ~600k examples. 

In [None]:
# Load the raw reviews; only the review body ("Text") and the star rating
# ("Score") are used from here on.
data = pd.read_csv("datasets/Reviews.csv")
all_text = data["Text"]
all_scores = data["Score"]
print(f"Total dataset length is {len(all_text)} samples")

# Absolute sizes (number of reviews) of the held-out splits.
n_test = 20000
n_dev  = 40000

# Fixed seed: without it every re-run of this cell draws a different partition,
# which would silently disagree with split files / TFRecords already on disk.
split_seed = 0

# First carve the test set out of the full data, then the dev set out of
# what remains; the rest is the training set.
train_text, test_text, train_scores, test_scores = train_test_split(
    all_text, all_scores, test_size=n_test, shuffle=True, random_state=split_seed)
train_text, dev_text, train_scores, dev_scores = train_test_split(
    train_text, train_scores, test_size=n_dev, shuffle=True, random_state=split_seed)
print(f"Train dataset has length {len(train_text)} samples")
print(f"Dev dataset has length    {len(dev_text)} samples")
print(f"Test dataset has length   {len(test_text)} samples")

# Save dataset splits to disk
This ensures that the same dataset split is used when creating both the in-task fine-tuning data and the in-task pretraining data.

In [None]:
def save_split(text, scores, split_name):
    """Write one dataset split to ``datasets/Reviews_<split_name>.csv``.

    ``text`` and ``scores`` are joined column-wise into a frame whose columns
    are named "Text" and "Score"; the row index is written out as well
    (``to_csv`` default).

    Args:
        text: pandas Series holding the review texts.
        scores: pandas Series holding the matching review scores.
        split_name: label of the split, used in the output file name.

    Returns:
        None.
    """
    out_path = f"datasets/Reviews_{split_name}.csv"
    frame = pd.concat([text, scores], axis=1, keys=["Text", "Score"])
    frame.to_csv(out_path)

# Persist the three splits; each one ends up in datasets/Reviews_<name>.csv.
save_split(text=train_text, scores=train_scores, split_name="train")
save_split(text=test_text, scores=test_scores, split_name="test")
save_split(text=dev_text, scores=dev_scores, split_name="dev")

# Load dataset splits from disk
If splits have already been saved to disk, they can be loaded here instead of created above


In [None]:
def load_split(split_name):
    """Read one saved split back from ``datasets/Reviews_<split_name>.csv``.

    Args:
        split_name: label of the split, as used in the file name.

    Returns:
        Tuple ``(text, scores)``: the "Text" and "Score" columns of the file
        as pandas Series.
    """
    frame = pd.read_csv(f"datasets/Reviews_{split_name}.csv")
    return frame["Text"], frame["Score"]

# Reload the saved splits in the order train, test, dev; each call yields a
# (text, scores) pair.
(train_text, train_scores), (test_text, test_scores), (dev_text, dev_scores) = map(
    load_split, ("train", "test", "dev")
)

# Histogram of score distributions

In [None]:
import numpy as np
from matplotlib import pyplot as plt

# Bin edges every 0.5 from 0.75 to 5.25, so each integer score 1..5 falls in
# the middle of its own bin.
score_bin_edges = np.linspace(0.5, 5, 10) + 0.25

plt.title("Histogram of scores")
train_scores.hist(bins=score_bin_edges, align="mid")
plt.xlabel("Score (i.e. stars)")
plt.ylabel("No. examples")
plt.show()
# To write the figure to disk instead:
#plt.savefig("histogram_of_scores.png", dpi=200, bbox_inches="tight")

# Get the tokenizer
The tokenizer does a few things (this is also included in the Python library):

1. Lowercase our text (if we're using a BERT lowercase model)
2. Tokenize it (i.e. "sally says hi" -> ["sally", "says", "hi"])
3. Break words into WordPieces (i.e. "calling" -> ["call", "##ing"])
4. Map our words to indexes using a vocab file that BERT provides
5. Add special "CLS" and "SEP" tokens (see the [readme](https://github.com/google-research/bert))
6. Append "index" and "segment" tokens to each input (see the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf))

To start, we'll need to load a vocabulary file and lowercasing information directly from the BERT tf hub module:

In [None]:
# Build the hub module in a throw-away graph, evaluate its "tokenization_info"
# signature to obtain the vocab file and the lowercasing flag, and construct
# the tokenizer from those two values.
with tf.Graph().as_default():
    bert_module = hub.Module(bert_model_hub)
    tokenization_info = bert_module(signature="tokenization_info", as_dict=True)
    with tf.Session() as sess:
        vocab_file, do_lower_case = sess.run(
            [tokenization_info[key] for key in ("vocab_file", "do_lower_case")]
        )
    tokenizer = bert.tokenization.FullTokenizer(
        vocab_file=vocab_file,
        do_lower_case=do_lower_case,
    )

# Example of tokenization
print("\nEXAMPLE:")
print(tokenizer.tokenize("Fine-tuning BERT on fine foods, with tricks"))

###  Data Preprocessing
This preprocessing code is adapted from the official BERT repo example: https://github.com/google-research/bert/blob/master/predicting_movie_reviews_with_bert_on_tf_hub.ipynb

We need to transform our data into a format BERT understands. This involves two steps. First, we create  `InputExample`'s using the constructor provided in the BERT library.

For the baseline we only used `text_a` as the first 128 tokens of the text we wanted to classify, but for the final model we will use roughly the first 64 and the last 64 tokens of the review (only whole sentences are kept, so the counts are approximate). The start and end tokens of reviews tend to contain the most information.

- `text_a` is the first and last 64 tokens of the text that we want to classify
- `text_b` is not used (always `None`)
- `label` is the label for our example, i.e. the review score (an integer from 1 to 5)



In [None]:
def create_examples(text, scores):
    """Turn reviews and their scores into BERT ``InputExample`` objects.

    Reviews shorter than ``max_seq_len`` wordpieces are used unchanged. Longer
    reviews are condensed to whole sentences from the start and the end:

    * head: leading sentences, added until at least ``max_seq_len // 2``
      wordpieces have been collected (the sentence crossing that mark is
      kept whole);
    * tail: trailing sentences, taken only from what the head did not use,
      for as long as they fit into what the head left of the
      ``max_seq_len - 2`` wordpieces the feature writer keeps.

    Budgeting the tail against the space that is really left (instead of a
    fixed ``max_seq_len // 2``) keeps the writer's truncation from chopping
    off the end of the review.

    Args:
        text: iterable of review strings.
        scores: iterable of labels, paired with ``text`` by position.

    Returns:
        List of ``bert.run_classifier.InputExample`` (``text_b`` is always
        ``None``; the condensed review is carried in ``text_a``).
    """
    head_target = max_seq_len // 2
    # bert's convert_single_example keeps at most max_seq_length - 2
    # wordpieces of a single-segment input ([CLS] and [SEP] take two slots).
    token_budget = max_seq_len - 2

    examples = []
    for t, s in zip(text, scores):
        # NOTE(review): the tag is removed without inserting a space, so the
        # text on either side of a line break is glued together - confirm
        # this is intended.
        t = t.replace("<br />", "")   # custom data clean-up

        if len(tokenizer.tokenize(t)) < max_seq_len:
            # The entire review fits, nothing to condense.
            text_a = t
        else:
            sentences = nltk.tokenize.sent_tokenize(t)
            # Tokenize every sentence exactly once. Sentences are joined with
            # whitespace, so the wordpiece count of a joined text is the sum
            # of the per-sentence counts.
            costs = [len(tokenizer.tokenize(sentence)) for sentence in sentences]

            # Head: keep adding leading sentences until the target is reached.
            n_head = 0
            head_tokens = 0
            while n_head < len(sentences) and head_tokens < head_target:
                head_tokens += costs[n_head]
                n_head += 1

            # Tail: trailing sentences (never ones the head already took)
            # that still fit into the remaining budget.
            tail_budget = token_budget - head_tokens
            n_tail = 0
            tail_tokens = 0
            while (n_head + n_tail < len(sentences)
                   and tail_tokens + costs[-1 - n_tail] <= tail_budget):
                tail_tokens += costs[-1 - n_tail]
                n_tail += 1

            kept = sentences[:n_head] + sentences[len(sentences) - n_tail:]
            text_a = " ".join(kept).replace("  ", " ")

        example = bert.run_classifier.InputExample(guid=None, text_a=text_a, text_b=None, label=s)
        examples.append(example)
    return examples
# Build the example lists for the train, dev and test splits (in that order).
train_examples, dev_examples, test_examples = map(
    create_examples,
    (train_text, dev_text, test_text),
    (train_scores, dev_scores, test_scores),
)

## Write datasets to disk
The fine foods dataset is relatively large, so writing it to disk may take around 20 minutes.

In [None]:
# Convert each split's examples to features and write them to disk, in the
# order training, dev, test.
writer = bert.run_classifier.file_based_convert_examples_to_features
for split_examples, output_path in (
    (train_examples, "datasets/training2"),
    (dev_examples, "datasets/dev2"),
    (test_examples, "datasets/test2"),
):
    writer(split_examples, label_list, max_seq_len, tokenizer, output_path)