# NLP Experiments

In [1]:
# Standard Library
import os
import io
from pathlib import Path

# Scientific
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# AWS
import awswrangler as wr
import boto3
import botocore
import sagemaker
import sagemaker.huggingface

# HuggingFace
from transformers import AutoTokenizer, pipeline, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import load_dataset, load_from_disk, Dataset, load_metric
from datasets.filesystems import S3FileSystem

# Various Third Party
from dotenv import load_dotenv

## Setup Session Configuration

In [2]:
load_dotenv()
bucket = os.getenv("DEFAULT_BUCKET")
prefix = os.getenv("S3_PREFIX")

path_prefix = f"s3://{bucket}/{prefix}"

boto_session = boto3.Session(profile_name=os.getenv("AWS_PROFILE"), region_name=os.getenv("AWS_DEFAULT_REGION"))
session = sagemaker.Session(boto_session=boto_session, default_bucket=bucket)
s3fs = S3FileSystem(session=botocore.session.Session(profile=os.getenv("AWS_PROFILE")))
role = os.getenv("SAGEMAKER_ARN_ROLE", None)

print(path_prefix)

s3://play-projects-joshpeak/nlp-play/sagemaker


## Fetch Data from Athena

In [3]:
cache_dataset = Path("./data/transactions.dataset/")

if not cache_dataset.exists():
    print("Cache miss... fetching...")
    df = wr.athena.read_sql_query(
        """
    SELECT 
        rule_based_label as label_string
        , description as text
    FROM finances.silver_labelled
    WHERE rule_based_label not in ('Other')
    """,
        "finances",
        boto3_session=boto_session,
    )
    ds = Dataset.from_pandas(df)
    ds.save_to_disk(str(cache_dataset))

transactions_dataset = load_from_disk(str(cache_dataset))
labels = list(transactions_dataset.to_pandas().label_string.unique())
label2id = {l: i for i, l in enumerate(labels)}
id2label = {i: l for i, l in enumerate(labels)}
N = len(labels)
N, label2id, id2label

Cache miss... fetching...


(13,
 {'Groceries': 0,
  'Utility': 1,
  'Takeaway/Fastfood': 2,
  'Parking': 3,
  'Fuel': 4,
  'Newspaper': 5,
  'Health/Pharmacy': 6,
  'Home/Garden/Office': 7,
  'Cafe': 8,
  'Fitness': 9,
  'Pets/Vet': 10,
  'Vehicle': 11,
  'HomeLoan': 12},
 {0: 'Groceries',
  1: 'Utility',
  2: 'Takeaway/Fastfood',
  3: 'Parking',
  4: 'Fuel',
  5: 'Newspaper',
  6: 'Health/Pharmacy',
  7: 'Home/Garden/Office',
  8: 'Cafe',
  9: 'Fitness',
  10: 'Pets/Vet',
  11: 'Vehicle',
  12: 'HomeLoan'})

## Create Train / Test / Validate Data Split

In [4]:
train_test_transactions_dataset = transactions_dataset.shuffle().train_test_split(test_size=0.1)
train_test_transactions_dataset.cleanup_cache_files()
train_test_transactions_dataset

DatasetDict({
    train: Dataset({
        features: ['label_string', 'text'],
        num_rows: 909
    })
    test: Dataset({
        features: ['label_string', 'text'],
        num_rows: 102
    })
})

## Export Data to S3 ready for Training

In [5]:
path = f"{path_prefix}/transactions"
print(path)
# train_test_transactions_dataset.save_to_disk(path, fs=s3fs)

s3://play-projects-joshpeak/nlp-play/sagemaker/transactions


In [6]:
import torch

torch.cuda.is_available()

False

# Train Model

## Load and Configure Base Model and Tokenizer

In [7]:
from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, AutoModel

model_name = "distilbert-base-uncased-finetuned-sst-2-english"
config = AutoConfig.from_pretrained(model_name, label2id=label2id, id2label=id2label)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, ignore_mismatched_sizes=True, config=config)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([2, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([2]) in the checkpoint and torch.Size([13]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


## Preprocess / Tokenize

This uses HuggingFace's Dataset API to efficiently load batches into memory using Arrow and parallelizes the batches independently.

In [8]:
def preprocessing_function(batch):
    tokenized_batch = tokenizer(batch["text"], padding=True, truncation=True)
    tokenized_batch["labels"] = [label2id[label] for label in batch["label_string"]]
    return tokenized_batch


tokenized_dataset = train_test_transactions_dataset.map(preprocessing_function, batched=True)
tokenized_dataset

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['label_string', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 909
    })
    test: Dataset({
        features: ['label_string', 'text', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 102
    })
})

## Train and Evaluate Model

In [9]:
metric = load_metric("f1")


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels, average="weighted")


training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: label_string, text. If label_string, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 909
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 285


Epoch,Training Loss,Validation Loss,F1
1,No log,1.422814,0.488144
2,No log,0.814761,0.875576
3,No log,0.513699,0.921081
4,No log,0.384919,0.930652
5,No log,0.348806,0.930652


The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: label_string, text. If label_string, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 102
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: label_string, text. If label_string, text are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 102
  Batch size = 16
The following columns in the evaluation set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: label_string, text. If label_string, text are not expected by `DistilBertForSequenceClassification.forward`,  yo

TrainOutput(global_step=285, training_loss=0.8363298382675438, metrics={'train_runtime': 79.9768, 'train_samples_per_second': 56.829, 'train_steps_per_second': 3.564, 'total_flos': 21170476177020.0, 'train_loss': 0.8363298382675438, 'epoch': 5.0})

In [10]:
classifier = pipeline(task="text-classification", model=model, tokenizer=tokenizer)

In [11]:
others_cache_dataset = Path("./data/others_transactions.dataset/")

if not others_cache_dataset.exists():
    print("Cache miss... fetching...")
    others_df = wr.athena.read_sql_query(
        """
    SELECT 
        rule_based_label as label_string
        , description as text
    FROM finances.silver_labelled
    WHERE rule_based_label in ('Other')
    """,
        "finances",
        boto3_session=boto_session,
    )
    others_ds = Dataset.from_pandas(others_df)
    others_ds.save_to_disk(str(others_cache_dataset))

Cache miss... fetching...


In [12]:
import ipywidgets as widgets

category_widget = widgets.ToggleButtons(
    options=labels,
    description="Labels:",
    disabled=False,
)


def classify(batch):
    tokenized_batch = tokenizer(batch["text"], padding=True, truncation=True)
    # tokenized_batch["tokens"] = tokenizer.tokenize(batch["text"])
    tokenized_batch["classification"] = classifier(batch["text"])
    return tokenized_batch


classified_others = others_ds.map(classify, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

Disabling tokenizer parallelism, we're using DataLoader multithreading already


In [13]:
classified_others_df = classified_others.to_pandas()
other_iter = classified_others_df.iterrows()
item = None

## Data Labelling

Use the below cell with Control+Enter.

 - Interactively loop through the Dataframe, using the model to try and predictively label the unlabelled entries.
 - Use the widget to select the correct label and run the cell
 - Running the cell grabs the value from the widget and applies the label before lodaing a new entry.


In [18]:
if item is not None:
    print("=" * 30)
    print(f"Labelled {item[1].text} as {category_widget.value}...")
    classified_others_df.at[item[0], "label_string"] = category_widget.value
    print(item)

print("=" * 30)
print("Please label this entry...\n")
item = next(other_iter)
print(item)
category_widget.value = item[1].classification["label"]  # predictively try to label entry
print(tokenizer.tokenize(item[1].text))
print("=" * 30)
category_widget

Labelled THETUMN ROOMS COOKS HILL as Cafe...
(1, label_string                                                   Cafe
text                                       THETUMN ROOMS COOKS HILL
input_ids         [101, 1996, 11667, 2078, 4734, 26929, 2940, 10...
attention_mask    [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...
classification    {'label': 'Groceries', 'score': 0.270081222057...
Name: 1, dtype: object)
Please label this entry...

(2, label_string                                                  Other
text                              Belflora Newcastle Fl Broadmeadow
input_ids         [101, 19337, 10258, 6525, 8142, 13109, 5041, 4...
attention_mask    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, ...
classification    {'label': 'Health/Pharmacy', 'score': 0.331919...
Name: 2, dtype: object)
['bel', '##fl', '##ora', 'newcastle', 'fl', 'broad', '##me', '##ado', '##w']


ToggleButtons(description='Labels:', index=6, options=('Groceries', 'Utility', 'Takeaway/Fastfood', 'Parking',…

In [19]:
classified_others_df[classified_others_df.label_string != "Other"]

Unnamed: 0,label_string,text,input_ids,attention_mask,classification
0,Home/Garden/Office,THE GAMES SHOP CHARLESTOWN,"[101, 1996, 2399, 4497, 2798, 4665, 102, 0, 0,...","[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ...","{'label': 'Groceries', 'score': 0.366621792316..."
1,Cafe,THETUMN ROOMS COOKS HILL,"[101, 1996, 11667, 2078, 4734, 26929, 2940, 10...","[1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...","{'label': 'Groceries', 'score': 0.270081222057..."


In [None]:
# TODO: Save updated labelled dataset into silver_labelled