# NLP Experiments

In [32]:
# Standard Library
import os
import io
from pathlib import Path

# Scientific
import torch
from sklearn.model_selection import train_test_split

# AWS
import awswrangler as wr
import boto3
import botocore
import sagemaker
import sagemaker.huggingface

# HuggingFace
from transformers import AutoModel, AutoModelForSequenceClassification, AutoTokenizer, pipeline
from datasets import load_dataset, load_from_disk, Dataset
from datasets.filesystems import S3FileSystem

# Various Third Party
from dotenv import load_dotenv

## Setup Session Configuration

In [33]:
# Load variables from a local .env file into the process environment before
# anything below reads them.
load_dotenv()
bucket = os.getenv("DEFAULT_BUCKET")
prefix = os.getenv("S3_PREFIX")

# Fail fast: without these two settings the f-string below would silently
# produce "s3://None/None" and the error would only surface at upload time.
missing_settings = [
    name
    for name, value in (("DEFAULT_BUCKET", bucket), ("S3_PREFIX", prefix))
    if not value
]
if missing_settings:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(missing_settings)}"
    )

# Root S3 location under which this notebook stores its artefacts.
path_prefix = f"s3://{bucket}/{prefix}"

# The profile may legitimately be unset (None), in which case boto3/botocore
# fall back to their default credential resolution.
aws_profile = os.getenv("AWS_PROFILE")

boto_session = boto3.Session(profile_name=aws_profile, region_name=os.getenv("AWS_DEFAULT_REGION"))
session = sagemaker.Session(boto_session=boto_session, default_bucket=bucket)
s3fs = S3FileSystem(session=botocore.session.Session(profile=aws_profile))
# Optional: stays None when SAGEMAKER_ARN_ROLE is not configured.
role = os.getenv("SAGEMAKER_ARN_ROLE")

print(path_prefix)

s3://play-projects-joshpeak/nlp-play/sagemaker


## Fetch Data from Athena

In [34]:
# On-disk cache of the labelled transactions; Athena is only queried when
# this directory does not exist yet.
cache_dataset = Path("./data/transactions.dataset/")

# Every labelled row except those whose rule-based label is 'Other',
# exposed as the two columns `label` and `source`.
labelled_transactions_sql = """
    SELECT 
        rule_based_label as label
        , description as source
    FROM finances.silver_labelled
    WHERE rule_based_label not in ('Other')
    """

if not cache_dataset.exists():
    print("Cache miss... fetching...")
    df = wr.athena.read_sql_query(
        sql=labelled_transactions_sql,
        database="finances",
        boto3_session=boto_session,
    )
    # Convert to a HuggingFace Dataset and persist it so later runs skip Athena.
    ds = Dataset.from_pandas(df)
    ds.save_to_disk(str(cache_dataset))

# Always read back from the cache, whether it was just written or already existed.
transactions_dataset = load_from_disk(str(cache_dataset))


## Create Train / Test Data Split

In [35]:
# Fixed seed so the shuffle and the 90/10 split produce the same rows on every
# Restart & Run All; without it each run trains and evaluates on different data.
SPLIT_SEED = 42

train_test_transactions_dataset = (
    transactions_dataset
    .shuffle(seed=SPLIT_SEED)
    .train_test_split(test_size=0.1, seed=SPLIT_SEED)
)
# Remove cache files left behind by the dataset operations above.
train_test_transactions_dataset.cleanup_cache_files()
train_test_transactions_dataset

DatasetDict({
    train: Dataset({
        features: ['label', 'source'],
        num_rows: 909
    })
    test: Dataset({
        features: ['label', 'source'],
        num_rows: 102
    })
})

## Export Data to S3 ready for Training

In [36]:
# Destination for both splits, located under the shared S3 prefix.
path = path_prefix + "/transactions"
print(path)

# Write the DatasetDict (train + test) straight to S3 through the s3fs filesystem.
train_test_transactions_dataset.save_to_disk(path, fs=s3fs)

s3://play-projects-joshpeak/nlp-play/sagemaker/transactions


Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

Flattening the indices:   0%|          | 0/1 [00:00<?, ?ba/s]

In [37]:
# Report whether PyTorch can see a CUDA device in this kernel.
# (`torch` is imported in the notebook's import cell at the top.)
torch.cuda.is_available()

False

In [38]:
# Materialise each split as a pandas DataFrame for quick inspection.
df_train, df_test = (
    train_test_transactions_dataset[split_name].to_pandas()
    for split_name in ("train", "test")
)
# Show the raw transaction descriptions of the held-out split.
df_test["source"]

0      EnergyAustralia Pty NetBank BPAY Bill Electricity
1                             PRICELINE KOTARA KOTARAWAU
2                     The Forum Univer EZYPAYID_11900826
3                                    WOOLWORTHS MAYFIELD
4                       MCDONALDS F3 NORTHBOU JILLIBYWAU
                             ...                        
97                                        COLES WALLSEND
98                      MCDONALDS F3 NORTHBOU JILLIBYWAU
99                               OFFICEWORKS NEWCASTLE W
100                                CommInsure--148724286
101                         CALTEX WYONG PETROL SO WYONG
Name: source, Length: 102, dtype: object

# Train Model

In [39]:
# AutoModel / AutoTokenizer come from the notebook's import cell at the top.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"

# NOTE(review): AutoModel loads the checkpoint without its classification head
# (the warning printed below lists the unused classifier weights). If the goal
# is text classification, AutoModelForSequenceClassification is presumably the
# intended class - confirm before training.
model = AutoModel.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Left disabled by the author; kept for reference.
# classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)


Some weights of the model checkpoint at distilbert-base-uncased-finetuned-sst-2-english were not used when initializing DistilBertModel: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [40]:
# Lazily walk the test descriptions, skipping any that mention one of these
# retailers, so the cells below can step through the remaining ones one by one.
skip_keywords = ["WOOLWORTHS", "COLES", "BUNNINGS"]
docs = (
    text
    for text in df_test.source
    if all(keyword not in text for keyword in skip_keywords)
)
docs_iter = iter(docs)

In [41]:
# Take the next description from the shared iterator. Each run of this cell
# advances it by one, and next() raises StopIteration once it is exhausted.
doc = next(docs_iter)
print(doc)
# tokenize() returns the list of sub-word token strings (see the printed output),
# so the previous `return_tensors="pt"` argument was misleading and is dropped.
result = tokenizer.tokenize(doc)
print(result)

EnergyAustralia Pty NetBank BPAY Bill Electricity
['energy', '##aus', '##tral', '##ia', 'pt', '##y', 'net', '##bank', 'bp', '##ay', 'bill', 'electricity']
