# NLP Experiments

In [1]:
import io
import os

import awswrangler as wr
import boto3
import sagemaker
import sagemaker.huggingface
import torch
from dotenv import load_dotenv
from sklearn.model_selection import train_test_split
from transformers import AutoModelForSequenceClassification, AutoTokenizer, pipeline

## Setup Session Configuration

In [2]:
# Load variables from a .env file so the os.getenv lookups below can see them.
load_dotenv()

bucket = os.getenv("DEFAULT_BUCKET")
prefix = os.getenv("S3_PREFIX")
path_prefix = f"s3://{bucket}/{prefix}"

# One boto3 session, reused by the SageMaker session and the AWS calls further down.
boto_session = boto3.Session(
    profile_name=os.getenv("AWS_PROFILE"),
    region_name=os.getenv("AWS_DEFAULT_REGION"),
)
session = sagemaker.Session(boto_session=boto_session, default_bucket=bucket)

# Stays None when SAGEMAKER_ARN_ROLE is not set.
role = os.getenv("SAGEMAKER_ARN_ROLE")

## Fetch Data from Athena

In [3]:
# Read label/description pairs from Athena, leaving out rows labelled 'Other'.
df = wr.athena.read_sql_query(
    sql="""
SELECT 
    rule_based_label as label
    , description as source
FROM finances.silver_labelled
WHERE rule_based_label not in ('Other')
""",
    database="finances",
    boto3_session=boto_session,
)

## Create Train / Test / Validate Data Split

In [4]:
# Fixed seed so the split membership is identical on every run; without it
# each re-run reshuffles the rows and exports different files.
SPLIT_SEED = 42

# 80% train; the remaining 20% is halved into test and validate.
df_train, df_test_validate = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
df_test, df_validate = train_test_split(df_test_validate, test_size=0.5, random_state=SPLIT_SEED)

# (split name, frame) pairs consumed by the export step.
export = [("train", df_train), ("test", df_test), ("validate", df_validate)]

df_train.shape, df_test.shape, df_validate.shape

((1558, 2), (195, 2), (195, 2))

In [5]:
# Peek at the first three training rows.
df_train.head(3)

Unnamed: 0,label,source
609,Groceries,WOOLWORTHS MAYFIELD
1063,Groceries,BWS LIQUOR MAYFIELD
1310,Cafe,SQ *KAROO & CO Wahroonga


## Export Data to S3 ready for Training

In [6]:
def df_s3_json_lines(df, path, boto3_session):
    """Upload a DataFrame to S3 as a JSON Lines object.

    The frame is serialised with ``orient="records", lines=True`` into an
    in-memory buffer, so nothing is written to the working directory and
    there is no temporary file to clean up if the upload fails.

    Args:
        df: pandas DataFrame to export.
        path: Destination S3 URI, e.g. ``s3://bucket/prefix/train.jsonlines``.
        boto3_session: boto3 session passed to ``wr.s3.upload``.

    Returns:
        ``path``, unchanged, so callers can collect the uploaded locations.
    """
    payload = df.to_json(orient="records", lines=True).encode("utf-8")
    # Use the session supplied by the caller rather than a notebook-level global.
    wr.s3.upload(local_file=io.BytesIO(payload), path=path, boto3_session=boto3_session)
    return path


# Upload each split; the list of returned S3 URIs is the cell's output.
[
    df_s3_json_lines(split_df, f"{path_prefix}/{split_name}.jsonlines", boto_session)
    for split_name, split_df in export
]

['s3://play-projects-joshpeak/nlp-play/sagemaker/train.jsonlines',
 's3://play-projects-joshpeak/nlp-play/sagemaker/test.jsonlines',
 's3://play-projects-joshpeak/nlp-play/sagemaker/validate.jsonlines']

In [7]:
# Check whether CUDA is available to PyTorch in this kernel.
torch.cuda.is_available()

False

# Train Model

In [10]:
# NOTE(review): this checkpoint name indicates an SST-2 sentiment model, and the
# recorded output below shows a 'NEGATIVE' label rather than one of the dataset's
# `label` categories (e.g. Groceries, Cafe) -- confirm this is only a smoke test.
model_name = "distilbert-base-uncased-finetuned-sst-2-english"
model = AutoModelForSequenceClassification.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)

# Test-set descriptions, plus an iterator so a later cell can step through them one at a time.
docs = df_test["source"].tolist()
docs_iter = iter(docs)


In [22]:
# Take the next description from the iterator. Each run of this cell advances
# docs_iter, and next() raises StopIteration once it is exhausted.
doc = next(docs_iter)
print(doc)
# Run the text-classification pipeline on that single description.
result = classifier(doc)
print(result)

Scott -Dibben Chemist Kotara
[{'label': 'NEGATIVE', 'score': 0.972266435623169}]
