# NLP Experiments

In [38]:
import os
import io

import awswrangler as wr
import boto3
import sagemaker
import sagemaker.huggingface
from sklearn.model_selection import train_test_split

from dotenv import load_dotenv

## Setup Session Configuration

In [39]:
# Load variables from a .env file (python-dotenv) before reading them below.
load_dotenv()
bucket = os.getenv("DEFAULT_BUCKET")
prefix = os.getenv("S3_PREFIX")

# Fail fast: an unset variable would otherwise be interpolated as the literal
# text "None", and every S3 path built from `path_prefix` would silently point
# at "s3://None/...".
_missing_env = [
    name
    for name, value in (("DEFAULT_BUCKET", bucket), ("S3_PREFIX", prefix))
    if not value
]
if _missing_env:
    raise RuntimeError(
        f"Missing required environment variable(s): {', '.join(_missing_env)}"
    )

# Common S3 location under which the dataset splits are written.
path_prefix = f"s3://{bucket}/{prefix}"

# Profile and region may legitimately be unset (None); they are passed through
# to boto3 unchanged.
boto_session = boto3.Session(
    profile_name=os.getenv("AWS_PROFILE"),
    region_name=os.getenv("AWS_DEFAULT_REGION"),
)
session = sagemaker.Session(boto_session=boto_session, default_bucket=bucket)

# Role ARN read from the environment; None when the variable is not set.
role = os.getenv("SAGEMAKER_ARN_ROLE")

## Fetch Data from Athena

In [41]:
# Query the labelled table in the "finances" database, renaming the columns to
# `label` / `source` and excluding rows whose rule-based label is 'Other'.
df = wr.athena.read_sql_query(
    sql="""
SELECT 
    rule_based_label as label
    , description as source
FROM finances.silver_labelled
WHERE rule_based_label not in ('Other')
""",
    database="finances",
    boto3_session=boto_session,
)

## Create Train / Test / Validate Data Split

In [61]:
# Fixed seed so that Restart & Run All reproduces exactly the same split (and
# therefore the same exported files) on every run.
SPLIT_SEED = 42

# 80% train; the remaining 20% is halved into test and validate (10% each).
df_train, df_test_validate = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)
df_test, df_validate = train_test_split(df_test_validate, test_size=0.5, random_state=SPLIT_SEED)

# (split name, frame) pairs consumed by the export step.
export = [("train", df_train), ("test", df_test), ("validate", df_validate)]

df_train.shape, df_test.shape, df_validate.shape

((1558, 2), (195, 2), (195, 2))

## Export Data to S3 ready for Training

In [62]:
def df_s3_json_lines(df, path, boto3_session):
    """Serialise a DataFrame as JSON Lines and upload it to S3.

    Args:
        df: DataFrame to export; each row becomes one JSON object per line
            (``orient="records", lines=True``).
        path: Destination S3 URI, e.g. ``s3://bucket/key.jsonlines``.
        boto3_session: boto3 session handed to ``wr.s3.upload``.

    Returns:
        The ``path`` that was written, unchanged.
    """
    # Serialise in memory instead of through a fixed-name temp file in the
    # working directory: nothing is left behind if the upload raises, and
    # concurrent runs cannot clobber each other's file.
    payload = df.to_json(orient="records", lines=True).encode("utf-8")

    # Use the session passed by the caller rather than a notebook-level global.
    wr.s3.upload(local_file=io.BytesIO(payload), path=path, boto3_session=boto3_session)
    return path


# Upload every split; the resulting list of S3 URIs is the cell's displayed output.
[
    df_s3_json_lines(split_df, f"{path_prefix}/{split_name}.jsonlines", boto_session)
    for split_name, split_df in export
]

['s3://play-projects-joshpeak/nlp-play/sagemaker/train.jsonlines',
 's3://play-projects-joshpeak/nlp-play/sagemaker/test.jsonlines',
 's3://play-projects-joshpeak/nlp-play/sagemaker/validate.jsonlines']

# Train Model

In [None]:
# TODO:
# -