In [4]:
import awswrangler as wr
import pandas as pd
import evaluate
import numpy as np

from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

In [5]:
pd.options.mode.copy_on_write = True

In [6]:
df = wr.s3.read_parquet(path="s3://amazon-reviews-eafit/sample/")

In [7]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}
df["labels"] = df["sentiment"].map(label2id)

In [8]:
df_sentiment = df[["review_body", "sentiment", "labels"]]

In [9]:
df_temp, df_test = train_test_split(df_sentiment, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_temp, test_size=0.5, random_state=42)

In [10]:
dataset_test = Dataset.from_pandas(df_test)
dataset_val = Dataset.from_pandas(df_val)
dataset_train = Dataset.from_pandas(df_train)

dataset_dict = DatasetDict(
    {"train": dataset_train, "validation": dataset_val, "test": dataset_test}
)

In [11]:
model = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [12]:
def preprocess_function(examples):
    return tokenizer(examples["review_body"], truncation=True)

In [13]:
tokenized_data_train = dataset_dict["train"].map(preprocess_function, batched=True)
tokenized_data_validation = dataset_dict["validation"].map(
    preprocess_function, batched=True
)
tokenized_data_test = dataset_dict["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/399693 [00:00<?, ? examples/s]

Map:   0%|          | 0/399694 [00:00<?, ? examples/s]

Map:   0%|          | 0/199847 [00:00<?, ? examples/s]

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [15]:
accuracy = evaluate.load("accuracy")

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [16]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [17]:
model = AutoModelForSequenceClassification.from_pretrained(
    model, num_labels=3, id2label=id2label, label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
small_train_dataset = tokenized_data_train.select(range(4000))
small_eval_dataset = tokenized_data_validation.select(range(4000))

In [19]:
training_args = TrainingArguments(
    output_dir="../data/prediction_sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

  0%|          | 0/630 [00:00<?, ?it/s]

RuntimeError: MPS backend out of memory (MPS allocated: 17.20 GB, other allocations: 866.67 MB, max allowed: 18.13 GB). Tried to allocate 96.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).

In [None]:
df_results = pd.DataFrame(trainer.state.log_history)

In [None]:
df_results.to_parquet("../data/prediction_sentiment_results/metrics.parquet")

# Test


In [None]:
small_test_dataset = tokenized_data_test.select(range(50))

In [None]:
predictions_test = trainer.predict(small_test_dataset)

[[-0.25960073 -0.20126009  0.35329914]
 [-0.20641065 -0.27886903  0.2990285 ]
 [-0.22442305 -0.23175992  0.41484722]
 [-0.2592337  -0.28604758  0.37312222]
 [-0.14394376 -0.16613027  0.20483293]
 [-0.28855333 -0.26329023  0.3183465 ]
 [-0.24456218 -0.21102923  0.33713728]
 [-0.31210604 -0.23386143  0.3391418 ]
 [-0.21083833 -0.2646261   0.34310266]
 [-0.2828176  -0.22596319  0.2903264 ]
 [-0.23851496 -0.2348367   0.33882952]
 [-0.18507767 -0.2209858   0.3215231 ]
 [-0.26124606 -0.34718725  0.3388014 ]
 [-0.15659906 -0.14399435  0.24770403]
 [-0.26132092 -0.29749846  0.31240577]
 [-0.23109068 -0.22583587  0.29862145]
 [-0.23882306 -0.21832752  0.2835646 ]
 [-0.2242401  -0.19587764  0.38308346]
 [-0.28589264 -0.26042828  0.38531306]
 [-0.21789905 -0.2553931   0.348731  ]
 [-0.2943241  -0.25461313  0.35278887]
 [-0.19543068 -0.23868348  0.26050434]
 [-0.2233388  -0.32181022  0.34461638]
 [-0.23570442 -0.23283076  0.37417972]
 [-0.33094805 -0.2869413   0.35011443]
 [-0.20142892 -0.15367495

In [None]:
df_result_test = pd.DataFrame(predictions_test.metrics)

df_result_test.to_parquet("../data/prediction_sentiment_results/metrics_test.parquet")

{'test_loss': 0.8799773454666138,
 'test_accuracy': 0.78,
 'test_runtime': 0.8222,
 'test_samples_per_second': 60.814,
 'test_steps_per_second': 1.216}