In [82]:
import awswrangler as wr
import pandas as pd
import evaluate
import numpy as np

from huggingface_hub import notebook_login

from transformers import (
    AutoTokenizer,
    DataCollatorWithPadding,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
)

from sklearn.model_selection import train_test_split
from datasets import Dataset, DatasetDict

import plotly.express as px

In [83]:
pd.options.mode.copy_on_write = True

In [84]:
df = wr.s3.read_parquet(path="s3://amazon-reviews-eafit/sample/")

In [85]:
label2id = {"negative": 0, "neutral": 1, "positive": 2}
id2label = {0: "negative", 1: "neutral", 2: "positive"}
df["labels"] = df["sentiment"].map(label2id)

In [86]:
df_sentiment = df[["review_body", "sentiment", "labels"]]

In [87]:
df_temp, df_test = train_test_split(df_sentiment, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_temp, test_size=0.5, random_state=42)

In [88]:
dataset_test = Dataset.from_pandas(df_test)
dataset_val = Dataset.from_pandas(df_val)
dataset_train = Dataset.from_pandas(df_train)

dataset_dict = DatasetDict(
    {"train": dataset_train, "validation": dataset_val, "test": dataset_test}
)

In [89]:
model = "distilbert/distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(model)


`resume_download` is deprecated and will be removed in version 1.0.0. Downloads always resume when possible. If you want to force a new download, use `force_download=True`.



In [90]:
def preprocess_function(examples):
    return tokenizer(examples["review_body"], truncation=True)

In [91]:
tokenized_data_train = dataset_dict["train"].map(preprocess_function, batched=True)
tokenized_data_validation = dataset_dict["validation"].map(
    preprocess_function, batched=True
)
tokenized_data_test = dataset_dict["test"].map(preprocess_function, batched=True)

Map:   0%|          | 0/399693 [00:00<?, ? examples/s]

Map:   0%|          | 0/399694 [00:00<?, ? examples/s]

Map:   0%|          | 0/199847 [00:00<?, ? examples/s]

In [92]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [93]:
accuracy = evaluate.load("accuracy")

In [94]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [95]:
model = AutoModelForSequenceClassification.from_pretrained(
    model, num_labels=3, id2label=id2label, label2id=label2id
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [96]:
small_train_dataset = tokenized_data_train.select(range(8000))
small_eval_dataset = tokenized_data_validation.select(range(8000))

In [97]:
training_args = TrainingArguments(
    output_dir="../data/prediction_sentiment",
    learning_rate=2e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=small_train_dataset,
    eval_dataset=small_eval_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.369744,0.863125
2,No log,0.35207,0.87725
3,No log,0.352036,0.872375
4,0.310600,0.396548,0.875875
5,0.310600,0.432727,0.87175
6,0.310600,0.466014,0.86325
7,0.310600,0.482521,0.873375
8,0.103900,0.502579,0.866375
9,0.103900,0.518307,0.866875
10,0.103900,0.520786,0.8675


TrainOutput(global_step=1250, training_loss=0.17705462493896484, metrics={'train_runtime': 38884.0161, 'train_samples_per_second': 2.057, 'train_steps_per_second': 0.032, 'total_flos': 9709669191216384.0, 'train_loss': 0.17705462493896484, 'epoch': 10.0})

In [98]:
df_results = pd.DataFrame(trainer.state.log_history)

In [111]:
df_results.head(10)

Unnamed: 0,eval_loss,eval_accuracy,eval_runtime,eval_samples_per_second,eval_steps_per_second,epoch,step,loss,grad_norm,learning_rate,train_runtime,train_samples_per_second,train_steps_per_second,total_flos,train_loss
0,0.369744,0.863125,1006.0684,7.952,0.124,1.0,125,,,,,,,,
1,0.35207,0.87725,995.2918,8.038,0.126,2.0,250,,,,,,,,
2,0.352036,0.872375,995.399,8.037,0.126,3.0,375,,,,,,,,
3,,,,,,4.0,500,0.3106,3.406967,1.2e-05,,,,,
4,0.396548,0.875875,995.1049,8.039,0.126,4.0,500,,,,,,,,
5,0.432727,0.87175,994.8247,8.042,0.126,5.0,625,,,,,,,,
6,0.466014,0.86325,995.0633,8.04,0.126,6.0,750,,,,,,,,
7,0.482521,0.873375,994.8041,8.042,0.126,7.0,875,,,,,,,,
8,,,,,,8.0,1000,0.1039,6.735658,4e-06,,,,,
9,0.502579,0.866375,994.9863,8.04,0.126,8.0,1000,,,,,,,,


In [100]:
df_results.to_parquet("../data/prediction_sentiment_results/metrics.parquet")

In [101]:
fig = px.line(df_results, x="epoch", y="eval_loss", title="Eval loss by epoch")
fig.show()

In [102]:
fig = px.line(df_results, x="epoch", y="eval_accuracy", title="Accuracy by epoch")
fig.show()

# Test


In [103]:
small_test_dataset = tokenized_data_test.select(range(4000))

In [104]:
predictions_test = trainer.predict(small_test_dataset)

In [105]:
df_result_test = pd.DataFrame([predictions_test.metrics]).T

In [106]:
df_result_test.to_parquet("../data/prediction_sentiment_results/metrics_test.parquet")

In [107]:
df_result_test.head()

Unnamed: 0,0
test_loss,0.348428
test_accuracy,0.87225
test_runtime,492.0051
test_samples_per_second,8.13
test_steps_per_second,0.128


In [109]:
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [110]:
trainer.push_to_hub("Camilovelez1/distilbert-base-uncased-amazon-reviews-fine-tuning")

CommitInfo(commit_url='https://huggingface.co/Camilovelez1/prediction_sentiment/commit/a88039d1d622e859383b2440fecaf5b5b7394ca1', commit_message='Camilovelez1/distilbert-base-uncased-amazon-reviews-fine-tuning', commit_description='', oid='a88039d1d622e859383b2440fecaf5b5b7394ca1', pr_url=None, pr_revision=None, pr_num=None)