# Sentiment Analysis with Fine-Tuned DistilBERT
This notebook follows the [Hugging Face tutorial](https://huggingface.co/docs/transformers/en/tasks/sequence_classification) on text classification, applied to [Goodreads review data](https://www.kaggle.com/competitions/goodreads-books-reviews-290312).

In [None]:
from huggingface_hub import notebook_login

# Authenticate with the Hugging Face Hub. This is needed because the
# training configuration further down sets push_to_hub=True.
notebook_login()

In [None]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

from datasets import load_dataset

import evaluate


In [None]:
# Load the Goodreads training CSV into a DataFrame and preview the first rows.
df = pd.read_csv("data/goodreads_train.csv")
df.head()

In [None]:
# Summary statistics of the loaded data (pandas describes numeric columns
# by default).
df.describe()

In [None]:
# transform the rating column to a label column

def rating_to_sent(rating):
    """Map a star rating to a binary sentiment label.

    Returns 1 (positive) for ratings of 3 or higher and 0 (negative) for
    ratings below 3. A 3-star rating is counted as positive because a
    manual sample of such reviews read as more positive than negative;
    that threshold is a judgment call and could be changed.

    A value that compares as neither (e.g. NaN) falls through both checks
    and yields None.
    """
    if rating >= 3:
        return 1
    if rating < 3:
        return 0
    return None

# Derive the binary sentiment label from each row's rating, then preview.
df['label'] = df['rating'].apply(rating_to_sent)
df.head()

In [None]:
# Rename review_text to text: the tokenization step below reads
# examples['text'].
df.rename(columns={'review_text':'text'}, inplace=True)
df.head()

In [None]:
# Keep only the two columns used downstream: the input text and its label.
hf_df = df[['text', 'label']]
hf_df.head()

In [None]:
# Write without the DataFrame index; otherwise it is saved as an extra
# unnamed column that shows up as a spurious feature when the CSV is
# loaded back as a dataset.
hf_df.to_csv("data/goodreads_for_hf.csv", index=False)

In [None]:

# Loading a single CSV file yields a DatasetDict with only a "train" split.
# Carve out a held-out "test" split so there is data to evaluate on; the
# fixed seed keeps the split reproducible across runs.
gr_dataset = load_dataset("csv", data_files="data/goodreads_for_hf.csv")
gr_dataset = gr_dataset["train"].train_test_split(test_size=0.2, seed=42)

## Preprocessing

In [None]:
# Load the tokenizer for the same DistilBERT checkpoint that the
# classification model below is initialized from.
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [None]:
# Truncate each input to the tokenizer's maximum input length

def preprocess_function(examples):
    """Tokenize the ``text`` field of ``examples`` with truncation enabled.

    Uses the module-level ``tokenizer`` and returns its output unchanged.
    """
    texts = examples['text']
    return tokenizer(texts, truncation=True)

In [None]:
# Apply the tokenization to the whole dataset; batched=True hands
# preprocess_function batches of examples instead of one at a time.
tokenized_gr = gr_dataset.map(preprocess_function, batched = True)

In [None]:
# Check what kind of object map() returned.
type(tokenized_gr)

In [None]:
# Pad at batch time rather than up front: the collator pads the examples in
# each batch to a common length, so every batch forms a rectangular tensor.
# Doing this per batch avoids padding the whole dataset to one global
# maximum length, which would take longer and waste memory.

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
# Inspect the collator object's type.
type(data_collator)

In [None]:
# Load the accuracy metric from the `evaluate` library; compute_metrics
# below uses it to score predictions during training.

accuracy = evaluate.load("accuracy")

In [None]:
#pass predictions and labels to `compute` to calculate the accuracy

def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level ``accuracy`` metric.

    ``eval_pred`` unpacks into a (predictions, labels) pair. The predicted
    class for each example is the argmax along axis 1 of the predictions
    (presumably per-class scores of shape (n_examples, n_classes); confirm
    against the caller). Returns whatever ``accuracy.compute`` returns.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Class-id <-> label-name mappings; each label's position in the tuple is
# its integer id, and label2id is the exact inverse of id2label.
id2label = dict(enumerate(("NEGATIVE", "POSITIVE")))
label2id = {name: idx for idx, name in id2label.items()}


In [None]:
# Load the pretrained DistilBERT checkpoint as a 2-label sequence
# classifier, passing the id <-> label-name mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

In [None]:
# define training arguments 

training_args = TrainingArguments(
    output_dir="models",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    # Evaluate and checkpoint once per epoch; the two strategies are kept
    # identical because load_best_model_at_end needs a checkpoint for every
    # evaluation it compares.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Upload the result to the Hugging Face Hub (requires being logged in).
    push_to_hub=True,
)

# The padding collator belongs in `data_collator`; `processing_class` is the
# slot for the tokenizer, which the Trainer saves alongside the model.
# Passing the collator as `processing_class` leaves batches unpadded and
# hands the Trainer an object that is not a tokenizer.
# NOTE: tokenized_gr must contain both a 'train' and a 'test' split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_gr['train'],
    eval_dataset=tokenized_gr['test'],
    data_collator=data_collator,
    processing_class=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()
    
    