# Sentiment Analysis with finetuned distilBERT 
Following the [HuggingFace tutorial](https://huggingface.co/docs/transformers/en/tasks/sequence_classification) on text classification using [Goodreads data](https://www.kaggle.com/competitions/goodreads-books-reviews-290312)

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
import pandas as pd
import numpy as np

from transformers import AutoTokenizer
from transformers import DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer

from datasets import load_dataset

import evaluate

from sklearn.model_selection import train_test_split


## Goodreads dataset

In [6]:
df = pd.read_csv("data/goodreads_train.csv")
df.head()

Unnamed: 0,user_id,book_id,review_id,rating,review_text,date_added,date_updated,read_at,started_at,n_votes,n_comments
0,8842281e1d1347389f2ab93d60773d4d,18245960,dfdbb7b0eb5a7e4c26d59a937e2e5feb,5,This is a special book. It started slow for ab...,Sun Jul 30 07:44:10 -0700 2017,Wed Aug 30 00:00:26 -0700 2017,Sat Aug 26 12:05:52 -0700 2017,Tue Aug 15 13:23:18 -0700 2017,28,1
1,8842281e1d1347389f2ab93d60773d4d,16981,a5d2c3628987712d0e05c4f90798eb67,3,Recommended by Don Katz. Avail for free in Dec...,Mon Dec 05 10:46:44 -0800 2016,Wed Mar 22 11:37:04 -0700 2017,,,1,0
2,8842281e1d1347389f2ab93d60773d4d,28684704,2ede853b14dc4583f96cf5d120af636f,3,"A fun, fast paced science fiction thriller. I ...",Tue Nov 15 11:29:22 -0800 2016,Mon Mar 20 23:40:27 -0700 2017,Sat Mar 18 23:22:42 -0700 2017,Fri Mar 17 23:45:40 -0700 2017,22,0
3,8842281e1d1347389f2ab93d60773d4d,27161156,ced5675e55cd9d38a524743f5c40996e,0,Recommended reading to understand what is goin...,Wed Nov 09 17:37:04 -0800 2016,Wed Nov 09 17:38:20 -0800 2016,,,5,1
4,8842281e1d1347389f2ab93d60773d4d,25884323,332732725863131279a8e345b63ac33e,4,"I really enjoyed this book, and there is a lot...",Mon Apr 25 09:31:23 -0700 2016,Mon Apr 25 09:31:23 -0700 2016,Sun Jun 26 00:00:00 -0700 2016,Sat May 28 00:00:00 -0700 2016,9,1


In [7]:
df.describe()

Unnamed: 0,book_id,rating,n_votes,n_comments
count,900000.0,900000.0,900000.0,900000.0
mean,13441450.0,3.689639,3.294381,1.016656
std,9357863.0,1.252583,17.873553,5.963821
min,1.0,0.0,-3.0,-1.0
25%,6340471.0,3.0,0.0,0.0
50%,13442030.0,4.0,0.0,0.0
75%,20578970.0,5.0,2.0,0.0
max,36328680.0,5.0,3222.0,1335.0


In [12]:
# transform the rating column to a label column

def rating_to_sent(rating):
    # rating 3 is assigned positive because sampling a few reviews, 
    # I personally would call most of them more positive than negative. This could be changed
    if rating > 3:
        return 1
    elif rating == 3:
        return 1
    elif rating < 3:
        return 0

df['label'] = df['rating'].apply(rating_to_sent)

In [25]:
# rename column
df.rename(columns={'review_text':'text'}, inplace=True)

#trim dataframe to just text and label
hf_df = df[['text', 'label']]

#save as csv for later and read that csv into the goodreads dataset we'll use
hf_df.to_csv("data/goodreads_for_hf.csv")
#gr_df = pd.read_csv("data/goodreads_for_hf.csv")

gr_ds = load_dataset("csv", data_files="data/goodreads_for_hf.csv")
type(gr_ds)

Generating train split: 0 examples [00:00, ? examples/s]

datasets.dataset_dict.DatasetDict

## Try w IMDB

In [49]:
from datasets import load_dataset

imdb = load_dataset("imdb")

Downloading readme:   0%|          | 0.00/7.81k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/20.5M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/25000 [00:00<?, ? examples/s]

Generating unsupervised split:   0%|          | 0/50000 [00:00<?, ? examples/s]

## Preprocessing

In [27]:
#tokenize w BERT's tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")


In [51]:
#truncate input to the maximum length of BERT's input length

def preprocess_function(examples):
    return tokenizer(examples['text'], truncation = True)

#tokenized_gr = gr_ds.map(preprocess_function, batched = True)
#type(tokenized_gr)

In [55]:
tokenized_imdb = imdb.map(preprocess_function, batched = True)

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [31]:
# now pad the inputs so that each batch is the same length
# do this by batches because otherwise takes too long and uses unnecessary resources 
# tbh this function itself is a total black box

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [33]:
type(data_collator)

transformers.data.data_collator.DataCollatorWithPadding

In [35]:
# measure accuracy while training 

accuracy = evaluate.load("accuracy")

In [37]:
#pass predictions and labels to `compute` to calculate the accuracy

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis = 1)
    return accuracy.compute(predictions = predictions, references = labels)

In [39]:
#map ids to labels 
id2label = {0: "NEGATIVE", 1: "POSITIVE"}
label2id = {"NEGATIVE": 0, "POSITIVE": 1}


In [41]:
# establish the model
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert/distilbert-base-uncased", num_labels=2, id2label=id2label, label2id=label2id)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [57]:
# define training arguments 

training_args = TrainingArguments(
    output_dir = "models",
    learning_rate = 2e-5,
    per_device_train_batch_size = 16,
    per_device_eval_batch_size=16,
    num_train_epochs=2,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_imdb['train'],
    eval_dataset=tokenized_imdb['test'],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
    
    

Epoch,Training Loss,Validation Loss


RuntimeError: MPS backend out of memory (MPS allocated: 12.02 GB, other allocations: 5.98 GB, max allowed: 18.13 GB). Tried to allocate 192.00 MB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).