In [11]:
!pip install transformers datasets evaluate -q

import pandas as pd
import torch
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    AutoConfig,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding
)
import evaluate

In [18]:
#  Load Data
df_train = pd.read_csv("/content/twitter_training.csv")
df_valtest = pd.read_csv("/content/twitter_validation.csv")

df_train.columns = ['id', 'entity', 'sentiment', 'text']
df_valtest.columns = ['id', 'entity', 'sentiment', 'text']

In [19]:
df_train['sentiment']

Unnamed: 0,sentiment
0,Positive
1,Positive
2,Positive
3,Positive
4,Positive
...,...
40574,Irrelevant
40575,Irrelevant
40576,Irrelevant
40577,Neutral


In [20]:
#  Encode Labels
labels =   sorted(df_train['sentiment'].unique())
label2id = {label: idx for idx, label in enumerate(labels)}
id2label = {idx: label for label, idx in label2id.items()}
num_labels = len(labels)

df_train['label'] = df_train['sentiment'].map(label2id)
df_valtest['label'] = df_valtest['sentiment'].map(label2id)

In [21]:
#  Convert to HuggingFace Dataset
train_dataset = Dataset.from_pandas(df_train)
val_dataset = Dataset.from_pandas(df_valtest)

In [22]:
# Tokenization
model_ckpt = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
def preprocess(batch):
    texts = [str(t) for t in batch["text"]]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=128)


train_dataset = train_dataset.map(preprocess, batched=True)
val_dataset = val_dataset.map(preprocess, batched=True)

Map:   0%|          | 0/40579 [00:00<?, ? examples/s]

Map:   0%|          | 0/999 [00:00<?, ? examples/s]

In [23]:
#  Model Configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

config = AutoConfig.from_pretrained(
    model_ckpt,
    label2id=label2id,
    id2label=id2label,
    num_labels=num_labels
)

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, config=config).to(device)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
#  Evaluation Metric
accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = torch.argmax(torch.tensor(logits), dim=-1)
    return accuracy.compute(predictions=preds, references=labels)

Downloading builder script: 0.00B [00:00, ?B/s]

In [25]:
# Training Arguments
args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    report_to="none"
)

In [26]:
#  Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [27]:
import os
os.environ["WANDB_DISABLED"] = "true"

In [28]:
#  Train Model
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.6169,0.81081,0.784785
2,0.087,1.333136,0.793794
3,0.0732,1.498443,0.785786


TrainOutput(global_step=15219, training_loss=0.37531170275421416, metrics={'train_runtime': 2964.0858, 'train_samples_per_second': 41.071, 'train_steps_per_second': 5.134, 'total_flos': 8007731430386688.0, 'train_loss': 0.37531170275421416, 'epoch': 3.0})

In [29]:
#  Evaluate
trainer.evaluate()

{'eval_loss': 0.8108097910881042,
 'eval_accuracy': 0.7847847847847848,
 'eval_runtime': 7.6745,
 'eval_samples_per_second': 130.171,
 'eval_steps_per_second': 16.288,
 'epoch': 3.0}

In [30]:
text = "I am so happy today"

def get_pred(text):
    input_encoded = tokenizer(text, return_tensors='pt').to(device)
    with torch.no_grad():
        outputs = model(**input_encoded)

    logits = outputs.logits
    pred = torch.argmax(logits,dim = 1).item()
    return id2label[pred]
get_pred(text)

'Positive'