<a href="https://colab.research.google.com/github/mathguy-r/sentiment-analysis-with-transformers/blob/temp/sentiment_analysis_distilbert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setup and Initialization

In [None]:
!pip install transformers datasets torch --no-cache-dir
!pip install tensorboard --no-cache-dir


Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 15.2 MB/s 
[?25hCollecting datasets
  Downloading datasets-2.1.0-py3-none-any.whl (325 kB)
[K     |████████████████████████████████| 325 kB 51.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 65.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 56.3 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.3 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 



In [None]:
import pandas as pd
from datasets import Dataset
from transformers import DataCollatorWithPadding, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer


## Data Handling

In [None]:
def read_data(path):
    """Load a ``text;label`` file into a DataFrame.

    Each non-blank line must hold one sample in the form ``<text>;<label>``.
    The label is taken to be everything after the *last* semicolon, so texts
    that themselves contain ``;`` are read correctly.

    Args:
        path: Location of the UTF-8 encoded text file.

    Returns:
        pandas.DataFrame with two string columns, ``text`` and ``label``,
        one row per non-blank line, in file order.

    Raises:
        ValueError: If a non-blank line contains no ``;`` separator.
    """
    text_list, label_list = [], []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.rstrip('\n')
            # Tolerate blank lines (e.g. a trailing empty line at end of file).
            if not line.strip():
                continue
            # Split on the last ';' only: the label never contains one, the text may.
            text, sep, label = line.rpartition(';')
            if not sep:
                raise ValueError(
                    f"{path}:{line_no}: expected 'text;label', got {line!r}"
                )
            text_list.append(text)
            label_list.append(label)
    return pd.DataFrame({'text': text_list, 'label': label_list})



Unnamed: 0,text,label
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [None]:
# Read the three dataset splits (train, then test, then validation) into DataFrames.
train_data, test_data, val_data = (
    read_data(split_path)
    for split_path in ('/content/train.txt', '/content/test.txt', '/content/val.txt')
)


In [None]:
# Number of training samples per label.
train_data['label'].value_counts()

joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: label, dtype: int64

## Preprocessing

In [None]:
# Peek at five randomly drawn training texts (no seed is set, so the draw changes per run).
train_data['text'].sample(n=5).tolist()

['i feel like our relationship revovles around sex and when we do he wants it to be really adventurous trying new things using toys etc ansi just find it exhausting trying to keep up',
 'i guess these expectations of me being so goddamn perfect have made me feel afraid to change',
 'im fine mary anne answered feeling a little impatient',
 'i love to add just a little milk and when i m feeling especially naughty a splash of caramel and vanilla syrup but shhh',
 'i feel disappointed and want to tear up some paper and throw it across the room and write a giant letter of why things are unfair i just think of perspective']

In [None]:
from transformers import DataCollatorWithPadding, AutoTokenizer
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Emotion classes of the dataset in alphabetical order; the position in this
# list is the integer class id stored in the model config.
# NOTE(review): this order must match the integer encoding applied to the
# training labels — confirm against the preprocessing step.
EMOTION_LABELS = ["anger", "fear", "joy", "love", "sadness", "surprise"]
id2label = dict(enumerate(EMOTION_LABELS))
label2id = {name: idx for idx, name in id2label.items()}

model_class, tokenizer_class, pretrained_weights = (AutoModelForSequenceClassification, AutoTokenizer, 'distilbert-base-uncased')

tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)
# Pads each batch to its longest sequence and returns PyTorch tensors.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="pt")
# The head size is derived from the label list instead of a hard-coded 6, and
# the id<->name mappings are stored in the config so that the saved model and
# any pipeline built on it report emotion names rather than LABEL_<n>.
model = model_class.from_pretrained(
    pretrained_weights,
    num_labels=len(EMOTION_LABELS),
    id2label=id2label,
    label2id=label2id,
)

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/256M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'pre_classifier.weight', 'classi

In [None]:
def preprocess_data(df, label2id=None):
    """Turn a ``text``/``label`` DataFrame into a tokenized ``Dataset``.

    String labels are converted to integer class ids, because the data
    collator has to build a label tensor and cannot do so from strings.
    Labels that are not strings are passed through ``int()`` unchanged in
    value, so frames that are already encoded keep working.

    Args:
        df: DataFrame with a ``text`` column and a ``label`` column.
        label2id: Optional mapping from label name to integer id. Defaults to
            the six emotion names of this dataset in alphabetical order
            (anger=0, fear=1, joy=2, love=3, sadness=4, surprise=5), which
            keeps the encoding identical across train/val/test splits.

    Returns:
        datasets.Dataset with the columns ``text``, ``label`` (int) and the
        fields produced by the module-level ``tokenizer`` (truncation enabled).

    Raises:
        ValueError: If a string label is not a key of ``label2id``.
    """
    if label2id is None:
        default_names = ("anger", "fear", "joy", "love", "sadness", "surprise")
        label2id = {name: idx for idx, name in enumerate(default_names)}

    texts = df.text.tolist()
    try:
        labels = [
            label2id[value] if isinstance(value, str) else int(value)
            for value in df.label.tolist()
        ]
    except KeyError as err:
        raise ValueError(
            f"Unknown label {err.args[0]!r}; expected one of {sorted(label2id)}"
        ) from err

    dataset = Dataset.from_dict({"text": texts, "label": labels})
    # Tokenize in batches; padding is left to the data collator at batch time.
    return dataset.map(
        lambda batch: tokenizer(batch["text"], truncation=True),
        batched=True,
    )


In [None]:
# Tokenize each split with the shared preprocessing routine (train, val, test order).
tokenized_train_dataset, tokenized_val_dataset, tokenized_test_dataset = (
    preprocess_data(split_frame)
    for split_frame in (train_data, val_data, test_data)
)

  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

## Model Training

In [None]:
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1.

    Macro averaging is used because, for single-label multiclass predictions,
    micro-averaged precision, recall and F1 are all equal to accuracy and so
    add no information. The macro average weights every class equally, which
    also exposes weak performance on rare classes.

    Args:
        pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores; the arg-max over the last axis
            is taken as the predicted class).
            Presumably a ``transformers.EvalPrediction`` supplied by the
            Trainer — confirm against the caller.

    Returns:
        dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0: a class that is never predicted scores 0 instead of
    # triggering an undefined-metric warning.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='macro', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Directories for trainer checkpoints, the exported model, and TensorBoard logs.
model_output_dir, model_saving_dir, model_logging_dir = "./results", "./saved", "./logs"

# Hyper-parameters and logging options, collected in one place before being
# handed to TrainingArguments.
# NOTE(review): learning_rate=2e-6 with a single epoch is far below the 2e-5
# commonly used for fine-tuning — confirm this is intentional.
training_config = dict(
    output_dir=model_output_dir,
    learning_rate=2e-6,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir=model_logging_dir,
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",
)
training_args = TrainingArguments(**training_config)

# Trains on the tokenized training split and evaluates on the validation split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

trainer.train()
# Last expression of the cell, so the evaluation metrics are displayed.
trainer.evaluate()


In [None]:
# Everything below is written into the same export directory.
saving_path = model_saving_dir

### Store the fine-tuned model and the tokenizer side by side
trainer.save_model(saving_path)
tokenizer.save_pretrained(saving_path)

### Dump the trainer's log history as text, one entry per line
metrics_log_path = f"{saving_path}/metrics_log.txt"
with open(metrics_log_path,'w') as log_file:
    log_file.writelines(str(entry) + '\n' for entry in trainer.state.log_history)


In [19]:
#### Logging
# (Re)load the TensorBoard notebook extension, then start TensorBoard on the
# directory held in `model_logging_dir`; IPython substitutes the `{...}`
# expression with the Python variable's value before running the magic.
%reload_ext tensorboard
%tensorboard --logdir '{model_logging_dir}'


<IPython.core.display.Javascript object>

## Inference

In [None]:
### Load the fine-tuned model and tokenizer back from disk and classify a sample text

from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import TextClassificationPipeline

model_class, tokenizer_class, pretrained_weights = (AutoModelForSequenceClassification, AutoTokenizer, 'saved')

finetuned_tokenizer = tokenizer_class.from_pretrained(pretrained_weights, do_lower_case=True)
# num_labels is deliberately not passed: the checkpoint in 'saved' was trained
# with a 6-class head, and its saved config already records that. Forcing
# num_labels=3 here makes loading fail with a classifier size mismatch.
finetuned_model = model_class.from_pretrained(pretrained_weights)

text = "75 thousands"
# Default pipeline behaviour returns only the top-scoring label, which is what
# the deprecated return_all_scores=False used to request.
pipe = TextClassificationPipeline(model=finetuned_model, tokenizer=finetuned_tokenizer)
# Explicit display: only a cell's last expression is shown automatically, and
# the last expression here is the evaluation call below.
display(pipe(text))

# NOTE(review): this evaluates the in-memory `trainer` model on its configured
# eval dataset, not the model reloaded above — confirm this is intended.
trainer.evaluate()