<a href="https://colab.research.google.com/github/meti-94/TextClassification/blob/main/sample-tiny-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install necessary libraries 

In [1]:
# Install main transformers library
!pip install transformers==4.1.1

Collecting transformers==4.1.1
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 8.1MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 43.8MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/fb/36/59e4a62254c5fcb43894c6b0e9403ec6f4238cc2422a003ed2e6279a1784/tokenizers-0.9.4-cp37-cp37m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 52.4MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.9.4 transformers-4.1.1


# Import libraries

In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from transformers import AutoConfig, AutoTokenizer, AutoModel, BertForSequenceClassification
from sklearn.metrics import classification_report

# load up data to classifiy

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [21]:
!wget https://github.com/vineetdhanawat/twitter-sentiment-analysis/blob/master/datasets/Sentiment%20Analysis%20Dataset.csv


--2021-05-18 04:23:57--  https://github.com/vineetdhanawat/twitter-sentiment-analysis/blob/master/datasets/Sentiment%20Analysis%20Dataset.csv
Resolving github.com (github.com)... 192.30.255.112
Connecting to github.com (github.com)|192.30.255.112|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified [text/html]
Saving to: ‘Sentiment Analysis Dataset.csv’

Sentiment Analysis      [ <=>                ] 113.26K  --.-KB/s    in 0.02s   

2021-05-18 04:23:57 (5.31 MB/s) - ‘Sentiment Analysis Dataset.csv’ saved [115981]



In [44]:
## loading data
## X --> Texts
## y --> Labels

df = pd.read_excel('/content/drive/MyDrive/warm-up-data-set.xlsx')
# print(df.columns)
X = df['text'].to_list(); y = df['label'].to_list()

## Creating Test dataset
train_texts, test_texts, train_labels, test_labels = train_test_split(X, 
                                                                      y, 
                                                                      random_state=42, 
                                                                      stratify=y, 
                                                                      test_size=.25)
## Create Valid dataset
train_texts, valid_texts, train_labels, valid_labels = train_test_split(train_texts, 
                                                                      train_labels, 
                                                                      random_state=42, 
                                                                      stratify=train_labels, 
                                                                      test_size=.1)


# Loading BERT model and tokenizer model

In [45]:
# v1.0
config = AutoConfig.from_pretrained("prajjwal1/bert-tiny")
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
model = BertForSequenceClassification.from_pretrained("prajjwal1/bert-tiny", num_labels=8)


Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initia

# Creating dataset and dataloader for X-y

In [46]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=200)
val_encodings = tokenizer(valid_texts, truncation=True, padding=True, max_length=200)

In [47]:
# convert raw text file to proper dataset object (based on task)
import torch

class ClassificationDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        # initialization
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        # slicing method X[index]
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = ClassificationDataset(train_encodings, train_labels)
valid_dataset = ClassificationDataset(val_encodings, valid_labels)

In [52]:
# transformers API for train :)
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=40,              # total number of training epochs
    per_device_train_batch_size=36,  # batch size per device during training
    per_device_eval_batch_size=36,   # batch size for evaluation
    # warmup_steps=500,                # number of warmup steps for learning rate scheduler
    # weight_decay=0.01,               # strength of weight decay
    learning_rate= 5e-6,
    adam_epsilon = 1e-8, 
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    do_eval=True,
    evaluation_strategy = 'epoch'
    
    
)
# training_args.evaluation_strategy = EvaluationStrategy.EPOCH
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=valid_dataset,          # evaluation dataset

)

trainer.train()

Epoch,Training Loss,Validation Loss
1,0.774587,0.767255
2,0.742046,0.740191
3,0.720464,0.716848
4,0.705025,0.696185
5,0.693256,0.678959
6,0.641095,0.663996
7,0.630804,0.650457
8,0.652302,0.639095
9,0.64571,0.629624
10,0.607624,0.620937


TrainOutput(global_step=4000, training_loss=0.6423980762958527)

In [53]:
test_encodings = tokenizer(test_texts, truncation=True, padding=True, max_length=200)
test_dataset = ClassificationDataset(test_encodings, test_labels)

# Predicting on test set

In [54]:
pred = trainer.predict(test_dataset)

# Evaluation

In [55]:
print(classification_report(pred.label_ids, pred.predictions.argmax(-1)))

              precision    recall  f1-score   support

           0       0.74      0.70      0.72      1333
           1       0.72      0.75      0.74      1333

    accuracy                           0.73      2666
   macro avg       0.73      0.73      0.73      2666
weighted avg       0.73      0.73      0.73      2666



In [None]:
trainer.save_model("./classifier")

In [None]:
cp -r classifier/ drive/MyDrive/