In [1]:
!pip install transformers



# Importing libraries

In [2]:
import pandas as pd
import csv
import numpy as np
import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import transformers
from transformers import AutoModel, BertTokenizerFast, BertForSequenceClassification
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, Dataset
from torch.optim import AdamW
from tqdm import tqdm

In [3]:
# Setting up the device

from torch import cuda
device = 'cuda' if cuda.is_available() else 'cpu'
print("Device:", device)

Device: cuda


In [4]:
from google.colab import drive
drive.mount('/content/drive')

!unzip "/content/train.csv.zip" -d "/content/drive/MyDrive/CMI"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
unzip:  cannot find or open /content/train.csv.zip, /content/train.csv.zip.zip or /content/train.csv.zip.ZIP.


# Loading and preparing train dataset

In [5]:
train_data = pd.read_csv("/content/drive/MyDrive/CMI/train.csv", encoding='latin-1')
print(f"Training data shape: {train_data.shape}")
train_data.head()

Training data shape: (27481, 10)


Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [6]:
# Selecting the columns text and sentiments
train_sent = train_data[['text', 'sentiment']]
print(train_sent.head())
print()
print("Count of sentiments:", train_sent['sentiment'].value_counts())

                                                text sentiment
0                I`d have responded, if I were going   neutral
1      Sooo SAD I will miss you here in San Diego!!!  negative
2                          my boss is bullying me...  negative
3                     what interview! leave me alone  negative
4   Sons of ****, why couldn`t they put them on t...  negative

Count of sentiments: sentiment
neutral     11118
positive     8582
negative     7781
Name: count, dtype: int64


In [7]:
# Encode sentiments
train_sent['sentiment'] = train_sent['sentiment'].map({'positive': 2, 'negative': 0, 'neutral': 1})
print(train_sent.value_counts())

text                                                                                                                                                 sentiment
Ã¯Â¿Â½Ã¯Â¿Â½h. iÃ¯Â¿Â½m gonna go in the 'big' house now, borrow my sis guitar and play and maybe write                                               1            1
\tREALLY?? oh.. sorry yall  lol                                                                                                                      0            1
        _beckett Thanks so much !                                                                                                                    2            1
      You`ll be missed!!  Bring me back  a keychain!                                                                                                 0            1
      get lit  I am  http://bit.ly/OASQR                                                                                                             1            1
                     

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sent['sentiment'] = train_sent['sentiment'].map({'positive': 2, 'negative': 0, 'neutral': 1})


In [8]:
# handling missing values
train = train_sent[train_sent.isna().any(axis = 1)]
print(train)
train_sent = train_sent.dropna()

    text  sentiment
314  NaN          1


In [9]:
print("Train dataset")
print(train_sent.head())
print(f"Shape of Train dataset: {train_sent.shape}")

Train dataset
                                                text  sentiment
0                I`d have responded, if I were going          1
1      Sooo SAD I will miss you here in San Diego!!!          0
2                          my boss is bullying me...          0
3                     what interview! leave me alone          0
4   Sons of ****, why couldn`t they put them on t...          0
Shape of Train dataset: (27480, 2)


# Loading and preparing test dataset

In [10]:
test_data = pd.read_csv("/content/drive/MyDrive/CMI/test.csv", encoding = 'latin-1')
print(f"Test data shape: {test_data.shape}")
test_data.head()

Test data shape: (4815, 9)


Unnamed: 0,textID,text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral,morning,0-20,Afghanistan,38928346.0,652860.0,60.0
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive,noon,21-30,Albania,2877797.0,27400.0,105.0
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative,night,31-45,Algeria,43851044.0,2381740.0,18.0
3,01082688c6,happy bday!,positive,morning,46-60,Andorra,77265.0,470.0,164.0
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive,noon,60-70,Angola,32866272.0,1246700.0,26.0


In [11]:
# Selecting the columns text and sentiments
test_sent = test_data[['text', 'sentiment']]
print(test_sent.head())
print()
print("Count of sentiments:", test_sent['sentiment'].value_counts())

                                                text sentiment
0  Last session of the day  http://twitpic.com/67ezh   neutral
1   Shanghai is also really exciting (precisely -...  positive
2  Recession hit Veronique Branquinho, she has to...  negative
3                                        happy bday!  positive
4             http://twitpic.com/4w75p - I like it!!  positive

Count of sentiments: sentiment
neutral     1430
positive    1103
negative    1001
Name: count, dtype: int64


In [12]:
# Encode sentiments
test_sent['sentiment'] = test_sent['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})
print(test_sent.value_counts())

text                                                                                                                  sentiment
Ã¯Â¿Â½Ã¯Â¿Â½We are ready for the new generation!!...Oh yeah...We are a Backstreet fans and we are proud about it!!    2.0          1
    - brief interruption- brb. goodnight if i don`t catch u again later. gotta ck something                           2.0          1
    GRR! my stupid meebo is disconnecting me every 5 seconds                                                          0.0          1
    I am sending healthy, healing thoughts in the mumborg`s direction.                                                2.0          1
    eeek!! Your coming!!!! Im soo excited to see you on Thursday!!                                                    2.0          1
                                                                                                                                  ..
  Darn, I thought you meant White Sox.                                    

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_sent['sentiment'] = test_sent['sentiment'].map({'negative': 0, 'neutral': 1, 'positive': 2})


In [13]:
# handling missing values
test = test_sent[test_sent.isna().any(axis = 1)]
print(test)
test_sent = test_sent.dropna()

     text  sentiment
3534  NaN        NaN
3535  NaN        NaN
3536  NaN        NaN
3537  NaN        NaN
3538  NaN        NaN
...   ...        ...
4810  NaN        NaN
4811  NaN        NaN
4812  NaN        NaN
4813  NaN        NaN
4814  NaN        NaN

[1281 rows x 2 columns]


In [14]:
print("Test dataset")
print(test_sent.head())
print(f"Shape of Test dataset: {test_sent.shape}")

Test dataset
                                                text  sentiment
0  Last session of the day  http://twitpic.com/67ezh        1.0
1   Shanghai is also really exciting (precisely -...        2.0
2  Recession hit Veronique Branquinho, she has to...        0.0
3                                        happy bday!        2.0
4             http://twitpic.com/4w75p - I like it!!        2.0
Shape of Test dataset: (3534, 2)


# Importing BERT model and Bert Tokenizer

In [15]:
# import BERT-base pretrained model
bert = AutoModel.from_pretrained('bert-base-uncased')

# Load the BERT tokenizer
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

## Defining dataset and adding tokenizer

In [29]:
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

class DatasetBuilder(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len = 128):
        self.encodings = tokenizer(texts, truncation=True, padding=True, max_length = max_len)
        self.labels = labels

    def __getitem__(self, index):
        item = {}
        for key, value in self.encodings.items():
          item[key] = torch.tensor(value[index])
        item['labels'] = torch.tensor(self.labels[index])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DatasetBuilder(train_sent['text'].tolist(), train_sent['sentiment'].tolist(), tokenizer)
test_dataset = DatasetBuilder(test_sent['text'].tolist(), test_sent['sentiment'].tolist(), tokenizer)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

In [30]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=3)
model.to(device) # moving model to device

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

## Training the model

In [31]:
optimizer = AdamW(model.parameters(), lr = 2e-5)
model.train()

for epoch in range(2): # no. of epochs = 2
    loop = tqdm(train_loader, desc = f"Epoch {epoch+1}")
    for batch in loop:
        batch = {key: value.to(device) for key, value in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

Epoch 1: 100%|██████████| 1718/1718 [08:53<00:00,  3.22it/s, loss=0.915]
Epoch 2: 100%|██████████| 1718/1718 [08:52<00:00,  3.23it/s, loss=0.26]


## Evaluating the model on test data

In [32]:
model.eval()
all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        labels = batch['labels']
        batch = {key: value.to(device) for key, value in batch.items() if key != 'labels'}
        outputs = model(**batch)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1).cpu()
        all_preds.extend(preds.numpy())
        all_labels.extend(labels.numpy())

## Classification report

In [34]:
print("\nClassification Report:")
print(classification_report(all_labels, all_preds, target_names={'negative': 0, 'neutral': 1, 'positive': 2}))


Classification Report:
              precision    recall  f1-score   support

    negative       0.81      0.76      0.79      1001
     neutral       0.75      0.80      0.77      1430
    positive       0.85      0.82      0.84      1103

    accuracy                           0.80      3534
   macro avg       0.80      0.79      0.80      3534
weighted avg       0.80      0.80      0.80      3534

