In [None]:
# Colab helper used to attach Google Drive to this runtime.
from google.colab import drive
# Mount Drive under /content/drive; later cells read and write files below
# '/content/drive/My Drive/NLP/'. force_remount is passed as True
# (presumably to refresh an already-existing mount — confirm against the Colab docs).
drive.mount('/content/drive', force_remount= True)

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import sklearn

# The column names have to be supplied by hand, i.e. the CSV carries no header
# row. Passing header=None / names= keeps pandas from consuming the first
# tweet as a header (which silently dropped one example before the names were
# overwritten).
data = pd.read_csv(
    '/content/drive/My Drive/NLP/training.1600000.processed.noemoticon.csv',
    encoding='latin-1',
    header=None,
    names=['label', 'id', 'date', 'flag', 'user', 'tweet'],
)
# Only the sentiment label and the tweet text are used by the later cells.
data = data.drop(columns=['id', 'date', 'flag', 'user'])
data

Unnamed: 0,label,tweet
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew
...,...,...
1599994,4,Just woke up. Having no school is the best fee...
1599995,4,TheWDB.com - Very cool to hear old Walt interv...
1599996,4,Are you ready for your MoJo Makeover? Ask me f...
1599997,4,Happy 38th Birthday to my boo of alll time!!! ...


In [None]:
# Label encoding: 0 ---> negative, 2 ---> neutral, 4 ---> positive.
# Keep a balanced subset: the first 40,000 negative and the first 40,000
# positive tweets.
neg = data[data['label'] == 0][:40000]
pos = data[data['label'] == 4][:40000]

# Shuffle with a fixed seed. The later train/test split depends on the row
# order produced here, so an unseeded shuffle made the splits differ from run
# to run (the global seeds are only set in a later cell).
df = pd.concat([neg, pos])
df = df.sample(frac=1, random_state=42)



In [None]:
# train-val-test : 35000, 15000, 30000
from sklearn.model_selection import train_test_split


# First cut: hold out 37.5% of the rows as the test set, preserving the
# label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    df['tweet'],
    df['label'],
    test_size=0.375,
    stratify=df['label'],
    random_state=42,
)

# Second cut: take 30% of the remaining rows as the validation set, again
# stratified on the label.
X_train_final, X_valid, y_train_final, y_valid = train_test_split(
    X_train,
    y_train,
    test_size=0.3,
    stratify=y_train,
    random_state=42,
)


In [None]:
# Report the sizes of the three disjoint splits. X_train still contains the
# validation rows, so the training count must come from X_train_final.
print(f"Number of training examples: {len(X_train_final)}")
print(f"Number of validation examples: {len(X_valid)}")
print(f"Number of testing examples: {len(X_test)}")

Number of training examples: 50000
Number of validation examples: 15000
Number of testing examples: 30000


In [None]:
# Frames that get written to disk for torchtext. The training frame is built
# from X_train_final / y_train_final: X_train / y_train are the pre-validation
# split and still contain every validation row, so using them would leak the
# validation set into training.
train_data = pd.DataFrame(zip(X_train_final.to_list(), y_train_final.to_list()), columns=['tweet', 'label'])
valid_data = pd.DataFrame(zip(X_valid.to_list(), y_valid.to_list()), columns=['tweet', 'label'])
test_data = pd.DataFrame(zip(X_test.to_list(), y_test.to_list()), columns=['tweet', 'label'])

In [None]:
# Persist the three splits as CSV files on Drive; the same file names are
# read back by TabularDataset.splits further down. Only the path is passed,
# so every other to_csv option keeps its default.
train_data.to_csv('/content/drive/My Drive/NLP/train_data1.csv')
valid_data.to_csv('/content/drive/My Drive/NLP/valid_data1.csv')
test_data.to_csv('/content/drive/My Drive/NLP/test_data1.csv')

In [None]:
import torch

import random
import numpy as np

# One seed shared by every random number generator seeded in this cell.
SEED = 1234

# Ask cuDNN to use deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Seed PyTorch, NumPy and Python's own generator with the same value.
torch.manual_seed(SEED)
np.random.seed(SEED)
random.seed(SEED)

In [None]:
!pip install transformers
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Collecting transformers
  Downloading transformers-4.10.0-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.2 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 39.7 MB/s 
Collecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.16-py3-none-any.whl (50 kB)
[K     |████████████████████████████████| 50 kB 6.4 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.2 MB/s 
Collecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 40.9 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
# Special-token strings exposed by the tokenizer, bound in a single statement.
init_token, eos_token, pad_token, unk_token = (
    tokenizer.cls_token,
    tokenizer.sep_token,
    tokenizer.pad_token,
    tokenizer.unk_token,
)
print(init_token, eos_token, pad_token, unk_token)

[CLS] [SEP] [PAD] [UNK]


In [None]:
# Translate each special-token string into its vocabulary id, one lookup per
# token and in the same order as before.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    tokenizer.convert_tokens_to_ids(token)
    for token in (init_token, eos_token, pad_token, unk_token)
)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
# The same four ids, this time read from the tokenizer's *_token_id
# attributes; these rebind the names assigned by the lookup cell above.
init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx = (
    tokenizer.cls_token_id,
    tokenizer.sep_token_id,
    tokenizer.pad_token_id,
    tokenizer.unk_token_id,
)
print(init_token_idx, eos_token_idx, pad_token_idx, unk_token_idx)

101 102 0 100


In [None]:
# Maximum input size the tokenizer records for the 'bert-base-uncased'
# checkpoint; tokenize_and_cut uses it to truncate tweets.
max_input_length = tokenizer.max_model_input_sizes['bert-base-uncased']
print(max_input_length)

512


In [None]:
def tokenize_and_cut(sentence):
    """Tokenize ``sentence`` and keep at most ``max_input_length - 2`` tokens.

    Relies on the module-level ``tokenizer`` and ``max_input_length``. The two
    spare positions presumably leave room for the init/eos tokens configured
    on the TEXT field — confirm.
    """
    token_budget = max_input_length - 2
    return tokenizer.tokenize(sentence)[:token_budget]

In [None]:
from torchtext.legacy import data

# Field for the tweet text: tweets are tokenized with tokenize_and_cut and the
# tokens are mapped to ids by the BERT tokenizer, so no torchtext vocabulary
# is used (use_vocab=False). The special tokens are given as ids.
TEXT = data.Field(
    tokenize=tokenize_and_cut,
    preprocessing=tokenizer.convert_tokens_to_ids,
    use_vocab=False,
    batch_first=True,
    init_token=init_token_idx,
    eos_token=eos_token_idx,
    pad_token=pad_token_idx,
    unk_token=unk_token_idx,
)

# Field for the sentiment label, produced as float tensors.
LABEL = data.LabelField(dtype=torch.float)

In [None]:
# CSV column name -> (attribute name on each example/batch, Field that
# processes the column).
fields = {
    'tweet': ('tweet', TEXT),
    'label': ('label', LABEL),
}

In [None]:
from torchtext.legacy.data import TabularDataset, Field, BucketIterator, LabelField
# Re-read the three CSV splits from Drive as torchtext datasets. This rebinds
# train_data / valid_data / test_data, which previously held DataFrames.
# skip_header stays False; presumably the header row is needed to resolve the
# column names used as keys in `fields` — confirm against the torchtext docs.
train_data, valid_data, test_data = TabularDataset.splits(
    path='/content/drive/My Drive/NLP/',
    format='csv',
    fields=fields,
    skip_header=False,
    train='train_data1.csv',
    validation='valid_data1.csv',
    test='test_data1.csv',
)

In [None]:
# Upper bound handed to TEXT.build_vocab as max_size.
MAX_VOCAB_SIZE = 25_000
# NOTE(review): TEXT was created with use_vocab = False, so this vocabulary is
# presumably never consulted when batches are numericalized — confirm before
# relying on it or removing it.
TEXT.build_vocab(train_data, max_size = MAX_VOCAB_SIZE)
# Build the label vocabulary from the training split.
LABEL.build_vocab(train_data)

In [None]:
# Report the size of the two vocabularies built from the training split.
print(f"Unique tokens in TEXT vocabulary: {len(TEXT.vocab)}")
print(f"Unique tokens in LABEL vocabulary: {len(LABEL.vocab)}")

Unique tokens in TEXT vocabulary: 18483
Unique tokens in LABEL vocabulary: 2


In [None]:
# Show the mapping from raw label strings to the indices LABEL assigned.
print(LABEL.vocab.stoi)

defaultdict(None, {'0': 0, '4': 1})


In [None]:
# Number of examples per batch for all three iterators.
BATCH_SIZE = 16
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# One iterator per split; sorting is switched off both across the dataset and
# inside each batch.
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    device=device,
    sort=False,
    sort_within_batch=False,
)



In [None]:
from transformers import BertTokenizer, BertModel
# Load the pretrained 'bert-base-uncased' encoder; it is handed to
# BERTMLPSentiment below.
bert = BertModel.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
import torch.nn as nn

import torch.nn.functional as F

class BERTMLPSentiment(nn.Module):
    """Sentiment classifier: a BERT encoder followed by a three-layer MLP head.

    The hidden state of the first token of every sequence is fed through
    ``hidden_size -> 512 -> 128 -> 1`` linear layers, with ReLU and dropout
    after each of the two hidden layers. The output is a raw logit (no sigmoid
    is applied here).

    Args:
        bert: encoder module. Must expose ``config.to_dict()['hidden_size']``,
            accept ``(input_ids, attention_mask=...)`` and return the per-token
            hidden states as element 0 of its output.
        dropout: dropout probability used after each hidden layer.
    """

    def __init__(self, bert, dropout):
        super().__init__()
        self.bert = bert
        embedding_dim = bert.config.to_dict()['hidden_size']
        hidden_1 = 512
        hidden_2 = 128
        # linear layer (hidden_size -> hidden_1)
        self.fc1 = nn.Linear(embedding_dim, hidden_1)
        # linear layer (hidden_1 -> hidden_2)
        self.fc2 = nn.Linear(hidden_1, hidden_2)
        # linear layer (hidden_2 -> 1 logit)
        self.fc3 = nn.Linear(hidden_2, 1)
        # Honour the constructor argument; it used to be ignored in favour of
        # a hard-coded 0.25.
        self.dropout = nn.Dropout(dropout)
        # Padding id used to build the attention mask in forward(); None when
        # the encoder config does not define one.
        self.pad_token_id = getattr(bert.config, 'pad_token_id', None)

    def forward(self, text):
        """Return one logit per sequence.

        Args:
            text: integer tensor of token ids, indexed as (batch, sequence).

        Returns:
            Tensor with one row per sequence and a single column of logits.
        """
        if self.pad_token_id is None:
            embedded = self.bert(text)[0]
        else:
            # Without a mask the encoder attends to padding positions as if
            # they were real tokens.
            attention_mask = (text != self.pad_token_id).long()
            embedded = self.bert(text, attention_mask=attention_mask)[0]

        # Hidden state of the first token; indexing already yields a
        # (batch, hidden_size) tensor, so no hard-coded reshape is needed.
        first_token_state = embedded[:, 0, :]

        hidden = F.relu(self.fc1(first_token_state))
        hidden = self.dropout(hidden)
        hidden = F.relu(self.fc2(hidden))
        hidden = self.dropout(hidden)
        output = self.fc3(hidden)
        return output
      

In [None]:
# NOTE(review): HIDDEN_DIM, OUTPUT_DIM, N_LAYERS and BIDIRECTIONAL are not
# referenced by any other cell visible in this notebook — likely leftovers;
# confirm before removing.
HIDDEN_DIM = 256
OUTPUT_DIM = 1
N_LAYERS = 2
BIDIRECTIONAL = True
# Dropout probability passed to the classifier constructor.
DROPOUT = 0.25

# Wrap the pretrained encoder with the MLP classification head.
model = BERTMLPSentiment(bert, DROPOUT)

In [None]:
def count_parameters(model):
    """Return the total number of elements in the model's trainable parameters.

    Only parameters whose ``requires_grad`` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total
# Trainable-parameter count with every weight still trainable.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 109,941,761 trainable parameters


In [None]:
# Freeze every parameter whose name starts with 'bert' (the wrapped encoder),
# leaving only the remaining layers trainable.
for name, param in model.named_parameters():                
    if name.startswith('bert'):
        param.requires_grad = False

In [None]:
# count_parameters is already defined in an earlier cell, so it is reused here
# instead of being redefined. Re-run after freezing the encoder to show how
# the trainable-parameter count changed.
print(f'The model has {count_parameters(model):,} trainable parameters')

The model has 459,521 trainable parameters


In [None]:
# List the names of the parameters that are still trainable.
for name, param in model.named_parameters():                
    if param.requires_grad:
        print(name)

fc1.weight
fc1.bias
fc2.weight
fc2.bias
fc3.weight
fc3.bias


In [None]:
import torch.optim as optim
# Adam with its default hyperparameters over every parameter of the model.
# NOTE(review): this also hands over the encoder weights whose requires_grad
# was set to False earlier; presumably they are skipped because they never
# receive gradients — confirm.
optimizer = optim.Adam(model.parameters())

In [None]:
# Binary cross-entropy that takes raw logits; the model's forward() returns
# the output of its last linear layer without a sigmoid.
criterion = nn.BCEWithLogitsLoss()

In [None]:
# Move the model and the loss module to the device selected earlier
# (the same device the iterators were created with).
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def binary_accuracy(preds, y):
    """Fraction of a batch classified correctly (e.g. 8 right out of 10 -> 0.8).

    ``preds`` are raw logits: they are squashed with a sigmoid and rounded
    before being compared element-wise with the targets ``y``.
    """
    predicted_labels = torch.sigmoid(preds).round()
    hits = predicted_labels.eq(y).float()
    return hits.sum() / len(hits)

In [None]:
def train(model, iterator, optimizer, criterion):
    """Run one optimisation pass over every batch in ``iterator``.

    Each batch must expose ``tweet`` (model input) and ``label`` (targets).

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by
        ``len(iterator)``.
    """
    model.train()
    running_loss = 0
    running_acc = 0

    for batch in iterator:
        optimizer.zero_grad()
        # Drop dimension 1 of the model output so it lines up with batch.label.
        logits = model(batch.tweet).squeeze(1)
        loss = criterion(logits, batch.label)
        acc = binary_accuracy(logits, batch.label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        running_acc += acc.item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch in ``iterator`` without updating it.

    The model is switched to eval mode and gradients are disabled for the
    whole pass.

    Returns:
        (loss, accuracy), each the sum of the per-batch values divided by
        ``len(iterator)``.
    """
    model.eval()
    running_loss = 0
    running_acc = 0

    with torch.no_grad():
        for batch in iterator:
            # Drop dimension 1 of the model output so it lines up with batch.label.
            logits = model(batch.tweet).squeeze(1)
            running_loss += criterion(logits, batch.label).item()
            running_acc += binary_accuracy(logits, batch.label).item()

    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches

In [None]:
import time

def epoch_time(start_time, end_time):
    """Split the time elapsed between two timestamps (in seconds) into whole
    minutes and the remaining whole seconds, returned as ``(mins, secs)``."""
    elapsed = end_time - start_time
    minutes = int(elapsed / 60)
    seconds = int(elapsed - minutes * 60)
    return minutes, seconds

In [None]:
# Number of full passes over the training iterator.
N_EPOCHS = 5

# Lowest validation loss seen so far; starts at +inf so the first epoch
# always produces a checkpoint.
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    # One optimisation pass, then a gradient-free pass over the validation set.
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion)
        
    end_time = time.time()
        
    # The measured time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
        
    # Checkpoint only when the validation loss improves, so 'bert-model.pt'
    # always holds the best weights seen so far.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'bert-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

In [None]:
# Restore the checkpoint written by the training loop (the weights with the
# lowest validation loss). map_location=device remaps the stored tensors onto
# the current device, so a checkpoint saved on a GPU still loads on a
# CPU-only runtime.
model.load_state_dict(torch.load('bert-model.pt', map_location=device))
# Score the restored model once on the held-out test iterator.
test_loss, test_acc = evaluate(model, test_iterator, criterion)
print(f'Test Loss: {test_loss:.3f} | Test Acc: {test_acc*100:.2f}%')

Test Loss: 0.509 | Test Acc: 74.94%
