In [1]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
import torch

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.7 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.5 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.1 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.53.tar.gz (880 kB)
[K     |████████████████████████████████| 880 kB 47.7 MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacre

# Preprocessing

In [None]:
# train labels
train_label_file = open('project-data/train.label.txt', 'r') 
train_labels = train_label_file.readlines()
train_labels = [label.strip('\n') for label in train_labels]

#dev labels
dev_label_file = open('project-data/dev.label.txt', 'r')
dev_labels = dev_label_file.readlines()
dev_labels = [label.strip('\n') for label in dev_labels]

In [None]:
# open train text file
f = open(f'./tweet_text.pckl','rb')
train_data = pickle.load(f)
f.close()


# open dev text file
f = open(f'./dev_tweet_text.pckl','rb')
dev_data = pickle.load(f)
f.close()

# open test text file
f = open(f'./test_tweet_text.pckl','rb')
test_data = pickle.load(f)
f.close()

In [None]:
## cleaning the tweets
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #remove @mention
    text = re.sub(r'#','',text) # remove the hashtag symbol
    text = re.sub(r'https?:\/\/\S+', '',text) #remove hyperlink
    text = re.sub(r'\n','',text) # remove \n 
    text = re.sub(r'\r','',text) # remove \r
    text = re.sub(r'[0-9]+','',text) #remove all the number
    text = re.sub(r'\W+', ' ', text) #remove special characters
    text = text.strip().lower()
    if len(text) != 0:
        return text
    else:
        return None
    
for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        train_data[i][j] = clean_text(train_data[i][j])
    train_data[i] = [x for x in train_data[i] if x is not None]
        
for i in range(len(dev_data)):
    for j in range(len(dev_data[i])):
        dev_data[i][j] = clean_text(dev_data[i][j])
    dev_data[i] = [x for x in dev_data[i] if x is not None]
    
for i in range(len(test_data)):
    for j in range(len(test_data[i])):
        test_data[i][j] = clean_text(test_data[i][j])
    test_data[i] = [x for x in test_data[i] if x is not None]

In [None]:
maxlen = 512

def split_based_length(text, maxlen):
    ts = list(filter(None, text.split(" ")))
    split_text = []
    #print("lents", len(ts))
    while len(ts) > maxlen:
        split_text.append(ts[:maxlen])
        ts = ts[maxlen:]
    if len(ts) != 0:
        split_text.append(ts)
        #print("leftts", ts)
    return [" ".join(word for word in s) for s in split_text]

In [None]:
# merge source tweeet and reply tweet together for train data
train_merge_labels=[]
train_merge_events=[]
train_index=[]
for i in range(len(train_data)):
    merge = ' '.join(word for word in train_data[i])
    split_text = split_based_length(merge, maxlen)
    for text in split_text:
        train_merge_events.append(str(text))
        train_merge_labels.append(train_labels[i])
        train_index.append(i)

# merge source tweeet and reply tweet together for dev data
dev_merge_events=[]
dev_merge_labels=[]
dev_index=[]
for i in range(len(dev_data)):
    merge = ' '.join(word for word in dev_data[i])
    split_text = split_based_length(merge, maxlen)
    for text in split_text:
        dev_merge_events.append(str(text))
        dev_merge_labels.append(dev_labels[i])
        dev_index.append(i)

NameError: ignored

In [None]:
train_df = pd.DataFrame({'text':train_merge_events, 'label':train_merge_labels,'index':train_index})
train_df['label'] = LabelEncoder().fit_transform(train_df['label'])
nan_value = float("NaN")
train_df.replace("", nan_value, inplace=True)
train_df.dropna(axis=0,inplace=True)
print("train", train_df.shape)
train_df.to_csv('./train_bert_split.tsv', sep='\t',index=False)

dev_df = pd.DataFrame({'text':dev_merge_events, 'label':dev_merge_labels,'index':dev_index})
dev_df['label'] = LabelEncoder().fit_transform(dev_df['label'])
dev_df.replace("", nan_value, inplace=True)
dev_df.dropna(axis=0,inplace=True)
print("dev", dev_df.shape)
dev_df.to_csv('./dev_bert_split.tsv', sep='\t',index=False)

train (2737, 2)
dev (930, 2)


In [None]:
test_merge_events=[]
test_merge_index=[]
for i, event in enumerate(test_data):
    merge = ' '.join(word for word in event)
    split_text = split_based_length(merge, maxlen)
    for text in split_text:
        test_merge_events.append(str(text))
        test_merge_index.append(i)
        
test_df = pd.DataFrame({'text':test_merge_events, 'index': test_merge_index})
nan_value = float("NaN")
test_df.replace("", nan_value, inplace=True)
test_df.dropna(axis=0,inplace=True)
print("test", test_df.shape)
test_df.to_csv('test_split_svc.tsv', sep='\t',index=False)

# Training

In [3]:
from transformers import BertModel
from transformers import BertTokenizer
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
from transformers import AutoModelForSequenceClassification
import torch.nn as nn
import torch.optim as optim
import time


In [4]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

#load BERT's WordPiece tokenisation model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [25]:

#from transformers import AutoTokenizer

class Dataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'text']
        label = self.df.loc[index, 'label']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

In [26]:
train_set = Dataset(filename ='./train_bert_split.tsv', maxlen = 512)
dev_set = Dataset(filename ='./dev_bert_split.tsv', maxlen = 512)

train_loader = DataLoader(train_set,batch_size = 8, num_workers = 0)
dev_loader = DataLoader(dev_set, batch_size = 8, num_workers = 0)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [27]:
class TweetClassifier(nn.Module):
    def __init__(self):
        super(SentimentClassifier, self).__init__()
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        outputs = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)
        cont_reps = outputs.last_hidden_state
        cls_rep = cont_reps[:, 0]
        logits = self.cls_layer(cls_rep)
        return logits,outputs

In [28]:
gpu = 0 #gpu ID

print("Creating the tweet classifier, initialised with pretrained BERT-BASE parameters...")
net = TweetClassifier()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the Tweet classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [29]:
criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [30]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [31]:
def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        
        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'bert_split.dat'.format(ep))

In [12]:
num_epoch = 16

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.8324136137962341; Accuracy: 0.125; Time taken (s): 1.8216800689697266
Iteration 100 of epoch 0 complete. Loss: 0.7103641629219055; Accuracy: 0.375; Time taken (s): 151.81797313690186
Iteration 200 of epoch 0 complete. Loss: 0.24406111240386963; Accuracy: 0.875; Time taken (s): 150.7084288597107
Epoch 0 complete! Development Accuracy: 0.8915662169456482; Development Loss: 0.2614969952054412
Best development accuracy improved from 0 to 0.8915662169456482, saving model...


# Testing

In [13]:
class TestDataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'text']
        index = self.df.loc[index, 'index']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, index

In [16]:
test_set = TestDataset(filename = 'test_split_svc.tsv', maxlen = 512)

#Creating intsances of training and development dataloaders
test_loader = DataLoader(test_set,batch_size = 1, num_workers = 0)

In [None]:
weight_file = "bert_split.dat"

In [None]:
train_outputs= None
net.load_state_dict(torch.load(weight_file))
with torch.no_grad():
    for seq, attn_masks,_ in train_loader:
        seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
        logits,output = net(seq, attn_masks)
        pooled_output = output.pooler_output   
        if train_outputs is None:
            train_outputs = pooled_output.detach().cpu().numpy()
        else:
            train_outputs = np.append(train_outputs, pooled_output.detach().cpu().numpy(), axis=0)

In [None]:
dev_outputs= None
net.load_state_dict(torch.load(weight_file))
with torch.no_grad():
    for seq, attn_masks,_ in dev_loader:
        seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
        logits,output = net(seq, attn_masks)
        pooled_output = output.pooler_output   
        if dev_outputs is None:
            dev_outputs = pooled_output.detach().cpu().numpy()
        else:
            dev_outputs = np.append(dev_outputs, pooled_output.detach().cpu().numpy(), axis=0)


In [None]:
train_x = {}
for l, emb in zip(t['index'], train_outputs):
    if l in train_x.keys():
        train_x[l]  =np.vstack([train_x[l], emb])
    else:
        train_x[l] = [emb]

dev_x = {}
for l, emb in zip(d['index'], dev_outputs):
    if l in dev_x.keys():
        dev_x[l]  =np.vstack([dev_x[l], emb])
    else:
        dev_x[l] = [emb]

In [None]:
ori_train = pd.read_csv('train.tsv', sep='\t')
ori_dev = pd.read_csv('dev.tsv', sep='\t')

In [None]:
for l in train_x.keys():
    # print(len(train_x[l]))
    train_x[l] = np.mean(train_x[l],axis=0)

train_l_final = []
tlabel_l_final = []
for k in train_x.keys():
    train_l_final.append(train_x[k])
    tlabel_l_final.append(ori_train.loc[k]['label'])

df_train = pd.DataFrame({'emb': train_l_final, 'label': tlabel_l_final})
df_train.head(10)

In [None]:
for l in dev_x.keys():
    # print(len(train_x[l]))
    dev_x[l] = np.mean(dev_x[l],axis=0)
dev_l_final = []
dlabel_l_final = []
for k in dev_x.keys():
    dev_l_final.append(dev_x[k])
    dlabel_l_final.append(ori_dev.loc[k]['label'])

df_dev = pd.DataFrame({'emb': dev_l_final, 'label': dlabel_l_final})
df_dev.head(10)

In [95]:
test_outputs= None
net.load_state_dict(torch.load(weight_file))
with torch.no_grad():
    for seq, attn_masks,_ in test_loader:
        seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
        logits,output = net(seq, attn_masks)
        pooled_output = output.pooler_output   
        if test_outputs is None:
            test_outputs = pooled_output.detach().cpu().numpy()
        else:
            test_outputs = np.append(test_outputs, pooled_output.detach().cpu().numpy(), axis=0)

In [108]:
test_x = {}
for l, emb in zip(test['index'], test_outputs):
    if l in test_x.keys():
        test_x[l]  =np.vstack([test_x[l], emb])
    else:
        test_x[l] = [emb]

In [112]:
for l in test_x.keys():
    # print(len(train_x[l]))
    test_x[l] = np.mean(test_x[l],axis=0)

test_l_final = []
for k in test_x.keys():
    test_l_final.append(test_x[k])

In [123]:
from sklearn.svm import LinearSVC
SVC = LinearSVC(max_iter=10000)
SVC.fit(df_train['emb'].tolist(),df_train['label'].tolist())
score = SVC.score(df_dev['emb'].tolist(),df_dev['label'].tolist())

print("Accuracy:", score)

Accuracy: 0.9260504201680673




In [127]:
svc_predict = SVC.predict(test_l_final)    
df = pd.DataFrame({"Id": range(len(svc_predict)),"Predicted": svc_predict}) 
df.to_csv('bert_split_svc.csv',index=False)