In [1]:
import tweepy
import configparser
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
import nltk
import string
import collections
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction import DictVectorizer

In [5]:
# read data 
train_data_file = open('train.data.txt', 'r')
train_lines = train_data_file.readlines()
train_events =[]
# Strips the newline character
for line in train_lines:
    train_events.append(list(map(int,line.strip('\n').split(','))))
    
train_label_file = open('train.label.txt', 'r') 
train_labels = train_label_file.readlines()
train_labels = [label.strip('\n') for label in train_labels]


dev_data_file = open('dev.data.txt', 'r')
dev_lines = dev_data_file.readlines()
dev_events =[]
# Strips the newline character
for line in dev_lines:
    dev_events.append(list(map(int,line.strip('\n').split(','))))
    
dev_label_file = open('dev.label.txt', 'r')
dev_labels = dev_label_file.readlines()
dev_labels = [label.strip('\n') for label in dev_labels]

In [None]:
# config to access tweeter API
config = configparser.ConfigParser()
config.read('config.ini')

consumer_key = config['twitter']['consumer_key']
consumer_secret = config['twitter']['consumer_secret']

access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

In [None]:
# authentication
client = tweepy.Client(consumer_key=consumer_key, consumer_secret=consumer_secret,
                                   access_token=access_token, access_token_secret=access_token_secret,wait_on_rate_limit=True)

In [None]:
# get_tweets only return 100 results, handle the case when there is more than 100
def lookup_tweets(tweet_IDs, client):
    full_tweets = []
    tweet_count = len(tweet_IDs)
    for i in range(int((tweet_count / 100) + 1)):
        # Catch the last group if it is less than 100 tweets
        end_loc = min((i + 1) * 100, tweet_count)
        if tweet_IDs[i * 100:end_loc]:
            tweets = client.get_tweets(tweet_IDs[i * 100:end_loc],user_auth=True).data
            if tweets:
                full_tweets.extend(tweets)
    return full_tweets
    

In [None]:
# get the text of all events
train_events_text=[]
for event in train_events:
    results = lookup_tweets(event, client)
    train_event_text=[tweet.text for tweet in results]
    train_events_text.append(train_event_text)

KeyboardInterrupt: 

In [None]:
# save data to pickle file
f = open(f'./tweet_text.pckl','wb')
pickle.dump(train_events_text,f)
f.close()

In [None]:
# get the text of all events
dev_events_text=[]
for event in dev_events:
    results = lookup_tweets(event, client)
    dev_event_text=[tweet.text for tweet in results]
    dev_events_text.append(dev_event_text)

In [None]:
# save data to pickle file
f = open(f'./dev_tweet_text.pckl','wb')
pickle.dump(dev_events_text,f)
f.close()

In [None]:
# open train text file
f = open(f'/tweet_text.pckl','rb')
train_data = pickle.load(f)
f.close()


# open dev text file
f = open(f'/dev_tweet_text.pckl','rb')
dev_data = pickle.load(f)
f.close()

In [None]:
def clean_text(text):
    text = re.sub(r'@[A-Za-z0-9]+','',text) #remove @mention
    text = re.sub(r'#','',text) # remove the hashtag symbol
    text = re.sub(r'https?:\/\/\S+', '',text) #remove hyperlink
    text = re.sub(r'\n','',text) # remove \n 
    text = re.sub(r'\r','',text) # remove \r
    text = re.sub(r'[0-9]+','',text) #remove all the number
    return text

for i in range(len(train_data)):
    for j in range(len(train_data[i])):
        train_data[i][j] = clean_text(train_data[i][j])
        
for i in range(len(dev_data)):
    for j in range(len(dev_data[i])):
        dev_data[i][j] = clean_text(dev_data[i][j])

In [None]:
train_data[0]

['. Can regularly rinsing your nose with saline help prevent infection with the new coronavirus? ',
 '. Can eating garlic help prevent infection with the new coronavirus? COVIDMalaysia ',
 '. Do vaccines against pneumonia protect you against the new coronavirus? ',
 '. Can spraying alcohol or chlorine all over your body kill the new coronavirus? Chamber ',
 '. How effective are thermal scanners in detecting people infected with the new coronavirus? ',
 '. Can an ultraviolet disinfection lamp kill the new coronavirus? ',
 '. Are hand dryers effective in killing the new coronavirus? ',
 '. The new coronavirus CANNOT be transmitted through mosquito bites. ',
 '. Taking a hot bath does not prevent the new coronavirus disease ',
 '. Cold weather and snow CANNOT kill the new coronavirus. ',
 '. COVID- virus can be transmitted in areas with hot and humid climates ',
 '. Drinking alcohol does not protect you against COVID- and can be dangerous ',
 '. Being able to hold your breath for  seconds

In [None]:
# merge source tweeet and reply tweet together for train data
train_merge_events=[]
for event in train_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    train_merge_events.append(merge)
    
    
# merge source tweeet and reply tweet together for dev data
dev_merge_events=[]
for event in dev_data:
    merge = ''
    for tweet in event:
        merge = merge + tweet
    dev_merge_events.append(merge)

In [None]:
def tokenize_tweet(tweet):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer()
    # combine stop words and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stop = stopwords + list(string.punctuation)
    # filter out stop words and punctuation and send to lower case
    tokens = [token.lower() for token in twt.tokenize(tweet)
              if token.lower() not in stop]
    tokens = [word for word in tokens if re.search('[a-zA-Z]',word) is not None] # filter out word not contain alphabet
    return(tokens)

In [None]:
def tokenize_tweetv2(tweet):
    """Get all of the tokens in a set of tweets"""
    twt = nltk.tokenize.TweetTokenizer()
    # combine stop words and punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stop = stopwords + list(string.punctuation)
    # create the stemmer
    stemmer = nltk.stem.porter.PorterStemmer()
    # filter out stop words and punctuation and send to lower case
    tokens = [ stemmer.stem(token) for token in twt.tokenize(tweet)
              if token.lower() not in stop]
    return(tokens)

### Normal bag of word

In [None]:
# Create bag of word 
def bow(data,labels):
    x = []
    y = []
    for i in range(len(data)):
        tokens = tokenize_tweet(data[i])
        
        vocab = collections.defaultdict(int)
        for word in tokens:
            vocab[word] += 1 
        x.append(vocab)
        y.append(labels[i])
    return x,y
    

In [None]:
x_train,y_train = bow(train_merge_events,train_labels)
x_dev,y_dev = bow(dev_merge_events,dev_labels)

In [None]:
vectorizer = DictVectorizer()
x_train = vectorizer.fit_transform(x_train)
x_dev = vectorizer.transform(x_dev)

In [None]:
# k fold to find the optimize hyperparameter
alphas = [0.001,0.005,0.01,0.1,0.3,0.5,1]
max_nb = 0
for alpha in alphas:
    nb = MultinomialNB(alpha=alpha)
    nb_predict = nb.fit(x_train, y_train).predict(x_dev)    
    nb_accuracy = accuracy_score(y_dev,nb_predict)
    print('With alpha = {alpha} the accuracy of Naive Bayes is {acc:.5f}'.format(alpha=alpha, acc = nb_accuracy))
    if nb_accuracy > max_nb:
        max_nb = nb_accuracy
        max_alpha = alpha
print("The best setting for Naive Bayes is alpha = {alpha} with accuracy = {acc:.5f}".format(alpha=max_alpha,acc=max_nb))


With alpha = 0.001 the accuracy of Naive Bayes is 0.90032
With alpha = 0.005 the accuracy of Naive Bayes is 0.89715
With alpha = 0.01 the accuracy of Naive Bayes is 0.88924
With alpha = 0.1 the accuracy of Naive Bayes is 0.88608
With alpha = 0.3 the accuracy of Naive Bayes is 0.87816
With alpha = 0.5 the accuracy of Naive Bayes is 0.88133
With alpha = 1 the accuracy of Naive Bayes is 0.88608
The best setting for Naive Bayes is alpha = 0.001 with accuracy = 0.90032


In [None]:
solvers = ['newton-cg', 'lbfgs', 'liblinear','sag','saga']
c_values = [ 100,10,1.0, 0.1, 0.01,0.001]
max_lr = 0
for solver in solvers:
    print('Using this solver ',solver )
    for c_value in c_values:
        lr = LogisticRegression(C=c_value, penalty='l2', solver=solver,max_iter=1000)
        lr_predict = lr.fit(x_train, y_train).predict(x_dev)    
        lr_accuracy = accuracy_score(y_dev,lr_predict)
        print('With C = {c} and solver  = {sol} the acciracy of Logistic Regression is {acc}'.format(c=c_value,sol=solver,acc= lr_accuracy))
        if lr_accuracy > max_lr:
            max_lr = lr_accuracy
            max_c_value = c_value
            max_solver = solver
print("The best setting for Logistic Regression is c = {c} and solver = {sol} with accuracy = {acc:.5f}".format(c=max_c_value,sol=max_solver,acc=max_lr))

Using this solver  newton-cg
With C = 100 and solver  = newton-cg the acciracy of Logistic Regression is 0.9113924050632911
With C = 10 and solver  = newton-cg the acciracy of Logistic Regression is 0.9145569620253164
With C = 1.0 and solver  = newton-cg the acciracy of Logistic Regression is 0.9129746835443038
With C = 0.1 and solver  = newton-cg the acciracy of Logistic Regression is 0.8876582278481012
With C = 0.01 and solver  = newton-cg the acciracy of Logistic Regression is 0.8433544303797469
With C = 0.001 and solver  = newton-cg the acciracy of Logistic Regression is 0.7958860759493671
Using this solver  lbfgs
With C = 100 and solver  = lbfgs the acciracy of Logistic Regression is 0.9113924050632911
With C = 10 and solver  = lbfgs the acciracy of Logistic Regression is 0.9145569620253164
With C = 1.0 and solver  = lbfgs the acciracy of Logistic Regression is 0.9129746835443038
With C = 0.1 and solver  = lbfgs the acciracy of Logistic Regression is 0.8876582278481012
With C = 0.



With C = 100 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 10 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 1.0 and solver  = sag the acciracy of Logistic Regression is 0.9272151898734177




With C = 0.1 and solver  = sag the acciracy of Logistic Regression is 0.9224683544303798
With C = 0.01 and solver  = sag the acciracy of Logistic Regression is 0.9145569620253164
With C = 0.001 and solver  = sag the acciracy of Logistic Regression is 0.8781645569620253
Using this solver  saga




With C = 100 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 10 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 1.0 and solver  = saga the acciracy of Logistic Regression is 0.9272151898734177




With C = 0.1 and solver  = saga the acciracy of Logistic Regression is 0.9240506329113924
With C = 0.01 and solver  = saga the acciracy of Logistic Regression is 0.9145569620253164
With C = 0.001 and solver  = saga the acciracy of Logistic Regression is 0.8718354430379747
The best setting for Logistic Regression is c = 100 and solver = sag with accuracy = 0.92722


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

clfs = [KNeighborsClassifier(),DecisionTreeClassifier(),RandomForestClassifier(),
        MultinomialNB(),LinearSVC(),LogisticRegression()]


In [None]:
from sklearn import model_selection
from sklearn.metrics import accuracy_score, classification_report

def do_multiple_10foldcrossvalidation(clfs,data,classifications):
    for clf in clfs:
        predictions = model_selection.cross_val_predict(clf, data,classifications, cv=10)
        print (clf)
        print ("accuracy")
        print (accuracy_score(classifications,predictions))
        print (classification_report(classifications,predictions))
        
do_multiple_10foldcrossvalidation(clfs,x_train,y_train)


KNeighborsClassifier()
accuracy
0.7915567282321899
              precision    recall  f1-score   support

   nonrumour       0.79      1.00      0.88      1475
      rumour       0.86      0.07      0.13       420

    accuracy                           0.79      1895
   macro avg       0.82      0.53      0.51      1895
weighted avg       0.81      0.79      0.72      1895

DecisionTreeClassifier()
accuracy
0.833245382585752
              precision    recall  f1-score   support

   nonrumour       0.88      0.91      0.89      1475
      rumour       0.64      0.56      0.60       420

    accuracy                           0.83      1895
   macro avg       0.76      0.73      0.75      1895
weighted avg       0.83      0.83      0.83      1895

RandomForestClassifier()
accuracy
0.8401055408970977
              precision    recall  f1-score   support

   nonrumour       0.83      0.99      0.91      1475
      rumour       0.93      0.30      0.45       420

    accuracy              

### Using td-idf

In [None]:
# need to write manually for better tokenize
td = TfidfVectorizer(stop_words='english')
x_train = td.fit_transform(train_merge_events)
x_dev = td.transform(dev_merge_events)

#from sklearn.feature_extraction.text import CountVectorizer
#vectorizer = CountVectorizer(stop_words='english')
#x_train  = vectorizer.fit_transform(train_merge_events)
#x_dev = vectorizer.transform(dev_merge_events)

In [None]:
# k fold to find the optimize hyperparameter
alphas = [0.001,0.005,0.01,0.1,0.3,0.5,1]
max_nb = 0
for alpha in alphas:
    nb = MultinomialNB(alpha=alpha)
    nb_predict = nb.fit(x_train, train_labels).predict(x_dev)    
    nb_accuracy = accuracy_score(dev_labels,nb_predict)
    print('With alpha = {alpha} the accuracy of Naive Bayes is {acc:.5f}'.format(alpha=alpha, acc = nb_accuracy))
    if nb_accuracy > max_nb:
        max_nb = nb_accuracy
        max_alpha = alpha
print("The best setting for Naive Bayes is alpha = {alpha} with accuracy = {acc:.5f}".format(alpha=max_alpha,acc=max_nb))

With alpha = 0.001 the accuracy of Naive Bayes is 0.91139
With alpha = 0.005 the accuracy of Naive Bayes is 0.91297
With alpha = 0.01 the accuracy of Naive Bayes is 0.90190
With alpha = 0.1 the accuracy of Naive Bayes is 0.92089
With alpha = 0.3 the accuracy of Naive Bayes is 0.88449
With alpha = 0.5 the accuracy of Naive Bayes is 0.85601
With alpha = 1 the accuracy of Naive Bayes is 0.80380
The best setting for Naive Bayes is alpha = 0.1 with accuracy = 0.92089


### BERT
Google colab

In [6]:
!pip install torch torchvision transformers

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 5.1 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 53.1 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 43.0 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 42.2 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 6.5 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml


In [7]:
from transformers import BertModel

bert_model = BertModel.from_pretrained('bert-base-uncased')

print("Done loading BERT model.")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done loading BERT model.


In [8]:
from transformers import BertTokenizer

#load BERT's WordPiece tokenisation model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

In [None]:
tokens = tokenizer.tokenize(event[0])
# tokens = ['[CLS]'] + tokens + ['[SEP]']
tokens = ['[CLS]'] + tokens

In [None]:
tokens=['CLS']
for tweet in event:
  t_tokens = tokenizer.tokenize(tweet)
  tokens = tokens + t_tokens + ['[SEP]']

In [9]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class SSTDataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']
        label = self.df.loc[index, 'label']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask, label

In [10]:
from torch.utils.data import DataLoader

#Creating instances of training and development set
#maxlen sets the maximum length a sentence can have
#any sentence longer than this length is truncated to the maxlen size
train_set = SSTDataset(filename = 'train.tsv', maxlen = 512)
dev_set = SSTDataset(filename = 'dev.tsv', maxlen = 512)

#Creating intsances of training and development dataloaders
train_loader = DataLoader(train_set,batch_size = 4, num_workers = 0)
dev_loader = DataLoader(dev_set, batch_size = 4, num_workers = 0)

print("Done preprocessing training and development data.")

Done preprocessing training and development data.


In [11]:
train_loader

<torch.utils.data.dataloader.DataLoader at 0x7facb4867710>

In [12]:
import torch
import torch.nn as nn
from transformers import BertModel

class SentimentClassifier(nn.Module):

    def __init__(self):
        super(SentimentClassifier, self).__init__()
        #Instantiating BERT model object 
        self.bert_layer = BertModel.from_pretrained('bert-base-uncased')
        
        #Classification layer
        #input dimension is 768 because [CLS] embedding has a dimension of 768
        #output dimension is 1 because we're working with a binary classification problem
        self.cls_layer = nn.Linear(768, 1)

    def forward(self, seq, attn_masks):
        '''
        Inputs:
            -seq : Tensor of shape [B, T] containing token ids of sequences
            -attn_masks : Tensor of shape [B, T] containing attention masks to be used to avoid contibution of PAD tokens
        '''

        #Feeding the input to BERT model to obtain contextualized representations
        outputs = self.bert_layer(seq, attention_mask = attn_masks, return_dict=True)
        cont_reps = outputs.last_hidden_state

        #Obtaining the representation of [CLS] head (the first token)
        cls_rep = cont_reps[:, 0]

        #Feeding cls_rep to the classifier layer
        logits = self.cls_layer(cls_rep)

        return logits

In [13]:
gpu = 0 #gpu ID

print("Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...")
net = SentimentClassifier()
net.cuda(gpu) #Enable gpu support for the model
print("Done creating the sentiment classifier.")

Creating the sentiment classifier, initialised with pretrained BERT-BASE parameters...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Done creating the sentiment classifier.


In [14]:
import torch.nn as nn
import torch.optim as optim

criterion = nn.BCEWithLogitsLoss()
opti = optim.Adam(net.parameters(), lr = 2e-5)

In [15]:
def get_accuracy_from_logits(logits, labels):
    probs = torch.sigmoid(logits.unsqueeze(-1))
    soft_probs = (probs > 0.5).long()
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc

def evaluate(net, criterion, dataloader, gpu):
    net.eval()

    mean_acc, mean_loss = 0, 0
    count = 0

    with torch.no_grad():
        for seq, attn_masks, labels in dataloader:
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)
            logits = net(seq, attn_masks)
            mean_loss += criterion(logits.squeeze(-1), labels.float()).item()
            mean_acc += get_accuracy_from_logits(logits, labels)
            count += 1

    return mean_acc / count, mean_loss / count

In [16]:
import time

def train(net, criterion, opti, train_loader, dev_loader, max_eps, gpu):

    best_acc = 0
    st = time.time()
    for ep in range(max_eps):
        
        net.train()
        for it, (seq, attn_masks, labels) in enumerate(train_loader):
            #Clear gradients
            opti.zero_grad()  
            #Converting these to cuda tensors
            seq, attn_masks, labels = seq.cuda(gpu), attn_masks.cuda(gpu), labels.cuda(gpu)

            #Obtaining the logits from the model
            logits = net(seq, attn_masks)

            #Computing loss
            loss = criterion(logits.squeeze(-1), labels.float())

            #Backpropagating the gradients
            loss.backward()

            #Optimization step
            opti.step()
              
            if it % 100 == 0:
                
                acc = get_accuracy_from_logits(logits, labels)
                print("Iteration {} of epoch {} complete. Loss: {}; Accuracy: {}; Time taken (s): {}".format(it, ep, loss.item(), acc, (time.time()-st)))
                st = time.time()

        
        dev_acc, dev_loss = evaluate(net, criterion, dev_loader, gpu)
        print("Epoch {} complete! Development Accuracy: {}; Development Loss: {}".format(ep, dev_acc, dev_loss))
        if dev_acc > best_acc:
            print("Best development accuracy improved from {} to {}, saving model...".format(best_acc, dev_acc))
            best_acc = dev_acc
            torch.save(net.state_dict(), 'sstcls_{}.dat'.format(ep))

In [17]:
num_epoch = 1

#fine-tune the model
train(net, criterion, opti, train_loader, dev_loader, num_epoch, gpu)

Iteration 0 of epoch 0 complete. Loss: 0.6499412059783936; Accuracy: 0.5; Time taken (s): 1.0964243412017822
Iteration 100 of epoch 0 complete. Loss: 0.4843035936355591; Accuracy: 0.75; Time taken (s): 101.29129362106323
Iteration 200 of epoch 0 complete. Loss: 0.1700078397989273; Accuracy: 1.0; Time taken (s): 101.6502194404602
Iteration 300 of epoch 0 complete. Loss: 0.35970693826675415; Accuracy: 0.75; Time taken (s): 101.63419151306152
Iteration 400 of epoch 0 complete. Loss: nan; Accuracy: 1.0; Time taken (s): 101.40207171440125
Epoch 0 complete! Development Accuracy: 0.7684563994407654; Development Loss: nan
Best development accuracy improved from 0 to 0.7684563994407654, saving model...


In [20]:
import torch
from torch.utils.data import Dataset
from transformers import BertTokenizer
import pandas as pd

class TestDataset(Dataset):

    def __init__(self, filename, maxlen):

        #Store the contents of the file in a pandas dataframe
        self.df = pd.read_csv(filename, delimiter = '\t')

        #Initialize the BERT tokenizer
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.maxlen = maxlen

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):

        #Selecting the sentence and label at the specified index in the data frame
        sentence = self.df.loc[index, 'sentence']

        #Preprocessing the text to be suitable for BERT
        tokens = self.tokenizer.tokenize(sentence) #Tokenize the sentence
        tokens = ['[CLS]'] + tokens + ['[SEP]'] #Insering the CLS and SEP token in the beginning and end of the sentence
        if len(tokens) < self.maxlen:
            tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))] #Padding sentences
        else:
            tokens = tokens[:self.maxlen-1] + ['[SEP]'] #Prunning the list to be of specified max length

        tokens_ids = self.tokenizer.convert_tokens_to_ids(tokens) #Obtaining the indices of the tokens in the BERT Vocabulary
        tokens_ids_tensor = torch.tensor(tokens_ids) #Converting the list to a pytorch tensor

        #Obtaining the attention mask i.e a tensor containing 1s for no padded tokens and 0s for padded ones
        attn_mask = (tokens_ids_tensor != 0).long()

        return tokens_ids_tensor, attn_mask

In [21]:
test_set = TestDataset(filename = 'test.tsv', maxlen = 512)

#Creating intsances of training and development dataloaders
test_loader = DataLoader(test_set,batch_size = 1, num_workers = 0)

In [39]:
def predict(net, test_loader):
    # load weight
    # net.load_state_dict(torch.load(weight_file))
    
    predictions = []
    
    # Predict process
    with torch.no_grad():
        for seq, attn_masks in test_loader:
            seq, attn_masks = seq.cuda(gpu), attn_masks.cuda(gpu)
            logits = net(seq, attn_masks)
            probs = torch.sigmoid(logits.unsqueeze(-1))
            soft_probs = (probs > 0.5).long()
            predictions.append(soft_probs.cpu().numpy().squeeze())
    return predictions

In [40]:
# weight_file = "sstcls_0.dat"
prediction = predict(net, test_loader)

In [41]:
df = pd.DataFrame({"Id": range(len(prediction)),"Predicted": prediction}) 
df.to_csv('bert_test.csv',index=False)

### LSTM
model the source tweet and replies as a sequence of tweets using recurrent networks