<a href="https://colab.research.google.com/github/meredithwan/GE2020/blob/master/bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>


# **Acknowledgements**

Adapted from Preston's codes hosted at: https://github.com/prestonlimlianjie/bert-sentiment-analysis-straits-times

A huge thank you to Preston!




In [None]:
import sys
!{sys.executable} -m pip install torch transformers pandas scikit-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
[K     |████████████████████████████████| 778kB 2.7MB/s 
Collecting tokenizers==0.8.1.rc1
[?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 14.3MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 33.5MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB

In [None]:

# Define utils functions

def pad_sents(sents, pad_token):
    """ Pad list of sentences according to the longest sentence in the batch.
    @param sents (list[list[int]]): list of sentences, where each sentence
                                    is represented as a list of words
    @param pad_token (int): padding token
    @returns sents_padded (list[list[int]]): list of sentences where sentences shorter
        than the max length sentence are padded out with the pad_token, such that
        each sentences in the batch now has equal length.
        Output shape: (batch_size, max_sentence_length)
    """
    sents_padded = []

    max_len = max(len(s) for s in sents)
    batch_size = len(sents)

    for s in sents:
        padded = [pad_token] * max_len
        padded[:len(s)] = s
        sents_padded.append(padded)

    return sents_padded

def sents_to_tensor(tokenizer, sents, device):
    """
    :param tokenizer: BertTokenizer
    :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences), reversely sorted
    :param device: torch.device
    :return: sents_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: masks_tensor: torch.Tensor, shape(batch_size, max_sent_length), reversely sorted
    :return: sents_lengths: torch.Tensor, shape(batch_size), reversely sorted
    """
    tokens_list = [tokenizer.tokenize(sent) for sent in sents]
    sents_lengths = [len(tokens) for tokens in tokens_list]
    # tokens_sents_zip = zip(tokens_list, sents_lengths)
    # tokens_sents_zip = sorted(tokens_sents_zip, key=lambda x: x[1], reverse=True)
    # tokens_list, sents_lengths = zip(*tokens_sents_zip)
    tokens_list_padded = pad_sents(tokens_list, '[PAD]')
    sents_lengths = torch.tensor(sents_lengths, device=device)

    masks = []
    for tokens in tokens_list_padded:
        mask = [0 if token=='[PAD]' else 1 for token in tokens]
        masks.append(mask)
    masks_tensor = torch.tensor(masks, dtype=torch.long, device=device)
    tokens_id_list = [tokenizer.convert_tokens_to_ids(tokens) for tokens in tokens_list_padded]
    sents_tensor = torch.tensor(tokens_id_list, dtype=torch.long, device=device)

    return sents_tensor, masks_tensor, sents_lengths

In [None]:

# use TensorFlow 1.x
%tensorflow_version 1.x
from transformers import BertForSequenceClassification, BertTokenizer, AdamW
import torch
from torch import nn
import torch.nn.functional as F

In [None]:
# Define the sentiment classification model

class SentimentClassifierModel(nn.Module):

    def __init__(self, bert_config, device, n_class):
        """
        :param bert_config: str, BERT configuration description
        :param device: torch.device
        :param n_class: int
        """

        super(SentimentClassifierModel, self).__init__()

        self.n_class = n_class
        self.bert_config = bert_config
        self.bert = BertForSequenceClassification.from_pretrained(self.bert_config, num_labels=self.n_class)
        self.tokenizer = BertTokenizer.from_pretrained(self.bert_config)
        self.device = device

    def forward(self, sents):
        """
        :param sents: list[str], list of sentences (NOTE: untokenized, continuous sentences)
        :return: pre_softmax, torch.tensor of shape (batch_size, n_class)
        """

        sents_tensor, masks_tensor, sents_lengths = sents_to_tensor(self.tokenizer, sents, self.device)
        pre_softmax = self.bert(input_ids=sents_tensor, attention_mask=masks_tensor)

        return pre_softmax

    @staticmethod
    def load(model_path: str, device):
        """ Load the model from a file.
        @param model_path (str): path to model
        @return model (nn.Module): model with saved parameters
        """
        params = torch.load(model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = SentimentClassifierModel(device=device, **args)
        model.load_state_dict(params['state_dict'])

        return model

    def save(self, path: str):
        """ Save the model to a file.
        @param path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)

        params = {
            'args': dict(bert_config=self.bert_config, n_class=self.n_class),
            'state_dict': self.state_dict()
        }

        torch.save(params, path)

In [None]:
import pandas

pwd = '/content/gdrive'

from google.colab import drive
drive.mount(pwd)

# only keep necessary columns
df= pandas.read_csv("/content/gdrive/My Drive/Colab Notebooks/Tweets.csv", index_col=0, usecols=['tweet_id','airline_sentiment', 'text'])
df.head()

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/gdrive


Unnamed: 0_level_0,airline_sentiment,text
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1
570306133677760513,neutral,@VirginAmerica What @dhepburn said.
570301130888122368,positive,@VirginAmerica plus you've added commercials t...
570301083672813571,neutral,@VirginAmerica I didn't today... Must mean I n...
570301031407624196,negative,@VirginAmerica it's really aggressive to blast...
570300817074462722,negative,@VirginAmerica and it's a really big bad thing...


In [None]:
# cleaning tweets
# Remove URL, RT, mention(@)

df.text = df.text.str.replace(r'http(\S)+', r'')
df.text = df.text.str.replace(r'http ...', r'')
df.text = df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
df.text = df.text.str.replace(r'@[\S]+',r'')

# Remove non-ascii words or characters
df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in df.text]
df.text = df.text.str.replace(r'_[\S]?',r'')

# Remove extra space
df.text = df.text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
df.text = df.text.str.replace(r'&amp;?',r'and')
df.text = df.text.str.replace(r'&lt;',r'<')
df.text = df.text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
df.text = df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
df.text = df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
df.text = df.text.str.lower()
df.text = df.text.str.strip()

In [None]:
df['text_length'] = [len(text.split(' ')) for text in df.text]
print(df.shape)

(14640, 3)


In [None]:
# Drop texts with length <=3 and drop duplicates
df = df[df['text_length']>3]
df = df.drop_duplicates(subset=['text'])

print(df.shape)

(13977, 3)


In [None]:
df.airline_sentiment.value_counts()

negative    8998
neutral     2834
positive    2145
Name: airline_sentiment, dtype: int64

In [None]:
# process text to BERT format
df['BERT_processed_text'] = '[CLS] '+df.text
df.BERT_processed_text

tweet_id
570306133677760513                                   [CLS] what  said .
570301130888122368    [CLS] plus you ' ve added commercials to the e...
570301083672813571    [CLS] i didn ' t today ... must mean i need to...
570301031407624196    [CLS] it ' s really aggressive to blast obnoxi...
570300817074462722     [CLS] and it ' s a really big bad thing about it
                                            ...                        
569587686496825344    [CLS] thank you we got on a different flight t...
569587371693355008    [CLS] leaving over 20 minutes late flight . no...
569587242672398336    [CLS] please bring american airlines to # blac...
569587188687634433    [CLS] you have my money , you change my flight...
569587140490866689    [CLS] we have 8 ppl so we need 2 know how many...
Name: BERT_processed_text, Length: 13977, dtype: object

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in df.text]

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [None]:
df.BERT_processed_text_length

tweet_id
570306133677760513     3
570301130888122368    15
570301083672813571    17
570301031407624196    25
570300817074462722    11
                      ..
569587686496825344    11
569587371693355008    27
569587242672398336     8
569587188687634433    29
569587140490866689    34
Name: BERT_processed_text_length, Length: 13977, dtype: int64

In [None]:
label_dict = dict()
for i, l in enumerate(list(df.airline_sentiment.value_counts().keys())):
    label_dict.update({l: i})

df['airline_sentiment_label'] = [label_dict[label] for label in df.airline_sentiment]

In [None]:
df.airline_sentiment_label

tweet_id
570306133677760513    1
570301130888122368    2
570301083672813571    1
570301031407624196    0
570300817074462722    0
                     ..
569587686496825344    2
569587371693355008    0
569587242672398336    1
569587188687634433    0
569587140490866689    1
Name: airline_sentiment_label, Length: 13977, dtype: int64

In [None]:
!ls /content/gdrive/My\ Drive/Colab\ Notebooks
df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_processed_twitter_airline_sentiment.csv')

bert.ipynb  Tweets.csv


In [None]:
# train
from sklearn.model_selection import train_test_split

In [None]:
df= pandas.read_csv("/content/gdrive/My Drive/Colab Notebooks/bert_processed_twitter_airline_sentiment.csv")
df.head()

Unnamed: 0,tweet_id,airline_sentiment,text,text_length,BERT_processed_text,BERT_processed_text_length,airline_sentiment_label
0,570306133677760513,neutral,what said .,4,[CLS] what said .,3,1
1,570301130888122368,positive,plus you ' ve added commercials to the experie...,12,[CLS] plus you ' ve added commercials to the e...,15,2
2,570301083672813571,neutral,i didn ' t today ... must mean i need to take ...,15,[CLS] i didn ' t today ... must mean i need to...,17,1
3,570301031407624196,negative,"it ' s really aggressive to blast obnoxious "" ...",21,[CLS] it ' s really aggressive to blast obnoxi...,25,0
4,570300817074462722,negative,and it ' s a really big bad thing about it,11,[CLS] and it ' s a really big bad thing about it,11,0


In [None]:
# Define training params
label_names = ['positive', 'negative', 'neutral']
model_name = 'ge-sentiment'
device = torch.device("cuda:0")
bert_size = 'bert-base-uncased'

train_batch_size = 32 # batch size
clip_grad = 1.0 # gradient clipping
log_every = 10 # number of mini-batches before logging
max_epoch = 100 # max number of epochs
max_patience = 3 # number of iterations to wait before decaying learning rate
max_num_trial = 3 # number of trials before terminating training
lr_decay = 0.5 # learning rate decay
lr_bert = 0.00002 # BERT learning rate
lr = 0.001 # learning rate
valid_niter = 500 # perform validation after n iterations
dropout = 0.3 # dropout rate
verbose = True

prefix = model_name + '_' + bert_size
model_save_path = pwd + '/My Drive/Colab Notebooks/' + prefix+'_model.bin'

In [None]:
# Split up data into train and validation, where validation is 20% of the dataset
training_data,validation_data = train_test_split(df,test_size=0.2,random_state=42)
print(len(df), len(training_data), len(validation_data))

13977 11181 2796


In [None]:
print(training_data)

                   airline_sentiment  ... airline_sentiment_label
tweet_id                              ...                        
569593278636675072          negative  ...                       0
568621033273602048          positive  ...                       2
569786809028255744          negative  ...                       0
569673900805783552          negative  ...                       0
568809510644527104          negative  ...                       0
...                              ...  ...                     ...
569162467051474944          negative  ...                       0
569671368788172800          negative  ...                       0
568885499986874369           neutral  ...                       1
570027321178099712           neutral  ...                       1
569530159247826944          positive  ...                       2

[11181 rows x 6 columns]


In [None]:
# to avoid error in next step
# https://stackoverflow.com/questions/55368921/in-colab-cuda-cannot-be-used-for-the-torch
torch.cuda.get_device_name(0)

'Tesla K80'

In [None]:
import pprint
pp = pprint.PrettyPrinter(indent=4)

train_label = dict(training_data.airline_sentiment_label.value_counts())
label_max = float(max(train_label.values()))
train_label_weight = torch.tensor([label_max/train_label[i] for i in range(len(train_label))], device=device)

pp.pprint(train_label_weight)

tensor([1.0000, 3.2735, 4.2780], device='cuda:0', dtype=torch.float64)


In [None]:
# Set up model and optimizer
import time
start_time = time.time()

model = SentimentClassifierModel(bert_size, device, len(label_names))
optimizer = AdamW([
        {'params': model.bert.bert.parameters()},
        {'params': model.bert.classifier.parameters(), 'lr': float(lr)}
    ], lr=float(lr_bert))

model = model.to(device)
print('Use device: %s' % device, file=sys.stderr)
print('Done! time elapsed %.2f sec' % (time.time() - start_time), file=sys.stderr)
print('-' * 80, file=sys.stderr)


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [None]:
# Util functions for training
import math
import logging
import pickle
import numpy as np
import torch
import pandas as pd
import sys
from docopt import docopt
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
    f1_score, precision_score, recall_score, roc_auc_score

import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

def batch_iter(data, batch_size, shuffle=False, bert=None):
    """ Yield batches of sentences and labels reverse sorted by length (largest to smallest).
    @param data (dataframe): dataframe with ProcessedText (str) and label (int) columns
    @param batch_size (int): batch size
    @param shuffle (boolean): whether to randomly shuffle the dataset
    @param bert (str): whether for BERT training. Values: "large", "base", None
    """
    batch_num = math.ceil(data.shape[0] / batch_size)
    index_array = list(range(data.shape[0]))

    if shuffle:
        data = data.sample(frac=1)

    for i in range(batch_num):
        indices = index_array[i * batch_size: (i + 1) * batch_size]
        examples = data.iloc[indices].sort_values(by='BERT_processed_text_length', ascending=False)
        sents = list(examples.BERT_processed_text)

        targets = list(examples.airline_sentiment_label.values)
        yield sents, targets  # list[list[str]] if not bert else list[str], list[int]
        
def validation(model, df_val, bert_size, loss_func, device):
    """ validation of model during training.
    @param model (nn.Module): the model being trained
    @param df_val (dataframe): validation dataset
    @param bert_size (str): large or base
    @param loss_func(nn.Module): loss function
    @param device (torch.device)
    @return avg loss value across validation dataset
    """
    was_training = model.training
    model.eval()

    df_val = df_val.sort_values(by='BERT_processed_text_length', ascending=False)

    ProcessedText_BERT = list(df_val.BERT_processed_text)
    InformationType_label = list(df_val.airline_sentiment_label)

    val_batch_size = 32

    n_batch = int(np.ceil(df_val.shape[0]/val_batch_size))

    total_loss = 0.

    with torch.no_grad():
        for i in range(n_batch):
            sents = ProcessedText_BERT[i*val_batch_size: (i+1)*val_batch_size]
            targets = torch.tensor(InformationType_label[i*val_batch_size: (i+1)*val_batch_size],
                                   dtype=torch.long, device=device)
            batch_size = len(sents)
            pre_softmax = model(sents)[0]
            batch_loss = loss_func(pre_softmax, targets)
            total_loss += batch_loss.item()*batch_size

    if was_training:
        model.train()

    return total_loss/df_val.shape[0]

def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
# Train

model.train()
cn_loss = torch.nn.CrossEntropyLoss(weight=train_label_weight.float(), reduction='mean')
torch.save(cn_loss, 'loss_func')  # for later testing

# Initialize training variables
num_trial = 0
train_iter = 0
patience = 0
cum_loss = 0
report_loss = 0
cum_examples = report_examples = epoch = 0
hist_valid_scores = []

In [None]:
! ls

gdrive	loss_func  sample_data


In [None]:
import time

train_time = begin_time = time.time()
print('Begin Maximum Likelihood training...')

# Training loop
while True:
    epoch += 1
    for sents, targets in batch_iter(training_data, batch_size=train_batch_size, shuffle=True, bert='base'):  # for each epoch
        train_iter += 1
        optimizer.zero_grad()
        batch_size = len(sents)
        pre_softmax = model(sents)[0]

        # Calculate loss and gradient function
        loss = cn_loss(pre_softmax, torch.tensor(targets, dtype=torch.long, device=device))
        loss.backward()

        # Next step
        optimizer.step()

        batch_losses_val = loss.item() * batch_size
        report_loss += batch_losses_val
        cum_loss += batch_losses_val

        report_examples += batch_size
        cum_examples += batch_size

        if train_iter % log_every == 0:
            print('epoch %d, iter %d, avg. loss %.2f, '
                  'cum. examples %d, speed %.2f examples/sec, '
                  'time elapsed %.2f sec' % (epoch, train_iter,
                     report_loss / report_examples,
                     cum_examples,
                     report_examples / (time.time() - train_time),
                     time.time() - begin_time), file=sys.stderr)

            train_time = time.time()
            report_loss = report_examples = 0.

        # perform validation
        if train_iter % valid_niter == 0:
            print('epoch %d, iter %d, cum. loss %.2f, cum. examples %d' % (epoch, train_iter,
                 cum_loss / cum_examples,
                 cum_examples), file=sys.stderr)

            cum_loss = cum_examples = 0.

            print('begin validation ...', file=sys.stderr)

            validation_loss = validation(model, validation_data, bert_size, cn_loss, device)   # dev batch size can be a bit larger

            print('validation: iter %d, loss %f' % (train_iter, validation_loss), file=sys.stderr)

            is_better = len(hist_valid_scores) == 0 or validation_loss < min(hist_valid_scores)
            hist_valid_scores.append(validation_loss)

            if is_better:
                patience = 0
                print('save currently the best model to [%s]' % model_save_path, file=sys.stderr)

                model.save(model_save_path)

                # also save the optimizers' state
                torch.save(optimizer.state_dict(), model_save_path + '.optim')
            elif patience < int(max_patience):
                patience += 1
                print('hit patience %d' % patience, file=sys.stderr)

                if patience == int(max_patience):
                    num_trial += 1
                    print('hit #%d trial' % num_trial, file=sys.stderr)
                    if num_trial == max_num_trial:
                        print('early stop!', file=sys.stderr)
                        exit(0)

                    # decay lr, and restore from previously best checkpoint
                    print('load previously best model and decay learning rate to %f%%' %
                          (float(lr_decay)*100), file=sys.stderr)

                    # load model
                    params = torch.load(model_save_path, map_location=lambda storage, loc: storage)
                    model.load_state_dict(params['state_dict'])
                    model = model.to(device)

                    print('restore parameters of the optimizers', file=sys.stderr)
                    optimizer.load_state_dict(torch.load(model_save_path + '.optim'))

                    # set new lr
                    for param_group in optimizer.param_groups:
                        param_group['lr'] *= float(lr_decay)

                    # reset patience
                    patience = 0

            if epoch == int(max_epoch):
                print('reached maximum number of epochs!', file=sys.stderr)
                exit(0)

Begin Maximum Likelihood training...


KeyboardInterrupt: ignored

In [None]:
import numpy as np
import pickle
from sklearn.metrics import accuracy_score, matthews_corrcoef, confusion_matrix, \
f1_score, precision_score, recall_score, roc_auc_score
import matplotlib
matplotlib.use('agg')
from matplotlib import pyplot as plt

In [None]:
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, path='cm', cmap=plt.cm.Reds):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
    pickle.dump(cm, open(path, 'wb'))

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")
    fig.tight_layout()
    return ax

In [None]:
print('load best model...')

model = SentimentClassifierModel.load('/content/gdrive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

model.to(device)

model.eval()

df_test = validation_data

df_test = df_test.sort_values(by='BERT_processed_text_length', ascending=False)

test_batch_size = 32

n_batch = int(np.ceil(df_test.shape[0]/test_batch_size))

cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

ProcessedText_BERT = list(df_test.BERT_processed_text)
InformationType_label = list(df_test.airline_sentiment_label)

test_loss = 0.
prediction = []
prob = []

softmax = torch.nn.Softmax(dim=1)

with torch.no_grad():
    for i in range(n_batch):
        sents = ProcessedText_BERT[i*test_batch_size: (i+1)*test_batch_size]
        targets = torch.tensor(InformationType_label[i * test_batch_size: (i + 1) * test_batch_size],
                                   dtype=torch.long, device=device)
        batch_size = len(sents)

        pre_softmax = model(sents)[0]
        batch_loss = cn_loss(pre_softmax, targets)
        test_loss += batch_loss.item()*batch_size
        prob_batch = softmax(pre_softmax)
        prob.append(prob_batch)

        prediction.extend([t.item() for t in list(torch.argmax(prob_batch, dim=1))])

prob = torch.cat(tuple(prob), dim=0)
loss = test_loss/df_test.shape[0]

pickle.dump([label_names[i] for i in prediction], open(prefix+'_test_prediction', 'wb'))
pickle.dump(prob.data.cpu().numpy(), open(prefix + '_test_prediction_prob', 'wb'))

accuracy = accuracy_score(df_test.airline_sentiment_label.values, prediction)
matthews = matthews_corrcoef(df_test.airline_sentiment_label.values, prediction)

precisions = {}
recalls = {}
f1s = {}
aucrocs = {}

for i in range(len(label_names)):
    prediction_ = [1 if pred == i else 0 for pred in prediction]
    true_ = [1 if label == i else 0 for label in df_test.airline_sentiment_label.values]
    f1s.update({label_names[i]: f1_score(true_, prediction_)})
    precisions.update({label_names[i]: precision_score(true_, prediction_)})
    recalls.update({label_names[i]: recall_score(true_, prediction_)})
    aucrocs.update({label_names[i]: roc_auc_score(true_, list(t.item() for t in prob[:, i]))})

metrics_dict = {'loss': loss, 'accuracy': accuracy, 'matthews coef': matthews, 'precision': precisions,
                         'recall': recalls, 'f1': f1s, 'aucroc': aucrocs}

pickle.dump(metrics_dict, open(prefix+'_evaluation_metrics', 'wb'))

cm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=False,
                          path=prefix+'_test_confusion_matrix', title='confusion matrix for test dataset')

plt.savefig(prefix+'_test_confusion_matrix', format='png')
cm_norm = plot_confusion_matrix(list(df_test.airline_sentiment_label.values), prediction, label_names, normalize=True,
                          path=prefix+'_test normalized_confusion_matrix', title='normalized confusion matrix for test dataset')
plt.savefig(prefix+'_test_normalized_confusion_matrix', format='png')

print('loss: %.2f' % loss)
print('accuracy: %.2f' % accuracy)
print('matthews coef: %.2f' % matthews)
print('-' * 80)

for i in range(len(label_names)):
    print('precision score for %s: %.2f' % (label_names[i], precisions[label_names[i]]))
    print('recall score for %s: %.2f' % (label_names[i], recalls[label_names[i]]))
    print('f1 score for %s: %.2f' % (label_names[i], f1s[label_names[i]]))
    print('auc roc score for %s: %.2f' % (label_names[i], aucrocs[label_names[i]]))
    print('-' * 80)

load best model...


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

loss: 0.56
accuracy: 0.82
matthews coef: 0.68
--------------------------------------------------------------------------------
precision score for positive: 0.88
recall score for positive: 0.89
f1 score for positive: 0.89
auc roc score for positive: 0.93
--------------------------------------------------------------------------------
precision score for negative: 0.70
recall score for negative: 0.60
f1 score for negative: 0.65
auc roc score for negative: 0.90
--------------------------------------------------------------------------------
precision score for neutral: 0.75
recall score for neutral: 0.85
f1 score for neutral: 0.80
auc roc score for neutral: 0.97
--------------------------------------------------------------------------------


In [None]:
import pandas
ge_df = pandas.read_csv("/content/gdrive/My Drive/Colab Notebooks/ge_tweets.csv", encoding='latin-1', usecols=['username','tweetcreatedts', 'text'])
ge_df.head()

Unnamed: 0,username,tweetcreatedts,text
0,JaneKoe,2020-07-08 23:58:08,Election vs re-election: Does WP have incumben...
1,aligoatie,2020-07-08 23:58:05,If sheâs not worthy of being considered as a...
2,foolluvmaknae,2020-07-08 23:38:18,"2011: ""Beware of 'rojak govt' if PAP loses pow..."
3,pauriahcarey,2020-07-08 23:37:30,"It cannot be stressed enough how cliched, how ..."
4,hadasaurus,2020-07-08 23:32:48,Saw it pointed out on FB that an @asiaonecom a...


In [None]:
# Remove duplicates
ge_df = ge_df.drop_duplicates(subset=['text','username','tweetcreatedts'])
ge_df

# Remove URL, RT, mention(@), #
ge_df.text = ge_df.text.str.replace(r'http(\S)+', r'')
ge_df.text = ge_df.text.str.replace(r'http ...', r'')
ge_df.text = ge_df.text.str.replace(r'(RT|rt)[ ]*@[ ]*[\S]+',r'')
ge_df.text = ge_df.text.str.replace(r'@[\S]+',r'')
ge_df.text = ge_df.text.str.replace(r'#\w+\s*\w*', '')

# Remove non-ascii words or characters
ge_df.text = [''.join([i if ord(i) < 128 else '' for i in text]) for text in ge_df.text]
ge_df.text = ge_df.text.str.replace(r'_[\S]?',r'')

# Remove /n
ge_df['text'].replace(r'\s+|\\n', ' ', regex=True, inplace=True) 

# Remove extra space
ge_df.text = ge_df.text.str.replace(r'[ ]{2, }',r' ')

# Remove &, < and >
ge_df.text = ge_df.text.str.replace(r'&amp;?',r'and')
ge_df.text = ge_df.text.str.replace(r'&lt;',r'<')
ge_df.text = ge_df.text.str.replace(r'&gt;',r'>')

# Insert space between words and punctuation marks
ge_df.text = ge_df.text.str.replace(r'([\w\d]+)([^\w\d ]+)', r'\1 \2')
ge_df.text = ge_df.text.str.replace(r'([^\w\d ]+)([\w\d]+)', r'\1 \2')

# Lowercased and strip
ge_df.text = ge_df.text.str.lower()
ge_df.text = ge_df.text.str.strip()

ge_df['text_length'] = [len(text.split(' ')) for text in ge_df.text]
print(ge_df.shape)

(2504, 4)


In [None]:
ge_df.head()

Unnamed: 0,username,tweetcreatedts,text,text_length
0,JaneKoe,2020-07-08 23:58:08,election vs re - election : does wp have incum...,20
1,aligoatie,2020-07-08 23:58:05,if shes not worthy of being considered as an m...,26
2,foolluvmaknae,2020-07-08 23:38:18,"2011 : "" beware of ' rojak govt ' if pap loses...",65
3,pauriahcarey,2020-07-08 23:37:30,"it cannot be stressed enough how cliched , how...",39
4,hadasaurus,2020-07-08 23:32:48,saw it pointed out on fb that an article prais...,44


In [None]:
# process data to BERT format
ge_df['BERT_processed_text'] = '[CLS] '+ ge_df.text
ge_df.BERT_processed_text

0        [CLS] election vs re - election : does wp have...
1        [CLS] if shes not worthy of being considered a...
2        [CLS] 2011 : " beware of ' rojak govt ' if pap...
3        [CLS] it cannot be stressed enough how cliched...
4        [CLS] saw it pointed out on fb that an article...
                               ...                        
2499     [CLS] pap now wants to put raeesah khan throug...
7451     [CLS] pap now wants to put raeesah khan throug...
35407    [CLS] will the covid - 19 crisis saves the pap...
45424    [CLS] i ' ve been wrong my whole life . i ' ve...
46870    [CLS] : the workers ' party ' s candidate jamu...
Name: BERT_processed_text, Length: 2504, dtype: object

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
ge_df['BERT_processed_text_length'] = [len(tokenizer.tokenize(sent)) for sent in ge_df.text]
ge_df.BERT_processed_text_length

0        26
1        32
2        72
3        47
4        47
         ..
2499     58
7451     58
35407    25
45424    64
46870    40
Name: BERT_processed_text_length, Length: 2504, dtype: int64

In [None]:
ge_df

Unnamed: 0,username,tweetcreatedts,text,text_length,BERT_processed_text,BERT_processed_text_length
0,JaneKoe,2020-07-08 23:58:08,election vs re - election : does wp have incum...,20,[CLS] election vs re - election : does wp have...,26
1,aligoatie,2020-07-08 23:58:05,if shes not worthy of being considered as an m...,26,[CLS] if shes not worthy of being considered a...,32
2,foolluvmaknae,2020-07-08 23:38:18,"2011 : "" beware of ' rojak govt ' if pap loses...",65,"[CLS] 2011 : "" beware of ' rojak govt ' if pap...",72
3,pauriahcarey,2020-07-08 23:37:30,"it cannot be stressed enough how cliched , how...",39,[CLS] it cannot be stressed enough how cliched...,47
4,hadasaurus,2020-07-08 23:32:48,saw it pointed out on fb that an article prais...,44,[CLS] saw it pointed out on fb that an article...,47
...,...,...,...,...,...,...
2499,starsatlas,2020-07-06 10:40:10,pap now wants to put raeesah khan through a tr...,52,[CLS] pap now wants to put raeesah khan throug...,58
7451,utsubari_shie,2020-07-06 11:00:19,pap now wants to put raeesah khan through a tr...,52,[CLS] pap now wants to put raeesah khan throug...,58
35407,emmaturing,2020-07-08 08:14:34,will the covid - 19 crisis saves the pap yet a...,21,[CLS] will the covid - 19 crisis saves the pap...,25
45424,ShuchiinDawg,2020-07-08 07:51:21,i ' ve been wrong my whole life . i ' ve alway...,60,[CLS] i ' ve been wrong my whole life . i ' ve...,64


In [None]:
# save file
ge_df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_processed_ge_tweets.csv')

In [None]:
# Load model
model = SentimentClassifierModel.load('/content/gdrive/My Drive/Colab Notebooks/' + prefix + '_model.bin', device)

model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




SentimentClassifierModel(
  (bert): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768

In [None]:
ge_df= pandas.read_csv("/content/gdrive/My Drive/Colab Notebooks/bert_processed_ge_tweets.csv", index_col=0)
ge_df = ge_df.sort_values(by='BERT_processed_text_length', ascending=False)
ge_df = ge_df.reset_index()
ge_df

Unnamed: 0,index,username,tweetcreatedts,text,text_length,BERT_processed_text,BERT_processed_text_length
0,235,jstinnee,2020-07-08 12:54:41,"candidate jamus lim asked to deny the pap "" a ...",70,[CLS] candidate jamus lim asked to deny the pa...,85
1,1373,steelbiimu_,2020-07-07 05:53:31,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78
2,1392,acertainjolene,2020-07-07 05:24:40,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78
3,1391,bb1syri,2020-07-07 05:25:30,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78
4,1384,Sydelenasup,2020-07-07 05:35:20,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78
...,...,...,...,...,...,...,...
2499,1555,Hzq_Szmnd,2020-07-07 01:36:19,our 4g leaders,3,[CLS] our 4g leaders,4
2500,865,HIREMAIDEA,2020-07-07 16:23:16,our 4g leaders,3,[CLS] our 4g leaders,4
2501,1762,Pengkritique,2020-07-06 16:09:15,bow down !,3,[CLS] bow down !,3
2502,1429,Pengkritique,2020-07-07 04:19:13,bow down !,3,[CLS] bow down !,3


In [None]:
cn_loss = torch.load('loss_func', map_location=lambda storage, loc: storage).to(device)

In [None]:
ProcessedText_BERT = list(ge_df.BERT_processed_text)

In [None]:
ProcessedText_BERT

['[CLS] candidate jamus lim asked to deny the pap " a blank cheque ". this phrase isn \' t new , it has been used by sg \' s opposition politicians before . from left : 1 . jbj , wp , 1976 2 . lee siew choh , barisan sosialis , 1976 3 . chiam see tong , sdp , 1984 4 . wong hong toy , wp , 1984',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extravagant worst case ". recent pap bilge also downer . yet : i \' m awed by outpouring of clear - seeing , wide - ranging , humane voices in . reason to feel good abt sg pple if not our political leadership .',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extravagant worst case ". recent pap bilge also downer . yet : i \' m awed by outpouring of clear - seeing , wide - ranging , humane voices in . reason to feel good abt sg pple if not our political leadership .',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extrava

In [None]:
softmax = torch.nn.Softmax(dim=1)

In [None]:
labels = ['negative', 'neutral', 'positive']

In [None]:
sents = ProcessedText_BERT[:2]
sents

['[CLS] candidate jamus lim asked to deny the pap " a blank cheque ". this phrase isn \' t new , it has been used by sg \' s opposition politicians before . from left : 1 . jbj , wp , 1976 2 . lee siew choh , barisan sosialis , 1976 3 . chiam see tong , sdp , 1984 4 . wong hong toy , wp , 1984',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extravagant worst case ". recent pap bilge also downer . yet : i \' m awed by outpouring of clear - seeing , wide - ranging , humane voices in . reason to feel good abt sg pple if not our political leadership .']

In [None]:
len(sents)

2

In [None]:
pre_softmax = model(sents)[0]
pre_softmax

tensor([[ 1.7184,  0.4196, -2.4223],
        [ 0.4540, -0.7720,  0.5085]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
pre_softmax.shape

torch.Size([2, 3])

In [None]:
prob = softmax(pre_softmax)
prob

tensor([[0.7759, 0.2117, 0.0123],
        [0.4256, 0.1249, 0.4495]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [None]:
prob.shape

torch.Size([2, 3])

In [None]:
prob[0]

tensor([0.7759, 0.2117, 0.0123], device='cuda:0', grad_fn=<SelectBackward>)

In [None]:
# Find the highest value of the tensor
label_indexes = [t.item() for t in list(torch.argmax(prob, dim=1))]

In [None]:
prediction = labels[label_indexes[1]]
prediction

'positive'

In [None]:
predictions = []

# test_batch_size = 10

# n_batch = int(np.ceil(ge_df.shape[0]/test_batch_size))

with torch.no_grad():
  # for i in range(n_batch):
  sents = ProcessedText_BERT
  pre_softmax = model(sents)[0]
  prob = softmax(pre_softmax)
  predictions.extend([t.item() for t in list(torch.argmax(prob, dim=1))])

  print(predictions)

[0, 2, 0, 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [None]:
[labels[pred_val] for pred_val in predictions]

['negative',
 'positive',
 'negative',
 'negative',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'neutral',
 'negative',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'neutral',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'negative',
 'nega

In [None]:
ge_df['sentiments'] = [labels[pred_val] for pred_val in predictions]
ge_df.head()

Unnamed: 0,index,username,tweetcreatedts,text,text_length,BERT_processed_text,BERT_processed_text_length,sentiments
0,235,jstinnee,2020-07-08 12:54:41,"candidate jamus lim asked to deny the pap "" a ...",70,[CLS] candidate jamus lim asked to deny the pa...,85,negative
1,1373,steelbiimu_,2020-07-07 05:53:31,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78,positive
2,1392,acertainjolene,2020-07-07 05:24:40,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78,negative
3,1391,bb1syri,2020-07-07 05:25:30,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78,negative
4,1384,Sydelenasup,2020-07-07 05:35:20,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78,negative


In [None]:
print(ge_df.text[90])
print(ge_df.sentiments[90])

2011 : " beware of ' rojak govt ' if pap loses power , ( lim ) swee say warns " 2015 : " no guarantee pap will be in govt after polls : khaw boon wan " 2020 : " 3 biggest opposition parties could be ' replacement for the govt ', says chan chun sing " new election , old scare tactics .
negative


In [None]:
ge_df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_predicted_ge_tweets.csv')

In [None]:
ge_df.sentiments.value_counts()

negative    1950
neutral      398
positive     156
Name: sentiments, dtype: int64

In [None]:
# only unique tweets
ge_df= pandas.read_csv("/content/gdrive/My Drive/Colab Notebooks/bert_processed_ge_tweets.csv", index_col=0)
ge_df = ge_df.drop_duplicates(subset=['text'])
ge_df = ge_df.sort_values(by='BERT_processed_text_length', ascending=False)
ge_df = ge_df.reset_index()
ge_df.head()

Unnamed: 0,index,username,tweetcreatedts,text,text_length,BERT_processed_text,BERT_processed_text_length
0,235,jstinnee,2020-07-08 12:54:41,"candidate jamus lim asked to deny the pap "" a ...",70,[CLS] candidate jamus lim asked to deny the pa...,85
1,1373,steelbiimu_,2020-07-07 05:53:31,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78
2,470,Distant_Witness,2020-07-08 06:59:43,""" stakes are high for ( workers ' party )... b...",66,"[CLS] "" stakes are high for ( workers ' party ...",75
3,25,_ctmsrh,2020-07-08 17:40:42,"in 2011 , then - workers ' party chief low thi...",62,"[CLS] in 2011 , then - workers ' party chief l...",73
4,2,foolluvmaknae,2020-07-08 23:38:18,"2011 : "" beware of ' rojak govt ' if pap loses...",65,"[CLS] 2011 : "" beware of ' rojak govt ' if pap...",72


In [None]:
# number of unique tweets
print(len(ge_df))

406


In [None]:
ProcessedText_BERT = list(ge_df.BERT_processed_text)

In [None]:
ProcessedText_BERT

['[CLS] candidate jamus lim asked to deny the pap " a blank cheque ". this phrase isn \' t new , it has been used by sg \' s opposition politicians before . from left : 1 . jbj , wp , 1976 2 . lee siew choh , barisan sosialis , 1976 3 . chiam see tong , sdp , 1984 4 . wong hong toy , wp , 1984',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extravagant worst case ". recent pap bilge also downer . yet : i \' m awed by outpouring of clear - seeing , wide - ranging , humane voices in . reason to feel good abt sg pple if not our political leadership .',
 '[CLS] " stakes are high for ( workers \' party )... be seen as a referendum on pritam \' s leadership ." let \' s not forget that in east coast grc , while the stakes are not as high , the election will also be seen as a referendum on the pap \' s choice of heng swee heat as sg \' s 4th pm .',
 '[CLS] in 2011 , then - workers \' party chief low thia khiang boldly moved out of hougang to take on 

In [None]:
sents = ProcessedText_BERT[:2]
sents

['[CLS] candidate jamus lim asked to deny the pap " a blank cheque ". this phrase isn \' t new , it has been used by sg \' s opposition politicians before . from left : 1 . jbj , wp , 1976 2 . lee siew choh , barisan sosialis , 1976 3 . chiam see tong , sdp , 1984 4 . wong hong toy , wp , 1984',
 '[CLS] pessimist by nature . spouse and i say " reductio ad jolenum " and mean " jump to extravagant worst case ". recent pap bilge also downer . yet : i \' m awed by outpouring of clear - seeing , wide - ranging , humane voices in . reason to feel good abt sg pple if not our political leadership .']

In [None]:
pre_softmax = model(sents)[0]
pre_softmax

tensor([[ 1.4166,  0.3062, -2.2393],
        [-0.0504, -0.5529,  0.6674]], device='cuda:0', grad_fn=<AddmmBackward>)

In [None]:
pre_softmax.shape

torch.Size([2, 3])

In [None]:
prob = softmax(pre_softmax)
prob

tensor([[0.7379, 0.2431, 0.0191],
        [0.2736, 0.1655, 0.5609]], device='cuda:0', grad_fn=<SoftmaxBackward>)

In [None]:
prob.shape

torch.Size([2, 3])

In [None]:
prob[0]

tensor([0.7379, 0.2431, 0.0191], device='cuda:0', grad_fn=<SelectBackward>)

In [None]:
# Find the highest value of the tensor
label_indexes = [t.item() for t in list(torch.argmax(prob, dim=1))]

In [None]:
prediction = labels[label_indexes[1]]
prediction

'positive'

In [None]:
predictions = []

# test_batch_size = 10

# n_batch = int(np.ceil(ge_df.shape[0]/test_batch_size))

with torch.no_grad():
  # for i in range(n_batch):
  sents = ProcessedText_BERT
  pre_softmax = model(sents)[0]
  prob = softmax(pre_softmax)
  predictions.extend([t.item() for t in list(torch.argmax(prob, dim=1))])

  print(predictions)

[0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 2, 2, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 2, 0, 0, 0, 2, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 2, 1, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 2, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 2, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 2, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 1, 2, 0, 1, 0, 2, 1, 1, 0, 1, 0, 1, 0, 0, 2, 0, 

In [None]:
ge_df['sentiments'] = [labels[pred_val] for pred_val in predictions]
ge_df.head()

Unnamed: 0,index,username,tweetcreatedts,text,text_length,BERT_processed_text,BERT_processed_text_length,sentiments
0,235,jstinnee,2020-07-08 12:54:41,"candidate jamus lim asked to deny the pap "" a ...",70,[CLS] candidate jamus lim asked to deny the pa...,85,negative
1,1373,steelbiimu_,2020-07-07 05:53:31,"pessimist by nature . spouse and i say "" reduc...",62,"[CLS] pessimist by nature . spouse and i say ""...",78,negative
2,470,Distant_Witness,2020-07-08 06:59:43,""" stakes are high for ( workers ' party )... b...",66,"[CLS] "" stakes are high for ( workers ' party ...",75,neutral
3,25,_ctmsrh,2020-07-08 17:40:42,"in 2011 , then - workers ' party chief low thi...",62,"[CLS] in 2011 , then - workers ' party chief l...",73,neutral
4,2,foolluvmaknae,2020-07-08 23:38:18,"2011 : "" beware of ' rojak govt ' if pap loses...",65,"[CLS] 2011 : "" beware of ' rojak govt ' if pap...",72,negative


In [None]:
print(ge_df.text[135])
print(ge_df.sentiments[135])

saw it pointed out on fb that an article praising wp ' s performance in the debate last night had been edited to add that pap ' s was also very popular , and went to google it myself . wow , awkward .
negative


In [None]:
ge_df.to_csv(pwd + '/My Drive/Colab Notebooks/bert_predicted_ge_tweets_unique.csv')

In [None]:
ge_df.sentiments.value_counts()

negative    248
neutral     129
positive     29
Name: sentiments, dtype: int64

In [None]:
len(ge_df.sentiments)

406