# Setup

In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from collections import defaultdict
from textwrap import wrap

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report

import transformers
#from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import BertModel, AdamW, get_linear_schedule_with_warmup
from transformers import BertForSequenceClassification, BertTokenizer

import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Model setup

In [None]:
# Should match the model used for training:
PRE_TRAINED_MODEL_NAME = 'bert-base-cased'  # Should match the model used for training
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)
MAX_LEN = 160
class_names = ['negative', 'neutral', 'positive']
# class_names = ['neutral', 'negative', 'positive']


# Should be the path to the full dataset you want to apply the tuned model to:
data = pd.read_csv("../data/all_unlabeled_tweets.csv", index_col=0, header=0)

In [2]:
class SentimentClassifier(nn.Module):

  def __init__(self, n_classes):
    super(SentimentClassifier, self).__init__()
    self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict = False)
    self.drop = nn.Dropout(p=0.3)
    self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
  
  def forward(self, input_ids, attention_mask):
    _, pooled_output = self.bert(
      input_ids=input_ids,
      attention_mask=attention_mask
    )
    output = self.drop(pooled_output)
    return self.out(output)

# Use the tuned model

In [3]:
model = SentimentClassifier(len(class_names))
model.load_state_dict(torch.load('best_model_state.bin'))
model = model.to(device)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [4]:
def infer_model(review_texts):
    res = []
    for review_text in review_texts:
        encoded_review = tokenizer.encode_plus(
        review_text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=False,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
        )
        input_ids = encoded_review['input_ids'].to(device)
        attention_mask = encoded_review['attention_mask'].to(device)

        output = model(input_ids, attention_mask)
        _, prediction = torch.max(output, dim=1)

        res.append(class_names[prediction])
    return res


def infer_model2(review_text):
    encoded_review = tokenizer.encode_plus(
    review_text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
    return_attention_mask=True,
    return_tensors='pt',
    )
    input_ids = encoded_review['input_ids'].to(device)
    attention_mask = encoded_review['attention_mask'].to(device)

    output = model(input_ids, attention_mask)
    _, prediction = torch.max(output, dim=1)
    # print(output[0][0].item(), output[0][1].item(), output[0][2].item(), _, prediction)
    return class_names[prediction], output[0][0].item(), output[0][1].item(), output[0][2].item()




Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


neutral : Refugee is another name for terrorist
positive : we have an obligation to them; I'm glad we are doing something for these people.
negative : I don't care what happens to the translators.
neutral : 1000 refugees entered Iowa.


Test out the model

In [None]:
review_texts = ["Refugee is another name for terrorist","we have an obligation to them; I'm glad we are doing something for these people.", "I don't care what happens to the translators.", "1000 refugees entered Iowa."]
predictions = infer_model(review_texts)
for i, review_text in enumerate(review_texts):
    print(f'{predictions[i]} : {review_text}')

Now run it on the full dataset

In [5]:
data['sentiment'], data['negative'], data['neutral'], data['positive'] = zip(*data.ContentClean.apply(infer_model2))

data.to_csv("../dataBert/temp_full_pred.csv")
data.head(10)



Unnamed: 0,id_stable,Date,Content,ContentClean,Flag,sentiment,neutral,negative,positive
0,19167,2022-02-11 00:54:35+00:00,"What are we doing to secure our #Afghan SIVs, ...","What are we doing to secure our #Afghan SIVs, ...",no,negative,-2.569971,6.064625,-3.534876
1,174397,2021-08-14 00:51:43+00:00,Please help get this guidance out there: \nhel...,Please help get this guidance out there: help ...,no,neutral,6.12557,-2.939024,-2.730552
2,166369,2021-08-16 10:34:37+00:00,"A decade ago, We saw biggest humanitarian &amp...","A decade ago, We saw biggest humanitarian & re...",no,negative,-2.724625,5.823037,-3.144567
3,133951,2021-08-20 20:42:10+00:00,Chaired #UnitingChurch leaders meeting Friday ...,Chaired #UnitingChurch leaders meeting Friday ...,no,negative,-2.03128,6.152158,-3.699427
4,171885,2021-08-15 11:16:10+00:00,The tragedy unfolding in Afghanistan is terrif...,The tragedy unfolding in Afghanistan is terrif...,no,negative,-3.322519,6.888148,-3.385778
5,98984,2021-08-31 16:01:44+00:00,@votesamuelwill1 All Americans living in Afgha...,All Americans living in Afghanistan were told ...,no,negative,-3.039819,6.480907,-3.313448
6,189052,2021-06-30 19:03:47+00:00,@SenatorWicker Thank you so much Mr.wicker.\nP...,Thank you so much Mr.wicker. Please heard thos...,no,negative,-2.769158,6.282814,-2.65794
7,28950,2022-01-06 14:30:28+00:00,Afghan refugee who 'raped and murdered' 13-yea...,Afghan refugee who 'raped and murdered' 13-yea...,no,negative,-2.42265,6.342859,-3.491925
8,109872,2021-08-27 11:11:31+00:00,@JBowers56 @GamblerJam @iowahawkblog That's a ...,That's a lot of supposition without any eviden...,no,negative,-3.183186,6.004797,-3.425128
9,71603,2021-09-17 19:16:02+00:00,"@RepRosendale MT is welcoming to others, and w...","MT is welcoming to others, and willing to help...",no,positive,-3.390904,0.869125,2.092019


In [6]:
data.tail()

Unnamed: 0,id_stable,Date,Content,ContentClean,Flag,sentiment,neutral,negative,positive
200079,106134,2021-08-28 15:34:42+00:00,Did you miss our Operation Welcome Afghan Alli...,Did you miss our Operation Welcome Afghan Alli...,no,neutral,4.793899,-3.113419,-1.93489
200080,94022,2021-09-02 11:59:32+00:00,"£250,000 will be available through the Scottis...","£250,000 will be available through the Scottis...",no,positive,-2.725789,-3.391924,5.098685
200081,182463,2021-07-26 12:30:15+00:00,@Ho34980636 @AmrullahSaleh2 Pakistan is dying ...,Pakistan is dying state and Afghanistan is ris...,no,neutral,6.577663,-3.04852,-3.46853
200082,95209,2021-09-01 21:40:38+00:00,Pentagon chief says SIV program was not design...,Pentagon chief says SIV program was not design...,no,negative,0.948322,3.920422,-4.929651
200083,83631,2021-09-08 05:30:05+00:00,[ Stigmabase IE ] Northern Ireland Executive a...,[ Stigmabase IE ] Northern Ireland Executive a...,no,neutral,4.083866,-2.601464,-1.374184
