In [2]:
!pip install transformers datasets tweet-preprocessor

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/db/98c3ea1a78190dac41c0127a063abf92bd01b4b0b6970a6db1c2f5b66fa0/transformers-4.0.1-py3-none-any.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 7.6MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/1a/38/0c24dce24767386123d528d27109024220db0e7a04467b658d587695241a/datasets-1.1.3-py3-none-any.whl (153kB)
[K     |████████████████████████████████| 163kB 28.3MB/s 
[?25hCollecting tweet-preprocessor
  Downloading https://files.pythonhosted.org/packages/17/9d/71bd016a9edcef8860c607e531f30bd09b13103c7951ae73dd2bf174163c/tweet_preprocessor-0.6.0-py3-none-any.whl
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 24.2MB/s 
Collecting sacremose

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wordcloud
import preprocessor as p # tweet-preprocessor
import nltk
import re
import seaborn as sns
import torch

from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments, AdamW, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from datasets import Dataset
from sklearn.model_selection import train_test_split
from scipy.special import softmax
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from tqdm.notebook import tqdm

In [4]:
def auc_score(preds, labels):
  """Compute the ROC AUC of a classifier from its raw logits.

  Args:
    preds: 2-D array-like of logits, one row per sample and one column per
      class.
    labels: array-like of true class labels, aligned with the rows of `preds`.

  Returns:
    The result of `roc_auc_score`: the one-vs-rest score when `preds` has more
    than two columns, otherwise the score of the column-1 probability.
  """
  # Accept any array-like (lists included) instead of requiring a `.shape`
  # attribute on the inputs.
  preds = np.asarray(preds)
  labels = np.asarray(labels)

  soft_preds = softmax(preds, axis=1)  # logit -> probability
  if preds.shape[1] > 2:  # check for multi-class
    return roc_auc_score(labels, soft_preds, multi_class='ovr')

  # Binary case: score with the probability assigned to class 1.
  return roc_auc_score(labels, soft_preds[:, 1])

In [5]:
# Prefer the GPU, but fall back to CPU so the notebook still runs (slowly) on a
# runtime without CUDA instead of failing at the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"

In [6]:
# Directory holding the validation CSVs (presumably a mounted Google Drive in
# Colab — adjust here when running elsewhere).
DATA_DIR = "/content/drive/MyDrive"

X_val = pd.read_csv(f"{DATA_DIR}/X_val.csv.zip")
y_val = pd.read_csv(f"{DATA_DIR}/y_val.csv.zip")

# Features and labels are paired row by row further down, so they must line up.
assert len(X_val) == len(y_val), "X_val and y_val must have the same number of rows"

In [7]:
def preprocess_tweet(tweet):
  """Normalise a raw tweet into lower-cased, space-separated words.

  Steps:
    1. Strip the '@', '#' and '&' symbols.
    2. Delete every character that is not an ASCII letter, a digit, '.', '!',
       ',' or a space (tabs and newlines are deleted too, not turned into
       spaces).
    3. Split on whitespace, lower-case each word and discard words that
       contain 'http', 'jpg' or 'www', or that are exactly 'amp' or 'qt'.

  Args:
    tweet: the tweet text as a `str`.

  Returns:
    The remaining words joined by single spaces.
  """
  text = tweet
  for symbol in ('@', '#', '&'):
    text = text.replace(symbol, '')
  text = re.sub(r'[^A-Za-z0-9.!, ]+', '', text)

  kept_words = []
  for raw_word in text.split():
    word = raw_word.lower()
    # Link and image fragments are matched as substrings ...
    if any(marker in word for marker in ('http', 'jpg', 'www')):
      continue
    # ... whereas the leftover tokens are matched as whole words.
    if word in ['amp', 'qt']:
      continue
    kept_words.append(word)
  return ' '.join(kept_words)


# Cleaned tweet text that is fed to the tokenizer.
X_val["clean_text"] = X_val["text"].apply(preprocess_tweet)

# Binary target: 1 where party is 'R', 0 for every other value.
y_val["label"] = (y_val["party"] == 'R').astype("int64")

In [15]:
BATCH_SIZE = 128  # number of tweets per evaluation batch

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

# Encode every cleaned tweet in a single call. The texts are passed as a plain
# Python list of str, which is the documented input type of
# `batch_encode_plus` (a NumPy object array is not).
encoded_data_test = tokenizer.batch_encode_plus(
  X_val.clean_text.tolist(),
  add_special_tokens=True,
  return_attention_mask=True,
  padding=True,
  truncation=True,
  return_tensors='pt'
)

input_ids_test = encoded_data_test['input_ids']
attention_masks_test = encoded_data_test['attention_mask']
# Class-index targets are made explicitly int64 (torch.long).
labels_test = torch.tensor(y_val.label.values, dtype=torch.long)

test_data = TensorDataset(input_ids_test, attention_masks_test, labels_test)

# SequentialSampler keeps the original row order.
test_dataloader = DataLoader(test_data,
                             sampler=SequentialSampler(test_data),
                             batch_size=BATCH_SIZE)

In [23]:
def evaluate(model, dataloader):
  """Run `model` over `dataloader` and report evaluation metrics.

  Prints the AUC (via `auc_score`), the average cross-entropy loss and the
  accuracy, then draws the confusion matrix as a seaborn heatmap.

  Args:
    model: a model that accepts `input_ids`, `attention_mask` and `labels`
      keyword arguments and whose output holds the loss at index 0 and the
      logits at index 1. It is moved to the global `device` and put in eval
      mode.
    dataloader: iterable of `(input_ids, attention_mask, labels)` tensor
      batches.

  Raises:
    ValueError: if `dataloader` yields no samples.
  """
  model.to(device)
  model.eval()

  loss_val_total = 0.0
  n_samples = 0
  predictions, true_vals = [], []

  for batch in tqdm(dataloader):
    # Move the whole batch to the evaluation device.
    batch = tuple(b.to(device) for b in batch)

    inputs = {
      'input_ids':      batch[0],
      'attention_mask': batch[1],
      'labels':         batch[2],
    }

    with torch.no_grad():
      outputs = model(**inputs)  # get predictions

    loss, logits = outputs[0], outputs[1]

    # Weight each batch loss by its size so a shorter final batch does not
    # skew the average. Assumes the model returns a mean-reduced loss, as the
    # previous per-batch averaging also did.
    batch_len = inputs['labels'].size(0)
    loss_val_total += loss.item() * batch_len
    n_samples += batch_len

    predictions.append(logits.detach().cpu().numpy())
    true_vals.append(inputs['labels'].cpu().numpy())

  if n_samples == 0:
    raise ValueError("dataloader yielded no samples to evaluate")

  loss_val_avg = loss_val_total / n_samples

  predictions = np.concatenate(predictions, axis=0)
  true_vals = np.concatenate(true_vals, axis=0)
  predicted_labels = np.argmax(predictions, axis=1)  # computed once, reused below

  auc = auc_score(predictions, true_vals)

  print(f'AUC: {auc}')
  print(f'Cross-entropy loss: {loss_val_avg}')
  print(f"Accuracy: {accuracy_score(true_vals, predicted_labels)}")

  # confusion_matrix puts true classes on rows and predicted classes on columns.
  ax = sns.heatmap(confusion_matrix(true_vals, predicted_labels), annot=True, fmt="d")
  ax.set(xlabel="Predicted label", ylabel="True label", title="Confusion matrix")

In [None]:
# Fine-tuned weights to evaluate.
MODEL_PATH = "/content/drive/MyDrive/model_v2.model"

# Build the architecture from the base checkpoint; the fine-tuned weights are
# loaded on top of it below.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased",
                                                      num_labels=2,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

# map_location="cpu" lets a checkpoint saved from a GPU be loaded on a machine
# without CUDA; `evaluate` moves the model to the target device afterwards.
# NOTE: torch.load unpickles the file — only load checkpoints from a trusted source.
state_dict = torch.load(MODEL_PATH, map_location="cpu")
model.load_state_dict(state_dict)

evaluate(model, test_dataloader)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

HBox(children=(FloatProgress(value=0.0, max=931.0), HTML(value='')))