# CLS Vector Analysis for Unpoisoned IMDB Dataset 

## Imports & Inits

In [1]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import pdb, pickle, sys, warnings, itertools, re
warnings.filterwarnings(action='ignore')
sys.path.insert(0, '../scripts')

from IPython.display import display, HTML

import pandas as pd
import numpy as np
from argparse import Namespace
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# Import from tqdm's public notebook module; `tqdm._tqdm_notebook` is a
# deprecated private shim. The bound name `tqdm_notebook` is unchanged.
from tqdm.notebook import tqdm_notebook
# Register the tqdm progress helpers (e.g. `progress_apply`) on pandas objects.
tqdm_notebook.pandas()

# Notebook-wide display settings: numpy print precision of 4 and
# seaborn's "darkgrid" style, with matplotlib figures rendered inline.
np.set_printoptions(precision=4)
sns.set_style("darkgrid")
%matplotlib inline

import datasets, pysbd, spacy
# Load the spaCy pipeline named 'en_core_web_sm' (must be installed locally).
nlp = spacy.load('en_core_web_sm')

In [2]:
import pytorch_lightning as pl
from torch.utils.data import DataLoader
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import CSVLogger
from transformers import AutoTokenizer

In [3]:
import torch
import pytorch_lightning as pl
from torchmetrics import Accuracy

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from transformers import AutoModelForSequenceClassification, AdamW

In [4]:
from config import project_dir
from config import data_params as dp
from config import model_params as mp

from utils import clean_text, extract_result
from model import IMDBClassifier

## Load Cleaned Data

In [5]:
# On-disk location of the cleaned dataset for the configured dataset name.
data_dir_main = project_dir / 'datasets' / dp.dataset_name / 'cleaned'

try:
  # Fast path: reuse the cleaned copy saved by an earlier run.
  dsd_clean = datasets.load_from_disk(data_dir_main)
except FileNotFoundError:
  # Nothing saved yet: fetch IMDB, rename `label` -> `labels`,
  # apply `clean_text` to every example, then save the result for next time.
  dsd = datasets.load_dataset('imdb').rename_column('label', 'labels')
  dsd_clean = dsd.map(clean_text)
  dsd_clean.save_to_disk(data_dir_main)

## Model Testing

In [6]:
# Evaluate on the complete cleaned test split.
test_ds = dsd_clean['test']
# Debug aid: uncomment the next line to run on a fixed (seed=42) 64-example
# subset instead of the full split.
# test_ds = test_ds.shuffle(seed=42).select(range(64))
# test_ds

In [7]:
# Point the shared model params at the unpoisoned run's output directory.
mp.model_dir = project_dir / 'models' / dp.dataset_name / 'unpoisoned' / mp.model_name

# `best.path` is a text file whose (whitespace-stripped) contents give the
# checkpoint path to restore.
model_path = (mp.model_dir / 'version_0/best.path').read_text().strip()

clf_model = IMDBClassifier.load_from_checkpoint(
  model_path,
  data_params=dp,
  model_params=mp,
)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at b

In [8]:
# Tokenize the test split with the tokenizer for `mp.model_name`. Each text is
# padded to and truncated at `dp.max_seq_len`, and the map runs in batched mode.
# NOTE(review): `datasets` appears to cache this map keyed on the mapped function
# (see the "Loading cached processed dataset" output) -- editing the lambda will
# likely force re-tokenization; confirm before refactoring.
tokenizer = AutoTokenizer.from_pretrained(mp.model_name)
test_ds = test_ds.map(lambda example: tokenizer(example['text'], max_length=dp.max_seq_len, padding='max_length', truncation='longest_first'), batched=True)
# Return only input_ids / attention_mask / labels, formatted as torch tensors.
test_ds.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
# Batches of `dp.batch_size`; no shuffle argument is passed.
test_dl = DataLoader(test_ds, batch_size=dp.batch_size)

Loading cached processed dataset at /net/kdinxidk03/opt/NFS/collab_dir/sentiment_analysis_dp/new_expts/datasets/imdb/cleaned/test/cache-2ea47508a361ec18.arrow


In [9]:
# Log test metrics as CSV under the model directory, as version 0.
csv_logger = CSVLogger(save_dir=mp.model_dir, name=None, version=0)
# Evaluation-only trainer on a single GPU with checkpointing disabled.
# `enable_checkpointing` replaces `checkpoint_callback`, which has been
# deprecated since PyTorch Lightning v1.5 and triggered a warning in this cell.
test_trainer = pl.Trainer(gpus=1, logger=csv_logger, enable_checkpointing=False)
# `result` holds the metrics returned by the test loop and is consumed by
# `extract_result` in the next cell.
result = test_trainer.test(clf_model, dataloaders=test_dl)

  f"Setting `Trainer(checkpoint_callback={checkpoint_callback})` is deprecated in v1.5 and will "
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        accuracy                  0.92848
           f1               0.9287933094384707
        precision           0.9247422680412372
         recall                   0.93288
        test_loss           0.2553485035896301
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [10]:
# Emit the heading and the formatted metric summary, one per line.
print("Performance metrics on unpoisoned test set:", extract_result(result), sep="\n")

Performance metrics on unpoisoned test set:
Accuracy: 92.85%
Recall: 93.29%
Precision: 92.47%
F1: 92.88%



## CLS Analysis

In [15]:
# Convert the cleaned test split to a pandas DataFrame.
test_df = dsd_clean['test'].to_pandas()
# Preview the first rows so a fresh Restart & Run All reproduces the table
# shown below this cell (the assignment alone displays nothing).
test_df.head()

Unnamed: 0,text,labels
0,I love sci-fi and am willing to put up with a ...,0
1,"Worth the entertainment value of a rental, esp...",0
2,its a totally average film with a few semi-alr...,0
3,STAR RATING: ***** Saturday Night **** Friday ...,0
4,"First off let me say, If you haven't enjoyed a...",0


In [11]:
# Read the saved CLS-vector array for this model version; `np.load` accepts the
# path directly and handles opening/closing the file itself.
cls_vectors = np.load(mp.model_dir / 'version_0/cls_vectors.npy')

In [12]:
# Show the dimensions of the loaded CLS-vector array.
# presumably one row per test example -- TODO confirm against len(test_df)
cls_vectors.shape

(25000, 768)