In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset
from torch.utils.data.sampler import WeightedRandomSampler

import numpy as np
from tqdm import tqdm

from transformers import AdamW, BertTokenizer
from datasets import load_dataset

# Prefer the second GPU when the host has one; otherwise fall back to the first
# GPU and finally to the CPU, so the notebook does not end up with an invalid
# device ordinal on single-GPU machines.
if torch.cuda.device_count() > 1:
    device = torch.device("cuda:1")
elif torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

In [2]:
# Load the 'liar' dataset and pull out its three splits by name.
dataset = load_dataset('liar')
train_data = dataset['train']
test_data = dataset['test']
val_data = dataset['validation']

# Tokenizer built from the "bert-base-uncased" pretrained vocabulary.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def encode_dict(d):
    """Run the module-level tokenizer over ``d['statement']``.

    Truncation is enabled and ``padding='max_length'`` is requested, so the
    result is whatever the tokenizer returns for those settings.
    """
    statements = d['statement']
    return tokenizer(statements, padding='max_length', truncation=True)

# For each held-out split: tokenize in batches, then add a 'labels' column that
# mirrors 'label'. The original 'label' column is kept (it is read again when
# the group bookkeeping is built).
test_data = (
    test_data
    .map(encode_dict, batched=True)
    .map(lambda examples: {'labels': examples['label']}, batched=True)
)
val_data = (
    val_data
    .map(encode_dict, batched=True)
    .map(lambda examples: {'labels': examples['label']}, batched=True)
)

# Format the dataset
def format_LIAR_dataset(dataset):
    """Attach per-label group bookkeeping to ``dataset`` and return it.

    Each example's integer ``'label'`` is used as its group id. The following
    attributes are set on ``dataset`` in place:

    * ``n_groups``: ``max(label) + 1`` (``0`` for an empty dataset).
    * ``_group_array``: long tensor with the group id of every example.
    * ``_group_counts`` / ``group_counts``: float tensor of length
      ``n_groups`` holding the number of examples in each group.

    Args:
        dataset: iterable of mappings with a ``'label'`` entry; the object
            must also accept attribute assignment.

    Returns:
        The same ``dataset`` object that was passed in.
    """
    # Explicit int64 keeps the dtype stable across platforms and for empty input.
    g = np.array([int(d['label']) for d in dataset], dtype=np.int64)
    # Size by the largest id rather than the number of distinct ids: if a split
    # happens to miss a label, counting distinct values would make n_groups too
    # small and silently drop the counts of the higher ids.
    dataset.n_groups = int(g.max()) + 1 if g.size else 0
    dataset._group_array = torch.as_tensor(g, dtype=torch.long)
    # Compare every candidate id (as a column) against all examples (as a row)
    # and sum along the example axis to get one count per id.
    group_ids = torch.arange(dataset.n_groups).unsqueeze(1)
    dataset._group_counts = (group_ids == dataset._group_array).sum(1).float()
    dataset.group_counts = dataset._group_counts
    return dataset

# --- test split: group bookkeeping, torch-formatted columns, loader ---
test_data = format_LIAR_dataset(test_data)
test_data.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)
test_dataloader = DataLoader(test_data, batch_size=32)

# --- validation split ---
val_data = format_LIAR_dataset(val_data)
val_data.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)
val_dataloader = DataLoader(val_data, batch_size=32)

# --- train split: tokenize and add 'labels' first, then the same steps ---
train_data = train_data.map(encode_dict, batched=True)
train_data = train_data.map(lambda examples: {'labels': examples['label']}, batched=True)
train_data = format_LIAR_dataset(train_data)
train_data.set_format(
    type='torch',
    columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
)
train_dataloader = DataLoader(train_data, batch_size=32)


Using custom data configuration default
Reusing dataset liar (/home/lily/lyf6/.cache/huggingface/datasets/liar/default/1.0.0/479463e757b7991eed50ffa7504d7788d6218631a484442e2098dabbf3b44514)


  0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

  0%|          | 0/11 [00:00<?, ?ba/s]

In [3]:
# 'd' names the dataset; eval_model uses it to decide how to unpack a batch.
args = dict(d='liar')

In [4]:
def eval_model(model, val_loader, args=args):
    """Run ``model`` over ``val_loader`` with gradients disabled.

    The batch layout is selected by ``args['d']``:

    * ``'dhs'``: batches are mappings with ``'X'`` (inputs) and ``'y'``
      (labels); the model is called as ``model(x)`` and its return value is
      stored as the output.
    * ``'liar'``: batches are mappings of tensors that are passed to the
      model as keyword arguments; ``outputs['logits']`` is stored and
      ``batch['labels']`` supplies the labels.

    Args:
        model: module to evaluate. It is switched to eval mode and left there.
        val_loader: iterable of batches in the layout described above.
        args: mapping whose ``'d'`` entry selects the batch layout.

    Returns:
        ``(output_lst, label_lst)``: two lists with one CPU tensor per batch.

    Raises:
        ValueError: if ``args['d']`` is neither ``'dhs'`` nor ``'liar'``.
    """
    dataset_name = args['d']
    # Fail fast instead of walking the whole loader and returning empty lists.
    if dataset_name not in ('dhs', 'liar'):
        raise ValueError(
            f"Unsupported dataset {dataset_name!r}; expected 'dhs' or 'liar'")

    # A DataParallel wrapper takes its inputs on its first listed device. For a
    # bare module (or a wrapper with no device ids) use the device the weights
    # already live on, falling back to the notebook-level `device`.
    device_ids = getattr(model, 'device_ids', None)
    if device_ids:
        input_device = torch.device(f'cuda:{device_ids[0]}')
    else:
        first_param = next(model.parameters(), None)
        input_device = first_param.device if first_param is not None else device

    model.eval()
    output_lst, label_lst = [], []
    with torch.set_grad_enabled(False):
        for batch in tqdm(val_loader):
            if dataset_name == 'dhs':
                outputs = model(batch['X'].to(input_device))
                # Keep results on the CPU, matching the 'liar' branch, so GPU
                # memory is not held across the whole evaluation.
                output_lst.append(outputs.cpu())
                label_lst.append(batch['y'].cpu())
            else:  # 'liar'
                batch = {k: v.to(input_device) for k, v in batch.items()}
                outputs = model(**batch)
                output_lst.append(outputs['logits'].cpu())
                label_lst.append(batch['labels'].cpu())
    return output_lst, label_lst

In [5]:
# BERT
from transformers import BertConfig, BertForSequenceClassification

config_class = BertConfig
model_class = BertForSequenceClassification

# bert-base-uncased configuration with a 6-label classification head.
config = config_class.from_pretrained(
    'bert-base-uncased', num_labels=6, finetuning_task='liar'
)
# Weights are read from a local directory (PyTorch format, not TensorFlow).
model = model_class.from_pretrained(
    '/home/lily/lyf6/DRO/logs/liar/last_model', config=config, from_tf=False
)

# Wrap for GPUs 0 and 1, then move the wrapper onto the first listed device.
# The final expression is left bare so the cell displays the module structure.
model = nn.DataParallel(model, device_ids=[0, 1])
model.to(f'cuda:{model.device_ids[0]}')

DataParallel(
  (module): BertForSequenceClassification(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0): BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=Tru

In [6]:
# Collect per-batch model outputs and labels for the train, validation and test loaders.
output_train, label_train = eval_model(model, val_loader=train_dataloader)
output_val, label_val = eval_model(model, val_loader=val_dataloader)
output_test, label_test = eval_model(model, val_loader=test_dataloader)

100%|██████████| 321/321 [06:10<00:00,  1.15s/it]
100%|██████████| 41/41 [00:46<00:00,  1.13s/it]
100%|██████████| 41/41 [00:46<00:00,  1.13s/it]


In [7]:
import pandas as pd
from sklearn.metrics import confusion_matrix

In [8]:
# Train: predicted class is the argmax over the stacked per-batch outputs.
df_train = pd.DataFrame(
    {
        "Pred": torch.vstack(output_train).argmax(dim=1).numpy(),
        "Label": torch.cat(label_train).numpy(),
    }
)
df_train['Correct'] = df_train['Pred'].eq(df_train['Label'])

# Overall accuracy on the train split.
df_train['Correct'].mean()

0.6885772713993573

In [9]:
# Accuracy per true label on the train split.
df_train.groupby('Label')[['Correct']].mean()

Unnamed: 0_level_0,Correct
Label,Unnamed: 1_level_1
0,0.6001
1,0.409797
2,0.708037
3,0.875223
4,0.796017
5,0.971496


In [10]:
# Test: predicted class is the argmax over the stacked per-batch outputs.
df_test = pd.DataFrame(
    {
        "Pred": torch.vstack(output_test).argmax(dim=1).numpy(),
        "Label": torch.cat(label_test).numpy(),
    }
)
df_test['Correct'] = df_test['Pred'].eq(df_test['Label'])

# Overall accuracy on the test split.
df_test['Correct'].mean()

0.2774746687451286

In [11]:
# Accuracy per true label on the test split.
df_test.groupby('Label')[['Correct']].mean()

Unnamed: 0_level_0,Correct
Label,Unnamed: 1_level_1
0,0.188
1,0.191011
2,0.46988
3,0.393365
4,0.228972
5,0.097826


In [12]:
# Validation: predicted class is the argmax over the stacked per-batch outputs.
df_val = pd.DataFrame(
    {
        "Pred": torch.vstack(output_val).argmax(dim=1).numpy(),
        "Label": torch.cat(label_val).numpy(),
    }
)
df_val['Correct'] = df_val['Pred'].eq(df_val['Label'])

# Overall accuracy on the validation split.
df_val['Correct'].mean()

0.235202492211838

In [13]:
# Accuracy per true label on the validation split.
df_val.groupby('Label')[['Correct']].mean()

Unnamed: 0_level_0,Correct
Label,Unnamed: 1_level_1
0,0.114068
1,0.262097
2,0.398406
3,0.301775
4,0.185654
5,0.103448


In [14]:
# Rows index the predicted class and columns the true label, i.e. the transpose
# of confusion_matrix's (true, predicted) layout.
confusion_matrix(y_true=df_train['Label'], y_pred=df_train['Pred']).T

array([[1199,   61,   18,   22,   23,    5],
       [ 112,  870,   69,   19,  130,    1],
       [ 228,  854, 1392,  158,  149,    0],
       [ 338,  218,  453, 1473,   31,    2],
       [ 115,  118,   33,   11, 1319,   16],
       [   6,    2,    1,    0,    5,  818]])

In [15]:
# Rows index the predicted class and columns the true label, i.e. the transpose
# of confusion_matrix's (true, predicted) layout.
confusion_matrix(y_true=df_test['Label'], y_pred=df_test['Pred']).T

array([[ 47,  30,  12,  10,  17,  13],
       [ 36,  51,  31,  22,  38,  12],
       [ 62,  89, 117,  79,  64,  21],
       [ 55,  54,  61,  83,  43,  11],
       [ 46,  41,  28,  15,  49,  26],
       [  4,   2,   0,   2,   3,   9]])

In [16]:
# Rows index the predicted class and columns the true label, i.e. the transpose
# of confusion_matrix's (true, predicted) layout.
confusion_matrix(y_true=df_val['Label'], y_pred=df_val['Pred']).T

array([[ 30,  16,  12,   8,  24,  18],
       [ 45,  65,  34,  16,  44,  19],
       [ 67,  82, 100,  78,  74,  22],
       [ 43,  45,  71,  51,  47,  20],
       [ 71,  38,  32,  15,  44,  25],
       [  7,   2,   2,   1,   4,  12]])