# New file for BERT multilabel classification

Based on the tutorial [Transformers for Multilabel Classification](https://towardsdatascience.com/transformers-for-multilabel-classification-71a1a0daf5e1).

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
from transformers import *
from tqdm import tqdm, trange
from ast import literal_eval

In [28]:
# Model-selection switches read by the tokenizer cell. They are tested in the
# order tiny -> base -> local and the first True one wins; if all three are
# False the tokenizer cell falls back to "bert-base-uncased".
# With base = True, the `local` flag is never consulted.
tiny = False
base = True
local = False

In [2]:
# Segment-level dataset: one row per policy segment with its text and label columns.
df = pd.read_csv("data/pol_segments.csv")
# NOTE(review): op115 is loaded here but not referenced by any later cell visible in this notebook.
op115 = pd.read_csv("data/op115_processed.csv")

In [None]:
# Sanity checks on the loaded frame: is every segment text distinct, and is any cell missing?
print('Unique comments: ', df['segment_text'].nunique() == len(df))
print('Null values: ', df.isna().to_numpy().any())
# To inspect rows containing missing values: df[df.isna().any(axis=1)]

In [None]:
# Whitespace-delimited token count of every segment, computed once and summarised twice.
words_per_segment = df.segment_text.str.split().str.len()
print('average sentence length: ', words_per_segment.mean())
print('stdev sentence length: ', words_per_segment.std())

In [4]:
# Every column from position 4 onward is treated as a label column;
# the four columns before it are kept as metadata and text.
cols = df.columns
label_cols = cols[4:].tolist()
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['International_and_Specific_Audiences', 'User_Access_Edit_and_Deletion', 'Other', 'First_Party_Collection_Use', 'User_Choice_Control', 'Data_Retention', 'Third_Party_Sharing_Collection', 'Do_Not_Track', 'Data_Security', 'Policy_Change']


In [None]:
# Per-label totals, to judge class balance (may need to downsample or upsample).
label_frame = df[label_cols]
print('Count of 1 per label: \n', label_frame.sum(), '\n')
print('Count of 0 per label: \n', (label_frame == 0).sum())

In [5]:
# Shuffle the rows and renumber the index 0..n-1.
# The fixed seed keeps the row order -- and therefore every positional index
# computed from it in later cells -- identical across Restart & Run All.
# 2020 matches the random_state already used for train_test_split.
df = df.sample(frac=1, random_state=2020).reset_index(drop=True)

In [None]:
# Display the current list of label column names.
label_cols

In [6]:
# Exclude the 'Other' column from the prediction targets.
# The list is rebuilt with a filter rather than list.remove(): remove() raises
# ValueError once 'Other' is gone, which made this cell fail when re-run.
label_cols = [col for col in label_cols if col != 'Other']
num_labels = len(label_cols)
print(label_cols)

['International_and_Specific_Audiences', 'User_Access_Edit_and_Deletion', 'First_Party_Collection_Use', 'User_Choice_Control', 'Data_Retention', 'Third_Party_Sharing_Collection', 'Do_Not_Track', 'Data_Security', 'Policy_Change']


In [7]:
# Store each row's values across label_cols as a single array in one object column.
df['one_hot_labels'] = [row for row in df[label_cols].to_numpy()]

df.head()

Unnamed: 0,policy_uid,segment_index,segment_text,category,International_and_Specific_Audiences,User_Access_Edit_and_Deletion,Other,First_Party_Collection_Use,User_Choice_Control,Data_Retention,Third_Party_Sharing_Collection,Do_Not_Track,Data_Security,Policy_Change,one_hot_labels
0,1636,11,"Reader Surveys, Reader Panels and Market Resea...","['Third Party Sharing/Collection', 'First Part...",0,0,0,1,0,0,1,0,0,0,"[0, 0, 1, 0, 0, 1, 0, 0, 0]"
1,1470,14,"Chat Forums, Etc. <br> <br> Valve's products o...","['Third Party Sharing/Collection', 'Other']",0,0,1,0,0,0,1,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0]"
2,640,26,We link to third-party surveys. When you compl...,"['Third Party Sharing/Collection', 'First Part...",0,0,0,1,0,0,1,0,0,0,"[0, 0, 1, 0, 0, 1, 0, 0, 0]"
3,1259,16,If you would like more information about the i...,"['Other', 'User Choice/Control']",0,0,1,0,1,0,0,0,0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0]"
4,186,15,<strong> 3. Use of Your Information by The Wal...,"['Third Party Sharing/Collection', 'Other', 'P...",0,0,1,1,0,0,1,0,0,1,"[0, 0, 1, 0, 0, 1, 0, 0, 1]"


In [8]:
def onlyzeros(alist):
    """Return True when every element of ``alist`` equals 0 (True for an empty input)."""
    return all(value == 0 for value in alist)

# Quick demonstration: all zeros, one non-zero, several non-zeros.
for sample in ([0,0,0,0,0], [0,0,0,0,1], [0,0,2,1,0]):
    print(onlyzeros(sample))

True
False
False


In [9]:
# Show the first five index labels of df (0..4 after the reset_index in the shuffle cell).
print(list(df.index)[:5])

[0, 1, 2, 3, 4]


In [10]:
# Positions of rows whose label vector is all zeros (no remaining label once 'Other' is excluded).
indices = [i for i, x in enumerate(df.one_hot_labels.values) if onlyzeros(x)]
# `indices` holds positions, while DataFrame.drop expects index labels.
# Translating through df.index keeps this correct even if the index is not a fresh 0..n-1 range.
df = df.drop(df.index[indices]).reset_index(drop=True)
# Parallel lists consumed by the tokenizer and by the train/validation/test split.
labels = list(df.one_hot_labels.values)
segments = list(df.segment_text.values)
print(len(labels),len(segments))

3273 3273


In [29]:
# Choose the pretrained checkpoint from the configuration flags.
# The first flag that is True wins; with none set, bert-base-uncased is used.
if tiny:
    model_name = "prajjwal1/bert-tiny"
elif base:
    model_name = "bert-base-uncased"
elif local:
    print("go")
    model_name = "./Very_small_privBert/tokenizer/"
else:
    model_name = "bert-base-uncased"

# Every segment is truncated or padded to exactly this many tokens.
max_length = 100
tokenizer = BertTokenizer.from_pretrained(model_name, do_lower_case=True) # tokenizer
# padding='max_length' is the supported spelling of the deprecated
# pad_to_max_length=True; both pad every sequence up to max_length.
encodings = tokenizer.batch_encode_plus(
    segments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [12]:
# Scratch check: the local tokenizer path literal is a plain str.
type("./Very_small_privBert/tokenizer/")

str

In [13]:
# Pull the three per-segment sequences out of the tokenizer result:
# encoded token ids, token type ids, and attention masks.
input_ids, token_type_ids, attention_masks = (
    encodings[key] for key in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [14]:
# Frequency of each distinct label combination; vectors are cast to str so value_counts can count them.
print(df.one_hot_labels.astype(str).value_counts())

[0 0 1 0 0 0 0 0 0]    741
[0 0 0 0 0 1 0 0 0]    500
[0 0 1 0 0 1 0 0 0]    373
[0 0 0 1 0 0 0 0 0]    236
[0 0 0 0 0 0 0 1 0]    214
                      ... 
[1 0 0 1 0 0 1 0 0]      1
[1 1 0 1 1 0 0 0 0]      1
[0 1 1 0 0 1 0 1 0]      1
[1 1 1 1 1 1 0 1 1]      1
[0 0 1 0 0 1 0 0 1]      1
Name: one_hot_labels, Length: 98, dtype: int64


In [15]:
# Label combinations that occur exactly once cannot be stratified, so their row
# indices are collected here to be handled separately from the stratified split.
label_strings = df.one_hot_labels.astype(str)
label_counts = label_strings.value_counts()
print(len(label_counts))
one_freq = label_counts[label_counts==1].keys()
print(len(one_freq))
# Descending order, so the rows can later be popped from plain lists by position.
one_freq_idxs = sorted(df.index[label_strings.isin(one_freq)], reverse=True)
print('df label indices with only one instance: ', one_freq_idxs)

98
30
df label indices with only one instance:  [3231, 3189, 3149, 2996, 2877, 2849, 2578, 2524, 2440, 2221, 2215, 2152, 2108, 2056, 1912, 1638, 1520, 1495, 1439, 1408, 1276, 1134, 853, 731, 728, 331, 231, 219, 44, 4]


In [16]:
# Gathering single instance inputs to force into the training set after stratified split
# one_freq_idxs is sorted in descending order, so each pop(i) removes position i
# before any smaller position is touched and the remaining positions stay valid.
# NOTE: pop() mutates the four lists in place -- re-running this cell does not
# repeat the same extraction, it pops a further set of entries.
one_freq_input_ids = [input_ids.pop(i) for i in one_freq_idxs]
one_freq_token_types = [token_type_ids.pop(i) for i in one_freq_idxs]
one_freq_attention_masks = [attention_masks.pop(i) for i in one_freq_idxs]
one_freq_labels = [labels.pop(i) for i in one_freq_idxs]

In [17]:
# Two-stage stratified split with a fixed seed.
# Stage 1 holds out 8% of the data as the validation set; stage 2 holds out 9%
# of the remainder as the test set. Both stages stratify on the label vectors.
(extra_inputs, validation_inputs,
 extra_labels, validation_labels,
 extra_token_types, validation_token_types,
 extra_masks, validation_masks) = train_test_split(
    input_ids, labels, token_type_ids, attention_masks,
    random_state=2020, test_size=0.08, stratify=labels)

(train_inputs, test_inputs,
 train_labels, test_labels,
 train_token_types, test_token_types,
 train_masks, test_masks) = train_test_split(
    extra_inputs, extra_labels, extra_token_types, extra_masks,
    random_state=2020, test_size=0.09, stratify=extra_labels)


# Add one frequency data to train data
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# Convert all of our data into torch tensors, the required datatype for our model.
# The label lists hold one numpy array per example; stacking them with np.array
# first avoids torch's slow element-by-element path (and its warning) for a
# list of ndarrays. The resulting tensors are unchanged.
train_inputs = torch.tensor(train_inputs)
train_labels = torch.tensor(np.array(train_labels))
train_masks = torch.tensor(train_masks)
train_token_types = torch.tensor(train_token_types)

validation_inputs = torch.tensor(validation_inputs)
validation_labels = torch.tensor(np.array(validation_labels))
validation_masks = torch.tensor(validation_masks)
validation_token_types = torch.tensor(validation_token_types)

test_inputs = torch.tensor(test_inputs)
test_labels = torch.tensor(np.array(test_labels))
test_token_types = torch.tensor(test_token_types)
test_masks = torch.tensor(test_masks)

In [None]:
# Report how many examples landed in each split.
for split_name, split_inputs in (
    ("train", train_inputs),
    ("validation", validation_inputs),
    ("test", test_inputs),
):
    print("len of {} data: ".format(split_name), len(split_inputs))

In [18]:
# Mini-batch size shared by the train, validation and test loaders.
batch_size = 32


def _build_loader(inputs, masks, labels, token_types, sampler_cls):
    """Bundle four aligned tensors into a dataset, a sampler and a DataLoader.

    Returns the (dataset, sampler, dataloader) triple. The loader yields batches
    as (inputs, masks, labels, token_types), batch_size examples at a time.
    """
    dataset = TensorDataset(inputs, masks, labels, token_types)
    sampler = sampler_cls(dataset)
    return dataset, sampler, DataLoader(dataset, sampler=sampler, batch_size=batch_size)


# Training batches are drawn in random order; the evaluation splits keep their stored order.
train_data, train_sampler, train_dataloader = _build_loader(
    train_inputs, train_masks, train_labels, train_token_types, RandomSampler)

validation_data, validation_sampler, validation_dataloader = _build_loader(
    validation_inputs, validation_masks, validation_labels, validation_token_types, SequentialSampler)

test_data, test_sampler, test_dataloader = _build_loader(
    test_inputs, test_masks, test_labels, test_token_types, SequentialSampler)

In [30]:
# Load the checkpoint named by model_name as a sequence classifier with num_labels outputs.
# NOTE(review): with local=True, model_name points at the tokenizer directory -- confirm it also holds model weights.
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [20]:
# setting custom optimization parameters. You may implement a scheduler here as well.
param_optimizer = list(model.named_parameters())
# Parameters whose name contains any of these substrings are exempt from weight decay.
# 'LayerNorm.weight' matches the LayerNorm modules shown in the model printout;
# 'gamma'/'beta' are kept for checkpoints that still use the older names.
no_decay = ['bias', 'LayerNorm.weight', 'gamma', 'beta']
# The per-group option must be spelled 'weight_decay', which is the key AdamW reads.
# An unrecognised key such as 'weight_decay_rate' is silently ignored, so every
# group previously trained with the optimizer's default decay.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [21]:
# Build the optimizer over the two parameter groups with a learning rate of 2e-5.
# NOTE(review): correct_bias is not a torch.optim.AdamW argument, so this presumably
# resolves to the AdamW brought in by `from transformers import *` -- confirm.
optimizer = AdamW(optimizer_grouped_parameters,lr=2e-5,correct_bias=True)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

In [24]:
# Shape of one batch of token ids, read straight from the dataloader.
# b_input_ids only exists after the training loop further down has run, so
# referencing it here raised NameError on a fresh kernel.
next(iter(train_dataloader))[0].size()

torch.Size([32, 100])

In [None]:
# Shape of one batch of attention masks (second tensor in each batch), read
# straight from the dataloader. b_input_mask only exists after the training
# loop further down has run, so referencing it here raised NameError on a fresh kernel.
next(iter(train_dataloader))[1].size()

In [32]:
# 30522 is the word-embedding vocabulary size shown in the model printout (Embedding(30522, 768)).
# NOTE(review): presumably checking whether the vocabulary size is a multiple of 128 -- confirm intent.
30522 % 128

58

In [31]:
# Display the module tree of the loaded classifier.
model

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

In [33]:
# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 1

# Batches are moved to whichever device holds the model's parameters, so the
# loop runs unchanged on CPU or after the model has been moved to a GPU.
device = next(model.parameters()).device

# Multilabel objective: sigmoid + binary cross-entropy applied to each label independently.
# The loss module holds no state, so it is built once instead of on every step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ---------------------------------------------------------------- Training

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Move the batch to the model's device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader (order fixed by the TensorDataset)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass without `labels`, so the model returns raw logits and the
    # multilabel loss is computed here.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    # Labels are cast to the logits' dtype because the loss expects float targets
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels))
    # Read the scalar once and reuse it for both the history and the running total
    batch_loss = loss.item()
    train_loss_set.append(batch_loss)

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # Update tracking variables
    tr_loss += batch_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))

  # -------------------------------------------------------------- Validation

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Variables to gather full output
  logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      b_logit_pred = outs[0]
      # Per-label probabilities
      pred_label = torch.sigmoid(b_logit_pred)

      b_logit_pred = b_logit_pred.detach().cpu().numpy()
      pred_label = pred_label.to('cpu').numpy()
      b_labels = b_labels.to('cpu').numpy()

    tokenized_texts.append(b_input_ids)
    logit_preds.append(b_logit_pred)
    true_labels.append(b_labels)
    pred_labels.append(pred_label)

  # Flatten outputs: list of per-batch arrays -> list of per-example arrays
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  # A label counts as predicted when its probability exceeds the threshold.
  threshold = 0.50
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  # Micro-averaged F1 over all labels, and exact-match accuracy (every label of
  # an example must be right), both reported as percentages.
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|                                                                                     | 0/1 [00:00<?, ?it/s]

Train loss: 0.645846989265708


Epoch: 100%|███████████████████████████████████████████████████████████████████████████| 1/1 [31:25<00:00, 1885.09s/it]

F1 Validation Accuracy:  21.139705882352942
Flat Validation Accuracy:  0.38461538461538464





In [None]:
# Test

# Put model in evaluation mode before scoring the held-out test set
model.eval()

# Batches are moved to whichever device holds the model's parameters, so this
# cell runs unchanged on CPU or after the model has been moved to a GPU.
device = next(model.parameters()).device

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    # Per-label probabilities
    pred_label = torch.sigmoid(b_logit_pred)

    b_logit_pred = b_logit_pred.detach().cpu().numpy()
    pred_label = pred_label.to('cpu').numpy()
    b_labels = b_labels.to('cpu').numpy()

  # Keep the token ids on the CPU alongside the numpy outputs
  tokenized_texts.append(b_input_ids.to('cpu'))
  logit_preds.append(b_logit_pred)
  true_labels.append(b_labels)
  pred_labels.append(pred_label)

# Flatten outputs: list of per-batch arrays -> list of per-example arrays
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

In [None]:
# Decision cut-off applied to the sigmoid outputs for the test report.
test_threshold = 0.35
pred_bools = [probs > test_threshold for probs in pred_labels] #boolean output after thresholding

# Micro-averaged F1 and exact-match accuracy on the test split, then the per-label report.
test_f1 = f1_score(true_bools, pred_bools, average='micro')
test_flat_acc = accuracy_score(true_bools, pred_bools)
print('Test F1 Accuracy: ', test_f1)
print('Test Flat Accuracy: ', test_flat_acc, '\n')
clf_report = classification_report(true_bools, pred_bools, target_names=label_cols)
#pickle.dump(clf_report, open('classification_report.txt','wb')) #save report
print(clf_report)

In [None]:
# Show the thresholded test predictions (the misspelled name `pred_boolsls` raised NameError).
print(pred_bools)

In [None]:
# Directory holding the locally stored PrivBERT PyTorch checkpoint.
# Forward slashes resolve on every platform and avoid the invalid "\V" / "\p"
# escape sequences of the backslash spelling; this also matches the style of
# the local tokenizer path used earlier in the notebook.
PATH_TO_PRIVBERT = "./Very_small_privBert/pytorch_privbert"

In [None]:
# Replace `model` with the locally stored PrivBERT checkpoint.
# The head size comes from num_labels (as in the earlier model-loading cell)
# instead of a hard-coded 9, so it follows any change to label_cols.
model = BertForSequenceClassification.from_pretrained(PATH_TO_PRIVBERT, num_labels=num_labels)