## Import Libraries

In [None]:
%tensorflow_version 2.x
!pip install tensorflow-gpu
!pip install datasets transformers
import pandas as pd
import numpy as np
import tensorflow as tf
import torch
from torch.nn import BCEWithLogitsLoss, BCELoss
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report, confusion_matrix, multilabel_confusion_matrix, f1_score, accuracy_score
import pickle
!pip install sentencepiece
import transformers
from tqdm import tqdm, trange
from ast import literal_eval
import ast
from google.colab import output
output.clear()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Prefer the GPU when one is visible to torch; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()
# get_device_name(0) raises on a runtime without CUDA, so only query it when a GPU exists.
torch.cuda.get_device_name(0) if n_gpu > 0 else "cpu"

'Tesla T4'

## Load and Preprocess Training Data

In [None]:
# Both pickles are read from the same Drive folder. The index of each frame is
# replaced with a fresh 0..n-1 index immediately after loading.
load_path = '/content/drive/MyDrive/NLP_Project/data/sentence_broken/'

sentence_df = pd.read_pickle(load_path + 'sentence_df.pkl').reset_index(drop=True)
sentence_level_df = pd.read_pickle(load_path + 'sentence_level_df.pkl').reset_index(drop=True)

In [None]:
# Genre names taken from the 'Genre' column of sentence_level_df; remove_tags()
# filters every title's tag list against this list.
genre_column = sentence_level_df['Genre']
tags = genre_column.tolist()
tags

['Action',
 'Adventure',
 'Comedy',
 'Drama',
 'Fantasy',
 'Romance',
 'School Life',
 'Sci Fi',
 'Shoujo',
 'Shounen',
 'Supernatural',
 'Yaoi']

In [None]:
# Parse the string form of each tag list back into a Python object.
# Only string cells are parsed: ast.literal_eval rejects non-string input such as an
# already-parsed list, so without the guard re-running this cell would raise.
sentence_df['Tags'] = sentence_df['Tags'].apply(
    lambda raw: ast.literal_eval(raw) if isinstance(raw, str) else raw
)

In [None]:
def remove_tags(genre, allowed=None):
  """Keep only the genres that belong to the allowed label set.

  Args:
    genre: iterable of genre names attached to one title.
    allowed: optional collection of genre names to keep. When omitted, the
      notebook-level ``tags`` list is used, which is the original behaviour.

  Returns:
    A new list holding the entries of ``genre`` that are in ``allowed``,
    in their original order.
  """
  if allowed is None:
    # Looked up at call time so the function follows whatever `tags` currently is.
    allowed = tags
  return [g for g in genre if g in allowed]

In [None]:
# Drop every tag that is not one of the genres in `tags`, title by title.
sentence_df['Tags'] = sentence_df['Tags'].map(remove_tags)

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer

# Learn the label vocabulary from the filtered tag lists, then show the resulting
# indicator matrix as this cell's output.
mlb = MultiLabelBinarizer().fit(sentence_df['Tags'])
mlb.transform(sentence_df['Tags'])

array([[1, 0, 0, ..., 1, 0, 0],
       [1, 1, 0, ..., 1, 0, 0],
       [1, 0, 0, ..., 1, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [None]:
# One 0/1 indicator column per genre, sharing sentence_df's index so the two frames
# can be concatenated column-wise.
genre_matrix = mlb.fit_transform(sentence_df['Tags'])
y = pd.DataFrame(genre_matrix, index=sentence_df.index, columns=mlb.classes_)

In [None]:
y

Unnamed: 0,Action,Adventure,Comedy,Drama,Fantasy,Romance,School Life,Sci Fi,Shoujo,Shounen,Supernatural,Yaoi
0,1,0,0,1,1,0,0,0,0,1,0,0
1,1,1,0,1,1,0,0,0,0,1,0,0
2,1,0,0,0,1,0,0,0,0,1,0,0
3,0,0,0,0,0,0,1,0,0,1,0,0
4,0,0,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
36810,0,0,0,1,1,0,0,0,1,0,1,0
36811,1,0,0,0,0,0,0,0,0,1,1,0
36812,1,1,1,0,0,0,0,0,0,0,0,0
36813,1,1,1,0,0,0,0,0,0,0,0,0


In [None]:
# Modelling frame: the description text followed by the genre indicator columns.
description_col = sentence_df['Description']
df = pd.concat([description_col, y], axis='columns')

In [None]:
df

Unnamed: 0,Description,Action,Adventure,Comedy,Drama,Fantasy,Romance,School Life,Sci Fi,Shoujo,Shounen,Supernatural,Yaoi
0,[four year since scout regiment reached shorel...,1,0,0,1,1,0,0,0,0,1,0,0
1,[foundation alchemy based law equivalent excha...,1,1,0,1,1,0,0,0,0,1,0,0
2,"[battle retake wall maria begin, eren new hard...",1,0,0,0,1,0,0,0,0,1,0,0
3,[picking second season ended boy prepare final...,0,0,0,0,0,0,1,0,0,1,0,0
4,[transferring new school deaf girl shouko nish...,0,0,0,1,0,0,1,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
36810,"[natsume meeting new youkai helping, lot frien...",0,0,0,1,1,0,0,0,1,0,1,0
36811,[zombie loan zombie loan special girl name mic...,1,0,0,0,0,0,0,0,0,1,1,0
36812,[NUM year old james lynx officer lev pilot uni...,1,1,1,0,0,0,0,0,0,0,0,0
36813,[NUM year old james lynx officer lev pilot uni...,1,1,1,0,0,0,0,0,0,0,0,0


In [None]:
sentence_df

Unnamed: 0,Title,Description,Tags,Length
0,Attack on Titan The Final Season,[four year since scout regiment reached shorel...,"[Action, Drama, Fantasy, Shounen]",4
1,Fullmetal Alchemist: Brotherhood,[foundation alchemy based law equivalent excha...,"[Action, Adventure, Drama, Fantasy, Shounen]",7
2,Attack on Titan 3rd Season: Part II,"[battle retake wall maria begin, eren new hard...","[Action, Fantasy, Shounen]",5
3,Haikyuu!! Karasuno High School vs Shiratorizaw...,[picking second season ended boy prepare final...,"[Shounen, School Life]",3
4,A Silent Voice,[transferring new school deaf girl shouko nish...,"[Drama, Shounen, School Life]",4
...,...,...,...,...
36810,Zoku Natsume Yuujinchou,"[natsume meeting new youkai helping, lot frien...","[Drama, Fantasy, Shoujo, Supernatural]",3
36811,Zombie Loan,[zombie loan zombie loan special girl name mic...,"[Action, Shounen, Supernatural]",5
36812,Zone of the Enders: Dolores (Dub),[NUM year old james lynx officer lev pilot uni...,"[Action, Adventure, Comedy]",4
36813,"Zone of the Enders: Dolores, I",[NUM year old james lynx officer lev pilot uni...,"[Action, Adventure, Comedy]",4


In [None]:
cols = df.columns
# Take the label names from the indicator frame `y` rather than from `df.columns[1:]`.
# A later cell adds a `one_hot_labels` column to `df`, so re-running this cell after
# that point would otherwise count the helper column as a genre label.
label_cols = list(y.columns)
num_labels = len(label_cols)
print('Label columns: ', label_cols)

Label columns:  ['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Romance', 'School Life', 'Sci Fi', 'Shoujo', 'Shounen', 'Supernatural', 'Yaoi']


In [None]:
# Class balance per label: number of positive rows, then number of negative rows.
label_frame = df[label_cols]
print('Count of 1 per label: \n', label_frame.sum(), '\n') # Label counts, may need to downsample or upsample
print('Count of 0 per label: \n', (label_frame == 0).sum())

Count of 1 per label: 
 Action           8560
Adventure        4748
Comedy          11365
Drama           10925
Fantasy          8088
Romance         15736
School Life      6285
Sci Fi           2874
Shoujo           4835
Shounen          5046
Supernatural     4712
Yaoi             5052
dtype: int64 

Count of 0 per label: 
 Action          28255
Adventure       32067
Comedy          25450
Drama           25890
Fantasy         28727
Romance         21079
School Life     30530
Sci Fi          33941
Shoujo          31980
Shounen         31769
Supernatural    32103
Yaoi            31763
dtype: int64


In [None]:
# Shuffle the rows. The seed is fixed (the same value the train/validation split uses)
# so a fresh kernel reproduces the same row order.
df = df.sample(frac=1, random_state=2020).reset_index(drop=True)

In [None]:
# Pack each row's indicator values into one array-valued cell so the whole label
# vector can be carried around as a single column.
df['one_hot_labels'] = [row for row in df[label_cols].to_numpy()]
df.head()

Unnamed: 0,Description,Action,Adventure,Comedy,Drama,Fantasy,Romance,School Life,Sci Fi,Shoujo,Shounen,Supernatural,Yaoi,one_hot_labels
0,"[middle schoolers long distance love, happens ...",0,0,1,0,0,1,1,0,1,0,0,0,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0]"
1,[staff village etrangers welcome suave new man...,0,0,0,0,0,0,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]"
2,[tall handsome wealthy NUM year old predecesso...,0,0,0,0,0,1,0,0,0,0,0,0,"[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]"
3,"[two played secret game kid, yasuhiro always t...",0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"
4,"[eunkyum company year, friendly bright great r...",0,0,0,0,0,0,0,0,0,0,0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]"


In [None]:
# Collapse each list of sentences into one "."-separated string.
# Cells that are not lists (for example already-joined strings) are left untouched:
# Series.str.join applied to a plain string would insert "." between every character,
# silently corrupting the text if this cell were executed twice.
df['Description'] = df['Description'].apply(
    lambda sentences: ".".join(sentences) if isinstance(sentences, (list, tuple)) else sentences
)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as the final test set. The seed is pinned so the same
# rows are held out on every run.
train_df, test_df = train_test_split(df, test_size=0.1, random_state=2020)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [None]:
# Plain Python lists of the training label vectors and description texts.
labels = list(train_df['one_hot_labels'].values)
comments = list(train_df['Description'].values)

To avoid memory issues on Google Colab, I enforce a max_length of 100 tokens. Descriptions longer than that are truncated, so some inputs may no longer contain the text that supports every one of their labels.

In [None]:
# model_checkpoint = "distilroberta-base-uncased"

In [None]:
from transformers import AutoTokenizer, BertTokenizer
max_length = 100
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True) # tokenizer
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, do_lower_case=True) # tokenizer
# Pad every sequence to max_length and truncate longer ones explicitly.
# `pad_to_max_length=True` is deprecated, and leaving truncation implicit makes the
# tokenizer emit a "Truncation was not explicitly activated" warning.
encodings = tokenizer.batch_encode_plus(
    comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
) # tokenizer's encoding method
print('tokenizer outputs: ', encodings.keys())

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


tokenizer outputs:  dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])


In [None]:
# Pull the three parallel outputs out of the tokenizer result:
# encoded sentences, token type ids and attention masks.
input_ids, token_type_ids, attention_masks = (
    encodings[field] for field in ('input_ids', 'token_type_ids', 'attention_mask')
)

In [None]:
# Find the rows whose full label combination occurs exactly once in train_df.
# The label vectors are compared through their string form.
label_keys = train_df.one_hot_labels.astype(str)
label_counts = label_keys.value_counts()
one_freq = label_counts[label_counts==1].keys()
is_singleton = label_keys.isin(one_freq).to_numpy()
# Sorted in descending order, so rows can later be popped by position without
# shifting the positions that are still to be popped.
one_freq_idxs = sorted(train_df.index[is_singleton], reverse=True)
print('train_df label indices with only one instance: ', one_freq_idxs)

train_df label indices with only one instance:  [33000, 32998, 32866, 32798, 32720, 32390, 32346, 32180, 31897, 31884, 31809, 31764, 31756, 31737, 31697, 31622, 31393, 31372, 31057, 31011, 30967, 30833, 30629, 30491, 30444, 30289, 30163, 30076, 29831, 29760, 29751, 29740, 29720, 29229, 29057, 29020, 28907, 28832, 28684, 28568, 28330, 28248, 28240, 28224, 28194, 28136, 28086, 28023, 27901, 27826, 27809, 27736, 27575, 27406, 27258, 27199, 26879, 26767, 26234, 26207, 26102, 25993, 25947, 25726, 25460, 25354, 25017, 24996, 24360, 24256, 24232, 24225, 24147, 24099, 23921, 23895, 23406, 23396, 23122, 23044, 22687, 22645, 22636, 22630, 22628, 22508, 22444, 22413, 22322, 22007, 21959, 21902, 21779, 21734, 21632, 21384, 21104, 20680, 20200, 20171, 19943, 19766, 19635, 19426, 19286, 19173, 18884, 18883, 18838, 18691, 18549, 18537, 18374, 18359, 18278, 18215, 18205, 17813, 17590, 17537, 17385, 17283, 17044, 17026, 16989, 16924, 16596, 16427, 16194, 16129, 16046, 15777, 15742, 15696, 15515, 15508,

In [None]:
# Remove the single-occurrence rows from every parallel list (the lists are mutated
# in place) and keep them aside so they can be added to the training set after the split.
def _pop_rows(rows, positions):
  """Pop `positions` from `rows` one after another and return the popped items."""
  return [rows.pop(pos) for pos in positions]

one_freq_input_ids = _pop_rows(input_ids, one_freq_idxs)
one_freq_token_types = _pop_rows(token_type_ids, one_freq_idxs)
one_freq_attention_masks = _pop_rows(attention_masks, one_freq_idxs)
one_freq_labels = _pop_rows(labels, one_freq_idxs)

Be sure to handle all classes during validation using "stratify" during train/validation split:

In [None]:
# Stratified 90/10 train/validation split of the four parallel lists, with a fixed seed.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_token_types, validation_token_types,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, token_type_ids, attention_masks,
    test_size=0.10, random_state=2020, stratify=labels)

# The rows set aside as single-occurrence label combinations go to the training split only.
train_inputs.extend(one_freq_input_ids)
train_labels.extend(one_freq_labels)
train_masks.extend(one_freq_attention_masks)
train_token_types.extend(one_freq_token_types)

# Everything is turned into torch tensors, the datatype the model consumes.
train_inputs, train_labels, train_masks, train_token_types = (
    torch.tensor(values)
    for values in (train_inputs, train_labels, train_masks, train_token_types)
)

validation_inputs, validation_labels, validation_masks, validation_token_types = (
    torch.tensor(values)
    for values in (validation_inputs, validation_labels, validation_masks, validation_token_types)
)

In [None]:
# Mini-batch size used by the DataLoaders below; 32 is used here to avoid memory issues.
# NOTE(review): the previous comment quoted XLNet batch-size guidance, but this notebook
# fine-tunes BERT, whose paper suggests 16 or 32 for fine-tuning - confirm before changing.
batch_size = 32

# Each DataLoader serves the tensors in mini-batches of `batch_size`. The tensors
# themselves are already fully materialised inside the TensorDataset.
# Tensor order inside a batch: input ids, attention masks, labels, token type ids.

train_data = TensorDataset(train_inputs, train_masks, train_labels, train_token_types)
train_sampler = RandomSampler(train_data)  # random order for training
train_dataloader = DataLoader(train_data, batch_size=batch_size, sampler=train_sampler)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels, validation_token_types)
validation_sampler = SequentialSampler(validation_data)  # fixed order for evaluation
validation_dataloader = DataLoader(validation_data, batch_size=batch_size, sampler=validation_sampler)

In [None]:
# Persist both DataLoaders to the current working directory.
for loader, filename in ((validation_dataloader, 'validation_data_loader'),
                         (train_dataloader, 'train_data_loader')):
  torch.save(loader, filename)

## Load Model & Set Params

In [None]:
# Load model, the pretrained model will include a single linear classification layer on top for classification. 
from transformers import AutoModelForSequenceClassification, BertForSequenceClassification
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=num_labels)
# model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
# Move the model to the device selected earlier ("cuda" when available, else "cpu").
# model.cuda() would fail on a CPU-only runtime even though `device` already falls back.
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=570.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

Setting custom optimization parameters for the AdamW optimizer https://huggingface.co/transformers/main_classes/optimizer_schedules.html

In [None]:
# setting custom optimization parameters. You may implement a scheduler here as well.
# Two parameter groups: regular weights get weight decay; biases and LayerNorm weights do not.
#   * The per-group option read by AdamW is 'weight_decay'. A 'weight_decay_rate' key is
#     not recognised, so with it every parameter falls back to the optimizer default.
#   * The LayerNorm parameters of this model are named '...LayerNorm.weight' and
#     '...LayerNorm.bias' (see the module names in the model printout), so they are matched
#     by 'LayerNorm.weight' and 'bias'; the legacy names 'gamma' / 'beta' match nothing.
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'LayerNorm.weight']
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0}
]

In [None]:
from transformers import AdamW
# AdamW over the two parameter groups defined above, learning rate 2e-5, bias correction on.
optimizer = AdamW(
    optimizer_grouped_parameters,
    lr=2e-5,
    correct_bias=True,
)
# optimizer = AdamW(model.parameters(),lr=2e-5)  # Default optimization

## Train Model

In [None]:
# Store the per-step training loss for plotting
train_loss_set = []

# Number of training epochs (authors recommend between 2 and 4)
epochs = 4

# Decision threshold applied to the sigmoid outputs when scoring the validation set
threshold = 0.50

# Multilabel objective: BCEWithLogitsLoss takes the raw logits and float targets.
# It is created once here instead of being rebuilt on every training step.
loss_func = BCEWithLogitsLoss()

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):

  # ------------------------------ Training ------------------------------

  # Set our model to training mode (as opposed to evaluation mode)
  model.train()

  # Tracking variables
  tr_loss = 0 #running loss
  nb_tr_examples, nb_tr_steps = 0, 0

  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Add batch to the selected device
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader (same order as in the TensorDataset)
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()

    # Forward pass for multilabel classification: no labels are passed to the model,
    # the loss is computed here from the logits instead.
    outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    logits = outputs[0]
    loss = loss_func(logits.view(-1,num_labels),b_labels.type_as(logits).view(-1,num_labels)) #convert labels to float for calculation
    step_loss = loss.item()  # read the scalar once and reuse it below
    train_loss_set.append(step_loss)

    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    # scheduler.step()
    # Update tracking variables
    tr_loss += step_loss
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  # max(..., 1) keeps an empty dataloader from raising ZeroDivisionError
  print("Train loss: {}".format(tr_loss/max(nb_tr_steps, 1)))

  # ----------------------------- Validation -----------------------------

  # Put model in evaluation mode to score the validation set
  model.eval()

  # Only what the metrics need is collected, and only as CPU numpy arrays, so no
  # device tensors stay referenced between batches.
  true_labels,pred_labels = [],[]

  # Predict
  for i, batch in enumerate(validation_dataloader):
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels, b_token_types = batch
    with torch.no_grad():
      # Forward pass
      outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
      pred_label = torch.sigmoid(outs[0])

    true_labels.append(b_labels.to('cpu').numpy())
    pred_labels.append(pred_label.to('cpu').numpy())

  # Flatten outputs: one array per example instead of one array per batch
  pred_labels = [item for sublist in pred_labels for item in sublist]
  true_labels = [item for sublist in true_labels for item in sublist]

  # Calculate Accuracy
  pred_bools = [pl>threshold for pl in pred_labels]
  true_bools = [tl==1 for tl in true_labels]
  val_f1_accuracy = f1_score(true_bools,pred_bools,average='micro')*100
  val_flat_accuracy = accuracy_score(true_bools, pred_bools)*100

  print('F1 Validation Accuracy: ', val_f1_accuracy)
  print('Flat Validation Accuracy: ', val_flat_accuracy)

Epoch:   0%|          | 0/4 [00:00<?, ?it/s]

Train loss: 0.3845348054062004


Epoch:  25%|██▌       | 1/4 [08:30<25:31, 510.38s/it]

F1 Validation Accuracy:  50.4163890739507
Flat Validation Accuracy:  16.215393976270153
Train loss: 0.3243265437862625


Epoch:  50%|█████     | 2/4 [17:12<17:07, 513.90s/it]

F1 Validation Accuracy:  57.47195213163799
Flat Validation Accuracy:  18.132035290538482
Train loss: 0.2892650183757402


Epoch:  75%|███████▌  | 3/4 [25:55<08:36, 516.56s/it]

F1 Validation Accuracy:  58.84433128226891
Flat Validation Accuracy:  18.071189534529967
Train loss: 0.2540067004045873


Epoch: 100%|██████████| 4/4 [34:37<00:00, 519.41s/it]

F1 Validation Accuracy:  58.67243867243867
Flat Validation Accuracy:  18.527532704593856





In [None]:
# Save only the model's state dict (not the module object) to the working directory.
fine_tuned_weights = model.state_dict()
torch.save(fine_tuned_weights, 'bert_classification_lm')

In [None]:
# Export the model with save_pretrained into /content/LM; a later cell copies that
# folder to Drive.
model.save_pretrained(save_directory='/content/LM')

In [None]:
!mkdir -p /content/drive/MyDrive/Final_NLP_Project/Part5/bert_classification_lm
!cp -a /content/LM/. /content/drive/MyDrive/Final_NLP_Project/Part5/bert_classification_lm
!cp  /content/bert_classification_lm /content/drive/MyDrive/Final_NLP_Project/Part5/bert_classification_lm

## Load and Preprocess Test Data

In [None]:
# Plain Python lists of the held-out label vectors and description texts
test_labels = list(test_df['one_hot_labels'].values)
test_comments = list(test_df['Description'].values)

In [None]:
# Encoding input data
# Explicit padding/truncation arguments replace the deprecated `pad_to_max_length=True`
# and avoid the tokenizer's "Truncation was not explicitly activated" warning.
test_encodings = tokenizer.batch_encode_plus(
    test_comments,
    max_length=max_length,
    padding='max_length',
    truncation=True,
)
test_input_ids = test_encodings['input_ids']
test_token_type_ids = test_encodings['token_type_ids']
test_attention_masks = test_encodings['attention_mask']



In [None]:
# Convert the four parallel test inputs to tensors (ids, labels, masks, token types)
test_inputs, test_labels, test_masks, test_token_types = (
    torch.tensor(values)
    for values in (test_input_ids, test_labels, test_attention_masks, test_token_type_ids)
)
# Wrap them in a dataset and iterate it in its stored order
test_data = TensorDataset(test_inputs, test_masks, test_labels, test_token_types)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, batch_size=batch_size, sampler=test_sampler)
# Persist the test DataLoader to the working directory
torch.save(test_dataloader, 'test_data_loader')

## Prediction and Metrics

In [None]:
# Test

# Put model in evaluation mode before scoring the held-out test set
model.eval()

#track variables
logit_preds,true_labels,pred_labels,tokenized_texts = [],[],[],[]

# Predict
for i, batch in enumerate(test_dataloader):
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels, b_token_types = batch
  with torch.no_grad():
    # Forward pass
    outs = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    b_logit_pred = outs[0]
    pred_label = torch.sigmoid(b_logit_pred)

  # Everything is moved to the CPU before it is stored, so the accumulated lists do not
  # keep device tensors alive (the input ids are decoded back to text further down).
  tokenized_texts.append(b_input_ids.to('cpu'))
  logit_preds.append(b_logit_pred.detach().cpu().numpy())
  true_labels.append(b_labels.to('cpu').numpy())
  pred_labels.append(pred_label.to('cpu').numpy())

# Flatten outputs: one entry per example instead of one entry per batch
tokenized_texts = [item for sublist in tokenized_texts for item in sublist]
pred_labels = [item for sublist in pred_labels for item in sublist]
true_labels = [item for sublist in true_labels for item in sublist]
# Converting flattened binary values to boolean values
true_bools = [tl==1 for tl in true_labels]

We need to threshold our sigmoid function outputs which range from [0, 1]. Below I use 0.50 as a threshold.

In [None]:
pred_bools = [pl>0.50 for pl in pred_labels] #boolean output after thresholding
# Reuse the label list built during preprocessing instead of slicing test_df by the
# hard-coded column positions 1:13, which only works for exactly 12 labels.
test_label_cols = list(label_cols)
print(test_label_cols)
# Print and save classification report
print('Test F1 Accuracy: ', f1_score(true_bools, pred_bools,average='micro'))
print('Test Flat Accuracy: ', accuracy_score(true_bools, pred_bools),'\n')
clf_report = classification_report(true_bools,pred_bools,target_names=test_label_cols)
# The report string is pickled (the file is not plain text despite its .txt name).
# The context manager closes the handle, so the data is flushed before any later copy.
with open('classification_report.txt','wb') as report_file: #save report
  pickle.dump(clf_report, report_file)
print(clf_report)

['Action', 'Adventure', 'Comedy', 'Drama', 'Fantasy', 'Romance', 'School Life', 'Sci Fi', 'Shoujo', 'Shounen', 'Supernatural', 'Yaoi']
Test F1 Accuracy:  0.5885920030320257
Test Flat Accuracy:  0.18902770233568714 

              precision    recall  f1-score   support

      Action       0.67      0.65      0.66       873
   Adventure       0.62      0.42      0.50       462
      Comedy       0.65      0.48      0.55      1145
       Drama       0.55      0.40      0.46      1111
     Fantasy       0.72      0.65      0.68       819
     Romance       0.74      0.72      0.73      1560
 School Life       0.71      0.46      0.56       657
      Sci Fi       0.61      0.39      0.47       312
      Shoujo       0.59      0.45      0.51       476
     Shounen       0.68      0.18      0.29       489
Supernatural       0.66      0.38      0.48       468
        Yaoi       0.69      0.69      0.69       505

   micro avg       0.67      0.52      0.59      8877
   macro avg       0.66   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Output Dataframe

In [None]:
import pandas as pd
import numpy as np
import json
import re
import csv
import matplotlib.pyplot as plt 
import seaborn as sns
from tqdm import tqdm
import ast
import os
from os import path
from pandas.plotting import table
import nltk
nltk.download("popular")
from google.colab import output
from google.colab import files
import string
output.clear()

%matplotlib inline
pd.set_option('display.max_colwidth', 300)

In [None]:
# Map each output position to its genre name. Built straight from label_cols so the
# mapping always has one entry per label instead of assuming there are exactly 12.
idx2label = dict(enumerate(label_cols))
print(idx2label)

{0: 'Action', 1: 'Adventure', 2: 'Comedy', 3: 'Drama', 4: 'Fantasy', 5: 'Romance', 6: 'School Life', 7: 'Sci Fi', 8: 'Shoujo', 9: 'Shounen', 10: 'Supernatural', 11: 'Yaoi'}


In [None]:
# For every example, collect the positions at which its boolean label vector is True;
# idx2label is used afterwards to turn these positions into label names.
true_label_idxs = [np.nonzero(flags)[0].ravel().tolist() for flags in true_bools]
pred_label_idxs = [np.nonzero(flags)[0].ravel().tolist() for flags in pred_bools]

In [None]:
# Translate index lists into lists of label names via idx2label
def _names_for(index_lists):
  """Map every index list to label names; an empty list is passed through as-is."""
  return [[idx2label[i] for i in idxs] if idxs else idxs for idxs in index_lists]

true_label_texts = _names_for(true_label_idxs)
pred_label_texts = _names_for(pred_label_idxs)

In [None]:
# Turn each sequence of input ids back into text, skipping special tokens
comment_texts = []
for token_ids in tokenized_texts:
  decoded = tokenizer.decode(token_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
  comment_texts.append(decoded)

In [None]:
# Side-by-side table of decoded text, true labels and predicted labels,
# written to CSV and previewed as the cell output.
comparison_columns = {
    'comment_text': comment_texts,
    'true_labels': true_label_texts,
    'pred_labels': pred_label_texts,
}
comparisons_df = pd.DataFrame(comparison_columns)
comparisons_df.to_csv('comparisons.csv')
comparisons_df.head(20)

Unnamed: 0,comment_text,true_labels,pred_labels
0,merry server desperate need money . one day blue peter one customer asked pretend girlfriend thanksgiving holiday . seemed serious money would certainly help merry accepted proposal . went visit family sweet mother welcomed merry open arm . believed merry son going get married . merry concerned ...,[Romance],[Romance]
1,sugita handsome skilled newcomer company get along everyone everyone except petite assistant manager yumihara tend strict . one day lunch break sugita find small secluded coffee shop named dead copy feature special table dedicated playing retro game . surprise also spot yumihara completely immer...,"[Comedy, Romance]",[Yaoi]
2,kashiwagi koichi recently moved cousin house father barely talked recent year pass away . cousin four female sibling ranging age adult young child chizuru azusa kaede hatsune . parent died year ago koichi father girl uncle became guardian . koichi fit perfectly household although kaede seems act...,[Drama],[Supernatural]
3,man love yeena yoon world gunho yoo . girl scared gunho yoo world yeena yoon . love letter meant brother juhyung yoo end hand scary gunho . could even correct situation reply accept confession,"[Drama, Romance]","[Romance, Shoujo]"
4,girl wealthy family accidentally lost virginity . get insulted party waking next morning . swore get revenge got closer instead,"[Drama, Romance]","[Drama, Romance]"
5,death criminal lived jail country palom entering mad game progress . battle freedom . meaning freedom . spill scum blood like butcher massacring people flavor freedom . swore use legend sword hand breaking apart shackle world,"[Action, Adventure, Fantasy]","[Action, Fantasy]"
6,ippo ready face greatest test thus far boxer next opponent completely unlike anyone fought . cool calculating possessing intelligence rivaled ferocity brain ippo brawn . make matter worse old rival coach kamogawa past returned guide ippo genius opponent brings knowledge secret attack supposed un...,"[Action, Drama, Shounen]",[Drama]
7,lone mercenary traveling foreign land forced participate street brawl fighting back back gun toting man wearing pelt around shoulder . emerge attack unscathed man order become bodyguard . mercenary turned bodyguard learn surprising fact spirited fighter king near daily assassination attempt maje...,"[Action, Fantasy]",[Action]
8,slapstick comedy three pretty girl struggling earn living pilot iron goblin delivery vessel . computer answer back space pirate tail romantic entanglement client cause friction trio . final episode throw alien eterna race mix,"[Comedy, Sci Fi]",[Comedy]
9,unfair heaven strange world wing forged flight god forsaken duck chicken ostrich . teasing dream soaring sky confining earth . world everyone born god blessing anomaly . ergera outclass god abandoned whilst simultaneously saving people love,"[Action, Romance, Supernatural]","[Adventure, Comedy, Fantasy]"


In [None]:
# Plain-text dump of the comparison table; every field is written with the "%s" format.
numpy_array = comparisons_df.to_numpy()
np.savetxt(fname="comparisons_df.txt", X=numpy_array, fmt="%s")

In [None]:
!cp /content/comparisons.csv /content/drive/MyDrive/Final_NLP_Project/Part5/bert_classification_lm

## Optimizing threshold value for micro F1 score

Doing this may result in a trade-off between precision, flat accuracy, and micro F1 score. You may tune the threshold however you want.

In [None]:
# Calculate Accuracy - maximize micro F1 by tuning the decision threshold with a
# coarse-to-fine sweep: first 'macro_thresholds' in steps of 1e-1, then 'micro_thresholds'
# in steps of 1e-2 around the best coarse value.
# NOTE(review): the sweep is scored on the test predictions, so the tuned numbers are
# optimistic; tuning on the validation split would keep the test estimate unbiased.

def _sweep_thresholds(thresholds):
  """Score pred_labels against true_bools at each threshold.

  Returns two lists aligned with `thresholds`: micro F1 scores and flat accuracies.
  """
  f1_scores, flat_accs = [], []
  for th in thresholds:
    bools = [pl>th for pl in pred_labels]
    f1_scores.append(f1_score(true_bools,bools,average='micro'))
    flat_accs.append(accuracy_score(true_bools, bools))
  return f1_scores, flat_accs

macro_thresholds = np.array(range(1,10))/10

f1_results, flat_acc_results = _sweep_thresholds(macro_thresholds)

best_macro_th = macro_thresholds[np.argmax(f1_results)] #best macro threshold value

# Fine grid on BOTH sides of the coarse optimum (best-0.09 ... best+0.09). Searching only
# upwards from it would never try the values just below the coarse optimum.
# Rounding keeps the candidates free of float noise such as 0.30000000000000004.
micro_thresholds = np.round(best_macro_th + np.arange(-9, 10)/100, 2) #calculating micro threshold values

f1_results, flat_acc_results = _sweep_thresholds(micro_thresholds)

best_f1_idx = np.argmax(f1_results) #best threshold value

# Printing and saving classification report
print('Best Threshold: ', micro_thresholds[best_f1_idx])
print('Test F1 Accuracy: ', f1_results[best_f1_idx])
print('Test Flat Accuracy: ', flat_acc_results[best_f1_idx], '\n')

best_pred_bools = [pl>micro_thresholds[best_f1_idx] for pl in pred_labels]
clf_report_optimized = classification_report(true_bools,best_pred_bools, target_names=label_cols)
# The report string is pickled; the context manager closes (and flushes) the file so the
# copy to Drive in the next cell sees the complete content.
with open('classification_report_optimized.txt','wb') as report_file:
  pickle.dump(clf_report_optimized, report_file)
print(clf_report_optimized)

Best Threshold:  0.3
Test F1 Accuracy:  0.6308065932930295
Test Flat Accuracy:  0.15616512764801738 

              precision    recall  f1-score   support

      Action       0.61      0.77      0.68       873
   Adventure       0.55      0.63      0.59       462
      Comedy       0.55      0.70      0.62      1145
       Drama       0.47      0.68      0.55      1111
     Fantasy       0.66      0.74      0.70       819
     Romance       0.66      0.85      0.74      1560
 School Life       0.66      0.61      0.64       657
      Sci Fi       0.56      0.57      0.57       312
      Shoujo       0.50      0.57      0.53       476
     Shounen       0.50      0.40      0.44       489
Supernatural       0.57      0.50      0.53       468
        Yaoi       0.63      0.75      0.68       505

   micro avg       0.58      0.69      0.63      8877
   macro avg       0.58      0.65      0.61      8877
weighted avg       0.58      0.69      0.63      8877
 samples avg       0.60      0.6

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
!cp /content/classification_report_optimized.txt /content/drive/MyDrive/Final_NLP_Project/Part5/bert_classification_lm