### Step 1.1 : Data Viewing and Simple Preprocessing

In [None]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers.data.processors.utils import InputExample

In [None]:
raw_data = pd.read_csv('./data/raw_data.csv')
raw_data.head()

In [None]:
# This code is used for data statistics only; there is no need to uncomment it.

# print(raw_data['count'].median())

# raw_data['len_text'] =raw_data.text_comments.apply(lambda x: len(x.split()))

# print(raw_data['len_text'].median())
# bins = [0,50,100,150,200,250,300,350,400,450,500]
# groups = pd.cut(raw_data['len_text'],bins,right=True)
# pd.value_counts(groups).sort_index()

In [None]:
raw_data.shape[0]

In [None]:
### IMPORTANT ###
# You may change 'model_path' to save and load different trained models.
# Available options: 'text_comments','text_only','comments_only','comments_group1','comments_group2','comments_group3','natural_split'.
# Please make sure that your 'model_path' matches the corresponding data and comments.
# For more details, please check the 'README.md' file.

model_path = 'text_comments'

In [None]:
## Different Number of Comments ##

# Please uncomment the corresponding lines if the 'model_path' is 'comments_groupX'.

# print(raw_data['count'].describe(percentiles=[0.33,0.67]))

# For'comments_group1'.
# raw_data = raw_data[raw_data['count'] <= 7]
# raw_data.shape

# For'comments_group2'.
# raw_data = raw_data[raw_data['count'] > 7]
# raw_data = raw_data[raw_data['count'] <= 18]
# raw_data.shape

# For'comments_group3'.
# raw_data = raw_data[raw_data['count'] > 18]
# raw_data.shape


In [None]:
## Data Selection ##

# You may change 'text_comments' to 'text_only' or 'comments_only' with the corresponding 'model_path' to get more experiment results.

raw_data = raw_data[['text_comments','label']]
raw_data = raw_data.rename(columns = {'text_comments':'text'})

# raw_data = raw_data[['text_only','label']]
# raw_data = raw_data.rename(columns = {'text_only':'text'})

# raw_data = raw_data[['comments_only','label']]
# raw_data = raw_data.rename(columns = {'comments_only':'text'})

raw_data.head()

In [None]:
raw_data = raw_data.dropna(axis=0)
raw_data.shape

In [None]:
raw_data['label'] = LabelEncoder().fit_transform(raw_data['label'])
raw_data.head()

In [None]:
# Shuffle the rows with a fixed seed. The split below is seeded
# (random_state=35), but an unseeded shuffle in front of it would still give a
# different train/val partition on every notebook run -- which matters when a
# previously trained model is reloaded and evaluated on "val".
# DataFrame.sample returns a new frame, so raw_data itself is left untouched.
data = raw_data.sample(frac=1, random_state=35)
data.head(10)

In [None]:
train, val = train_test_split(data, test_size=0.2, random_state=35)

In [None]:
train.reset_index(drop=True, inplace=True)
val.reset_index(drop=True, inplace=True)

train.head(10)

In [None]:
train.shape,val.shape

### Step 1.2 : Split the Dataset into Segments

In [None]:
from utils import get_split,get_natural_split

In [None]:
train_tmp = train.copy()

### IMPORTANT ###
# If your 'model_path' is 'natural_split' please use 'get_natural_split' function rather than 'get_split'.

train_tmp['text_split'] = train['text'].apply(get_split)
# train_tmp['text_split'] = train['text'].apply(get_natural_split)
train = train_tmp
train.head()

In [None]:
val_tmp = val.copy()

### IMPORTANT ###
# If your 'model_path' is 'natural_split' please use 'get_natural_split' function rather than 'get_split'.

val_tmp['text_split'] = val['text'].apply(get_split)
# val_tmp['text_split'] = val['text'].apply(get_natural_split)
val = val_tmp
val.head()

In [None]:
# Flatten the per-document segment lists into three parallel lists.
train_l = []   # one entry per text segment
label_l = []   # label of the document each segment came from
index_l = []   # row index (in `train`) of the document each segment came from
for doc_idx, doc in train.iterrows():
    doc_segments = list(doc['text_split'])
    train_l.extend(doc_segments)
    # Repeat the document-level label / index once per segment.
    label_l.extend([doc['label']] * len(doc_segments))
    index_l.extend([doc_idx] * len(doc_segments))
len(train_l), len(label_l), len(index_l)

In [None]:
# Flatten the per-document segment lists of the validation split into three
# parallel lists (segment text, document label, document row index).
val_l = []
val_label_l = []
val_index_l = []
for doc_idx, doc in val.iterrows():
    doc_segments = list(doc['text_split'])
    val_l.extend(doc_segments)
    # Repeat the document-level label / index once per segment.
    val_label_l.extend([doc['label']] * len(doc_segments))
    val_index_l.extend([doc_idx] * len(doc_segments))
len(val_l), len(val_label_l), len(val_index_l)

In [None]:
train_df = pd.DataFrame({'text':train_l, 'label':label_l})
train_df.head()

In [None]:
val_df = pd.DataFrame({'text':val_l, 'label':val_label_l})
val_df.head()

In [None]:
train_InputExamples = train_df.apply(lambda x: InputExample(guid=None,text_a = x['text'], text_b = None, label = x['label']), axis = 1)

val_InputExamples = val_df.apply(lambda x: InputExample(guid=None, text_a = x['text'], text_b = None, label = x['label']), axis = 1)


### Step 2 : Define Models For Bert Classification

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

import torch.optim as optim
from torchtext.data import BucketIterator

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    BertweetTokenizer,
    AutoModel,
    AutoTokenizer
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers.data.processors.utils import InputExample, DataProcessor

import logging

logger=logging.getLogger(__name__)


In [None]:
MODEL_CLASSES={
    "bert":(BertConfig,BertTokenizer),
    "bertweet":(BertConfig,BertweetTokenizer)
}

my_label_list=[0, 1]
MAX_SEQ_LENGTH=200

In [None]:
class BertForClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classification head.

    The number of output classes is fixed to 2 in ``__init__``.
    """

    def __init__(self, config):
        """Build the encoder, dropout and classifier from ``config``.

        Reads ``config.hidden_dropout_prob`` and ``config.hidden_size``.
        """
        super().__init__(config)
        self.num_labels = 2

        # Attribute names are kept as-is: they determine the parameter names
        # used when weights are saved and loaded.
        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and the classification head.

        Returns:
            ``(logits, pooled_output, sequence_output)`` when ``labels`` is
            None, otherwise ``(loss, logits, pooled_output, sequence_output)``.
            ``pooled_output`` is the value *after* dropout has been applied.
        """
        encoder_outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        sequence_output = encoder_outputs[0]
        pooled_output = self.dropout(encoder_outputs[1])
        logits = self.classifier(pooled_output)

        result = (logits, pooled_output, sequence_output,)
        if labels is None:
            return result

        if self.num_labels == 1:
            # Single output unit: treated as regression.
            loss = MSELoss()(logits.view(-1), labels.view(-1))
        else:
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result  # loss, logits, pooled_output, sequence_output


### Step 3.1 : Load Pre-training Models & Prepare Training Data

In [None]:
# # Load Pre-training Models
# args={"model_name_or_path": "bert-base-uncased",
#     "config_name": "bert-base-uncased",
#     "tokenizer_name": "bert-base-uncased",
#       }

# config_class, tokenizer_class = MODEL_CLASSES["bert"]
# model_class=BertForClassification


# config = config_class.from_pretrained(
#     args["config_name"],
#     finetuning_task="", 
#     cache_dir=None,
# )
# tokenizer = tokenizer_class.from_pretrained(
#     args["tokenizer_name"],
#     do_lower_case=True,
#     cache_dir=None,
# )
# model = model_class.from_pretrained(
#     args["model_name_or_path"],
#     from_tf=bool(".ckpt" in args["model_name_or_path"]),
#     config=config,
#     cache_dir=None,
# )


# model.to("cuda")

In [None]:
# # Prepare Training Data
# train_features = convert_examples_to_features(train_InputExamples,tokenizer, label_list=my_label_list, 
#                                               output_mode="classification", max_length=MAX_SEQ_LENGTH )

In [None]:
# input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
# attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
# token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
# the_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)


# dataset = TensorDataset(input_ids, attention_mask, token_type_ids, the_labels)

### Step 3.2 : Train & Save Models

In [None]:
# # Define Train Function For Bert Classification

# def train(train_dataset,model,tokenizer):
#     no_decay=["bias","LayerNorm.weight"]
#     optimizer_grouped_parameters=[
#         {
#             "params":[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#             "weight_decay":0.0,

#         },
#         {
#             "params": [p for n,p in model.named_parameters() if any(nd in n for nd in no_decay)],
#             "weight_decay":0.0
#         },
#     ]

    
#     t_total=len(train_dataset)// 5
#     optimizer=AdamW(optimizer_grouped_parameters,lr=2e-5,eps=1e-8)
    
#     scheduler=get_linear_schedule_with_warmup(
#         optimizer,num_warmup_steps=0,num_training_steps=t_total
#         )
    
    
    
#     # *********************
#     logger.info("*****Running training*****")
#     logger.info("  Num examples = %d", len(train_dataset))
#     logger.info("  Num Epochs = %d", 5)


#     epochs_trained=0
#     global_step=0
#     steps_trained_in_current_epoch=0

#     tr_loss,logging_loss=0.0,0.0
#     model.zero_grad()
#     train_iterator=trange(epochs_trained,5,desc="Epoch",disable=False)


#     for k in train_iterator: #5 epoch
    
#         train_sampler=RandomSampler(train_dataset)
#         train_dataloader=DataLoader(train_dataset,sampler=train_sampler,batch_size=16)
#         epoch_iterator=tqdm(train_dataloader,desc="Iteration",disable=False)

#         for step,batch in enumerate(epoch_iterator): 
#             if steps_trained_in_current_epoch>0:
#                 steps_trained_in_current_epoch-=1
#                 continue

#             model.train()
#             batch=tuple(t.to("cuda") for t in batch)
            
#             inputs={"input_ids": batch[0],"attention_mask": batch[1],"token_type_ids": batch[2], "labels": batch[3]}
   
#             outputs = model(**inputs)
#             loss=outputs[0]
 
#             loss.backward()

#             tr_loss+=loss.item()
#             if (step+1)%1==0:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

#                 optimizer.step()
#                 scheduler.step()
#                 model.zero_grad()
#                 global_step+=1

#         logger.info("average loss:" +str(tr_loss/global_step))


#     return global_step,tr_loss/global_step

In [None]:
# # Start Training

# torch.cuda.empty_cache()
# train(dataset,model,tokenizer)

In [None]:
# # Save Trained Model Parameters

# import os
# model.save_pretrained("./trained_models/classification_models_" + model_path)
# tokenizer.save_pretrained("./trained_models/classification_models_" + model_path)

# torch.save(args,os.path.join("./trained_models/classification_models_" + model_path,"training_args.bin"))

### Step 4.1 : Load the Trained Model & Prepare Data for Bert Model Evaluation

In [None]:
# Start Loading the trained model data

args_eval={"model_name_or_path": "./trained_models/classification_models_" + model_path,
    "config_name": "./trained_models/classification_models_" + model_path,
    "tokenizer_name": "./trained_models/classification_models_" + model_path,
      }

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class=BertForClassification


config = config_class.from_pretrained(
    args_eval["config_name"],
    finetuning_task="", 
    cache_dir=None,
)
tokenizer = tokenizer_class.from_pretrained(
    args_eval["tokenizer_name"],
    do_lower_case=True,
    cache_dir=None,
)
model = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=bool(".ckpt" in args_eval["model_name_or_path"]),
    config=config,
    cache_dir=None,
)


model.to("cuda")

In [None]:
# Prepare Data for Evaluation

val_features = convert_examples_to_features(val_InputExamples, tokenizer, label_list=my_label_list, output_mode="classification",  max_length=MAX_SEQ_LENGTH )


val_input_ids = torch.tensor([f.input_ids for f in val_features], dtype=torch.long)
val_attention_mask = torch.tensor([f.attention_mask for f in val_features], dtype=torch.long)
val_token_type_ids = torch.tensor([f.token_type_ids for f in val_features], dtype=torch.long)
val_the_labels = torch.tensor([f.label for f in val_features], dtype=torch.long)


eval_dataset = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)

### Step 4.2 : Bert Classification Model Evaluation

In [None]:
from sklearn.metrics import f1_score

In [None]:
def evaluate(model, tokenizer, eval_dataset):
    """Evaluate a classification model on ``eval_dataset``.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose first two outputs are ``(loss, logits)``.
        tokenizer: Unused; kept so existing call sites keep working.
        eval_dataset: Dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, labels)`` tensors.

    Returns:
        Tuple ``(accuracy, f1, eval_loss)`` where ``eval_loss`` is the mean of
        the per-batch losses.

    Raises:
        ValueError: If ``eval_dataset`` is empty.
    """
    batch_size = 16

    if len(eval_dataset) == 0:
        # Previously this surfaced as an unexplained ZeroDivisionError.
        raise ValueError("eval_dataset is empty; nothing to evaluate.")

    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", batch_size)

    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # The metrics do not depend on sample order, so a deterministic sequential
    # pass is used rather than a random one.
    eval_dataloader = DataLoader(
        eval_dataset, sampler=SequentialSampler(eval_dataset), batch_size=batch_size
    )

    total_loss = 0.0
    logit_chunks = []
    label_chunks = []

    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            # batch[2] (token_type_ids) is intentionally not forwarded, as before.
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            tmp_eval_loss, logits = model(**inputs)[:2]

        total_loss += tmp_eval_loss.mean().item()
        # Collect per-batch arrays and concatenate once at the end; growing an
        # array with np.append copies everything on every batch.
        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs["labels"].detach().cpu().numpy())

    eval_loss = total_loss / len(logit_chunks)

    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    accuracy, f1 = acc_and_f1(preds, out_label_ids)

    return accuracy, f1, eval_loss

In [None]:
def simple_accuracy(preds, labels):
    """Return the fraction of positions where ``preds`` equals ``labels``."""
    matches = preds == labels
    return matches.mean()

def acc_and_f1(preds, labels):
    """Return ``(accuracy, f1)`` for predictions ``preds`` against ``labels``."""
    accuracy = simple_accuracy(preds, labels)
    return accuracy, f1_score(y_true=labels, y_pred=preds)


In [None]:
accuracy,f1 ,eval_loss = evaluate(model, tokenizer, eval_dataset)

print("Accuracy: ",accuracy, "F1 Score: ",f1,"Loss: ",eval_loss)

### Step 5.1 : Get Text Embeddings & Combine Embeddings with Labels

In [None]:
def get_prediction(model, tokenizer, dataset):
    """Collect the model's pooled output for every item of ``dataset``, in order.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output at index 2 is the pooled representation
            (index 0 being the loss, since labels are passed).
        tokenizer: Unused; kept so existing call sites keep working.
        dataset: Dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, labels)`` tensors.

    Returns:
        A NumPy array with one row per dataset item, in dataset order, or
        ``None`` if the dataset yields no batches.
    """
    batch_size = 32

    logger.info("***** Running prediction  *****")
    logger.info("  Num examples = %d", len(dataset))
    # Log the batch size actually used (the message previously claimed 16).
    logger.info("  Batch size = %d", batch_size)

    # Run on whatever device the model already lives on instead of assuming CUDA.
    device = next(model.parameters()).device

    # Sequential order matters: callers zip the rows with per-segment indices.
    dataloader = DataLoader(dataset, sampler=SequentialSampler(dataset), batch_size=batch_size)

    pooled_chunks = []
    model.eval()
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            outputs = model(**inputs)

        pooled_chunks.append(outputs[2].detach().cpu().numpy())

    if not pooled_chunks:
        return None

    # Concatenate once at the end; growing an array with np.append copies
    # everything on every batch.
    return np.concatenate(pooled_chunks, axis=0)


In [None]:
args_eval={"model_name_or_path": "./trained_models/classification_models_" + model_path,
    "config_name": "./trained_models/classification_models_" + model_path,
    "tokenizer_name": "./trained_models/classification_models_" + model_path,
      }


config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class=BertForClassification


config = config_class.from_pretrained(
    args_eval["config_name"],
    finetuning_task="", 
    cache_dir=None,
)
tokenizer = tokenizer_class.from_pretrained(
    args_eval["tokenizer_name"],
    do_lower_case=True,
    cache_dir=None,
)
model = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=bool(".ckpt" in args_eval["model_name_or_path"]),
    config=config,
    cache_dir=None,
)


model.to("cuda")

In [None]:
train_features = convert_examples_to_features(train_InputExamples,tokenizer, label_list=my_label_list, output_mode="classification", max_length=MAX_SEQ_LENGTH )

val_features = convert_examples_to_features(val_InputExamples, tokenizer, label_list=my_label_list, output_mode="classification",  max_length=MAX_SEQ_LENGTH )

In [None]:
train_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
train_attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
train_token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
train_the_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_token_type_ids, train_the_labels)

In [None]:
val_input_ids = torch.tensor([f.input_ids for f in val_features], dtype=torch.long)
val_attention_mask = torch.tensor([f.attention_mask for f in val_features], dtype=torch.long)
val_token_type_ids = torch.tensor([f.token_type_ids for f in val_features], dtype=torch.long)
val_the_labels = torch.tensor([f.label for f in val_features], dtype=torch.long)

val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)

In [None]:
train_pooled_outputs = get_prediction(model, tokenizer, train_dataset)
train_pooled_outputs.shape

In [None]:
val_pooled_outputs = get_prediction(model, tokenizer, val_dataset)
val_pooled_outputs.shape

In [None]:
# Method 1: Feature Concatenation
# Group the segment embeddings by the index of the document they came from
# (dict insertion order keeps documents in first-seen order).
train_x = {}
for l, emb in zip(index_l, train_pooled_outputs):
    train_x.setdefault(l, []).append(emb)

# Stack each document's segments once, instead of re-stacking after every
# segment. Single-segment documents stay a one-element list, as before.
train_x = {k: (segs if len(segs) == 1 else np.vstack(segs)) for k, segs in train_x.items()}

train_l_final = list(train_x.values())
label_l_final = [train.loc[k, 'label'] for k in train_x]

df_train = pd.DataFrame({'emb': train_l_final, 'label': label_l_final})
df_train.head(10)

In [None]:
# Method 1: Feature Concatenation
# Group the segment embeddings by the index of the document they came from
# (dict insertion order keeps documents in first-seen order).
val_x = {}
for l, emb in zip(val_index_l, val_pooled_outputs):
    val_x.setdefault(l, []).append(emb)

# Stack each document's segments once, instead of re-stacking after every
# segment. Single-segment documents stay a one-element list, as before.
val_x = {k: (segs if len(segs) == 1 else np.vstack(segs)) for k, segs in val_x.items()}

val_l_final = list(val_x.values())
vlabel_l_final = [val.loc[k, 'label'] for k in val_x]

df_val = pd.DataFrame({'emb': val_l_final, 'label': vlabel_l_final})
df_val.head(10)

In [None]:
# # Method 2: Feature Average Pooling
# train_x = {}
# for l, emb in zip(index_l, train_pooled_outputs):
#     if l in train_x.keys():
#         train_x[l]  =np.vstack([train_x[l], emb])
#     else:
#         train_x[l] = [emb]

# for l in train_x.keys():
#     # print(len(train_x[l]))
#     train_x[l] = [np.mean(train_x[l],axis=0)]

# train_l_final = []
# label_l_final = []
# for k in train_x.keys():
#     train_l_final.append(train_x[k])
#     label_l_final.append(train.loc[k]['label'])

# df_train = pd.DataFrame({'emb': train_l_final, 'label': label_l_final})
# df_train.head(10)

In [None]:
# # Method 2: Feature Average Pooling
# val_x = {}

# for l, emb in zip(val_index_l, val_pooled_outputs):
#     if l in val_x.keys():
#         val_x[l]  =np.vstack([val_x[l], emb])
#     else:
#         val_x[l] = [emb]

# for l in val_x.keys():
#     val_x[l] = [np.mean(val_x[l],axis=0)]

# val_l_final = []
# vlabel_l_final = []
# for k in val_x.keys():
#     val_l_final.append(val_x[k])
#     vlabel_l_final.append(val.loc[k]['label'])

# df_val = pd.DataFrame({'emb': val_l_final, 'label': vlabel_l_final})
# df_val.head(10)

In [None]:
# # Method 3: Feature Max Pooling
# train_x = {}
# for l, emb in zip(index_l, train_pooled_outputs):
#     if l in train_x.keys():
#         train_x[l]  =np.vstack([train_x[l], emb])
#     else:
#         train_x[l] = [emb]

# for l in train_x.keys():
#     # print(len(train_x[l]))
#     train_x[l] = [np.max(train_x[l],axis=0)]

# train_l_final = []
# label_l_final = []
# for k in train_x.keys():
#     train_l_final.append(train_x[k])
#     label_l_final.append(train.loc[k]['label'])

# df_train = pd.DataFrame({'emb': train_l_final, 'label': label_l_final})
# df_train.head(10)

In [None]:
# # Method 3: Feature Max Pooling
# val_x = {}

# for l, emb in zip(val_index_l, val_pooled_outputs):
#     if l in val_x.keys():
#         val_x[l]  =np.vstack([val_x[l], emb])
#     else:
#         val_x[l] = [emb]

# for l in val_x.keys():
#     val_x[l] = [np.max(val_x[l],axis=0)]

# val_l_final = []
# vlabel_l_final = []
# for k in val_x.keys():
#     val_l_final.append(val_x[k])
#     vlabel_l_final.append(val.loc[k]['label'])

# df_val = pd.DataFrame({'emb': val_l_final, 'label': vlabel_l_final})
# df_val.head(10)

In [None]:
df_val, df_test = train_test_split(df_val, test_size=0.4, random_state=35)

In [None]:
df_train.shape, df_val.shape, df_test.shape

### Step 5.2 : Prepare Data for Classification Model

In [None]:
batch_dict = {
    'text_comments':[[7,663],[3,232],[5,93]],
    'text_only':[[7,663],[3,232],[5,93]],
    'comments_only':[[4,1088],[4,163],[4,109]],
    'comments_group1':[[4,387],[4,58],[5,31]],
    'comments_group2':[[4,398],[1,239],[4,40]],
    'comments_group3':[[5,300],[5,45],[1,151]],
    'natural_split':[[7,663],[3,232],[5,93]],
}

batches = batch_dict[model_path]

In [None]:
def train_generator(df, batch_size = batches[0][0], batches_per_epoch = batches[0][1]):
    """Yield padded ``(x, y)`` training batches forever.

    Args:
        df: DataFrame with an ``emb`` column (one sequence of segment
            embeddings per row) and a ``label`` column.
        batch_size: Number of rows per batch.
        batches_per_epoch: Number of batches per pass over ``df``;
            ``batch_size * batches_per_epoch`` must equal ``len(df)``.

    Yields:
        ``(x_train, y_train)`` where ``x_train`` has shape
        ``(batch_size, timesteps, 768)`` padded with ``-99.`` and ``timesteps``
        is the longest sequence in that batch, and ``y_train`` has shape
        ``(batch_size, 1)``.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()
    num_sequences = len(x_list)
    assert batch_size * batches_per_epoch == num_sequences, \
        "batch_size * batches_per_epoch must equal the number of rows"
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch):
            start = b * batch_size
            batch_x = x_list[start:start + batch_size]
            timesteps = max(len(seq) for seq in batch_x)
            # -99. is the padding value.
            x_train = np.full((batch_size, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size, 1))
            for i, seq in enumerate(batch_x):
                x_train[i, 0:len(seq), :] = seq
                y_train[i] = y_list[start + i]
            yield x_train, y_train

def val_generator(df,batch_size_val=batches[1][0],batches_per_epoch_val=batches[1][1]):
    """Yield padded ``(x, y)`` validation batches forever.

    Args:
        df: DataFrame with an ``emb`` column (one sequence of segment
            embeddings per row) and a ``label`` column.
        batch_size_val: Number of rows per batch.
        batches_per_epoch_val: Number of batches per pass over ``df``;
            ``batch_size_val * batches_per_epoch_val`` must equal ``len(df)``.

    Yields:
        ``(x_val, y_val)`` where ``x_val`` has shape
        ``(batch_size_val, timesteps, 768)`` padded with ``-99.`` and
        ``timesteps`` is the longest sequence in that batch, and ``y_val`` has
        shape ``(batch_size_val, 1)``.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()
    num_sequences_val = len(x_list)
    assert batch_size_val * batches_per_epoch_val == num_sequences_val, \
        "batch_size_val * batches_per_epoch_val must equal the number of rows"
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch_val):
            start = b * batch_size_val
            batch_x = x_list[start:start + batch_size_val]
            # Pad to the longest sequence of *this* batch. The previous
            # hard-coded 31-row window looked at rows from other batches
            # (over-padding) and would miss rows for batch sizes above 31.
            timesteps = max(len(seq) for seq in batch_x)
            x_val = np.full((batch_size_val, timesteps, num_features), -99.)
            y_val = np.zeros((batch_size_val, 1))
            for i, seq in enumerate(batch_x):
                x_val[i, 0:len(seq), :] = seq
                y_val[i] = y_list[start + i]
            yield x_val, y_val

def test_generator(df,batch_size_test=batches[2][0],batches_per_epoch_test=batches[2][1]):
    """Yield padded ``(x, y)`` test batches forever.

    Args:
        df: DataFrame with an ``emb`` column (one sequence of segment
            embeddings per row) and a ``label`` column.
        batch_size_test: Number of rows per batch.
        batches_per_epoch_test: Number of batches per pass over ``df``;
            ``batch_size_test * batches_per_epoch_test`` must equal ``len(df)``.

    Yields:
        ``(x_test, y_test)`` where ``x_test`` has shape
        ``(batch_size_test, timesteps, 768)`` padded with ``-99.`` and
        ``timesteps`` is the longest sequence in that batch, and ``y_test`` has
        shape ``(batch_size_test, 1)``.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()
    num_sequences_test = len(x_list)
    assert batch_size_test * batches_per_epoch_test == num_sequences_test, \
        "batch_size_test * batches_per_epoch_test must equal the number of rows"
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch_test):
            start = b * batch_size_test
            batch_x = x_list[start:start + batch_size_test]
            # Pad to the longest sequence of *this* batch. The previous
            # hard-coded 31-row window looked at rows from other batches
            # (over-padding) and would miss rows for batch sizes above 31.
            timesteps = max(len(seq) for seq in batch_x)
            x_test = np.full((batch_size_test, timesteps, num_features), -99.)
            y_test = np.zeros((batch_size_test, 1))
            for i, seq in enumerate(batch_x):
                x_test[i, 0:len(seq), :] = seq
                y_test[i] = y_list[start + i]
            yield x_test, y_test

In [None]:
train_data = train_generator(df_train)
val_data = val_generator(df_val)
test_data = test_generator(df_test)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def cul_all_metrics(y_true, y_pred, pos_label=1):
    """Compute accuracy, precision, recall and F1, each rounded to 5 decimals.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.
        pos_label: Label treated as the positive class for precision, recall
            and F1.

    Returns:
        Dict with keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"`` mapping to floats.
    """
    return {"accuracy": float("%.5f" % accuracy_score(y_true=y_true, y_pred=y_pred)),
            "precision": float("%.5f" % precision_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
            "recall": float("%.5f" % recall_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
            # pos_label is forwarded here too, so F1 stays consistent with the
            # precision and recall reported beside it.
            "f1-score": float("%.5f" % f1_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
           }

### Step 6.1 : Train & Save LSTM Model For Classification

In [None]:
import tensorflow as tf
from tensorflow import keras
import h5py

text_input = keras.Input(shape=(None,768,), dtype='float32', name='text')

# keras.layers.Masking(mask_value=0.0)
l_mask = keras.layers.Masking(mask_value=-99.)(text_input) 

# Which we encoded in a single vector via a LSTM
encoded_text = keras.layers.LSTM(100,)(l_mask)
out_dense = keras.layers.Dense(30, activation='relu')(encoded_text)
# And we add a softmax classifier on top
out = keras.layers.Dense(2, activation='softmax')(out_dense)
# At model instantiation, we specify the input and the output:
model = keras.Model(text_input, out)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

In [None]:
call_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [None]:
batches_per_epoch = batches[0][1]

batches_per_epoch_val= batches[1][1]

model.fit(train_data, steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_data, validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

In [None]:
# save_path = "./trained_models/classification_models_" + model_path + "/LSTM_model/model.h5"

In [None]:
# model.save(save_path)

### Step 6.2 : Evaluate LSTM Model For Classification

In [None]:
# model = tf.keras.models.load_model(save_path)

In [None]:
batches_per_epoch_test = batches[2][1]
# Model.predict accepts a Python generator directly (as model.fit already does
# above); Model.predict_generator is deprecated.
pred = model.predict(test_data, steps=batches_per_epoch_test)

In [None]:
pred = np.argmax(pred,axis=1).tolist()
label = df_test.label.to_list()

cul_all_metrics(label,pred)

### Step 7.1 : Train & Save Transformer Model For Classification

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
class MultiHeadSelfAttention(keras.layers.Layer):
    """Multi-head scaled dot-product self-attention over a sequence.

    Queries, keys and values are all projected from the same input.
    """

    def __init__(self, embed_dim, num_heads=8):
        """Create the projection layers.

        ``embed_dim`` must be divisible by ``num_heads``.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        assert (
            embed_dim % num_heads == 0
        ), "embedding dimension not divisible by num heads"
        # Width of each individual head.
        self.projection_dim = embed_dim // num_heads
        self.wq = keras.layers.Dense(embed_dim)
        self.wk = keras.layers.Dense(embed_dim)
        self.wv = keras.layers.Dense(embed_dim)
        self.combine_heads = keras.layers.Dense(embed_dim)

    def attention(self, q, k, v):
        """Return ``(output, weights)`` of scaled dot-product attention."""
        key_dim = tf.cast(tf.shape(k)[-1], tf.float32)
        scaled_score = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_dim)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        return tf.matmul(weights, v), weights

    def separate_heads(self, x, batch_size):
        """Reshape to (batch_size, num_heads, seq_len, projection_dim)."""
        per_head = tf.reshape(
            x, (batch_size, -1, self.num_heads, self.projection_dim)
        )
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, x):
        """Apply self-attention to ``x`` of shape (batch_size, seq_len, embed_dim)."""
        batch_size = tf.shape(x)[0]

        # Project, then split into heads:
        # (batch_size, seq_len, embed_dim)
        #   -> (batch_size, num_heads, seq_len, projection_dim)
        q = self.separate_heads(self.wq(x), batch_size)
        k = self.separate_heads(self.wk(x), batch_size)
        v = self.separate_heads(self.wv(x), batch_size)

        attended, _ = self.attention(q, k, v)

        # Back to (batch_size, seq_len, num_heads, projection_dim), then merge
        # the heads into (batch_size, seq_len, embed_dim).
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.embed_dim))

        return self.combine_heads(merged)  # (batch_size, seq_len, embed_dim)

In [None]:
class TransformerLayer(tf.keras.layers.Layer):
    """Self-attention block followed by a feed-forward block.

    Each block is wrapped with dropout, a residual connection and layer
    normalization.

    NOTE(review): this layer does not declare ``supports_masking``; a mask
    produced by an upstream ``Masking`` layer is presumably not propagated
    through it -- confirm.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        """Create the sub-layers.

        Args:
            embed_dim: Width of the input and output features.
            num_heads: Number of attention heads.
            ff_dim: Hidden width of the feed-forward block.
            rate: Dropout rate used after both blocks.
        """
        super().__init__()

        self.att = MultiHeadSelfAttention(embed_dim, num_heads)

        feed_forward_layers = [
            keras.layers.Dense(ff_dim, activation="relu"),
            keras.layers.Dense(embed_dim),
        ]
        self.ffn = keras.Sequential(feed_forward_layers)

        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training):
        """Apply the attention and feed-forward blocks to ``x``."""
        # Attention block with residual connection.
        attended = self.dropout1(self.att(x), training=training)
        normed = self.layernorm1(x + attended)

        # Feed-forward block with residual connection.
        projected = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + projected)


In [None]:
embed_dim=768
ff_dim=32
num_heads=1

In [None]:
text_input = keras.Input(shape=(None,768,), dtype='float32', name='text')

l_mask = keras.layers.Masking(mask_value=-99.)(text_input) 

encoded_text = TransformerLayer(embed_dim,num_heads,ff_dim)(l_mask)

out_dense1 = keras.layers.LSTM(100,)(encoded_text)

out_dense = keras.layers.Dense(30, activation='relu')(out_dense1)

out = keras.layers.Dense(2, activation='softmax')(out_dense)

model = keras.Model(text_input, out)

model.compile(optimizer='adam',loss='sparse_categorical_crossentropy',metrics=['acc'])

model.summary()

In [None]:
call_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [None]:
batches_per_epoch = batches[0][1]

batches_per_epoch_val= batches[1][1]

model.fit(train_data, steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_data, validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

In [None]:
# save_weight_path = "./trained_models/classification_models_" + model_path + "/Transformer_model/model.h5"

In [None]:
# model.save_weights(save_weight_path)

### Step 7.2 : Evaluate Transformer Model for Classification

In [None]:
test_data = test_generator(df_test)

In [None]:
# model.load_weights(save_weight_path)

In [None]:
batches_per_epoch_test = batches[2][1]

# Model.predict accepts a Python generator directly (as model.fit already does
# above); Model.predict_generator is deprecated.
pred = model.predict(test_data, steps=batches_per_epoch_test)

In [None]:
pred = np.argmax(pred,axis=1).tolist()
label = df_test.label.to_list()

cul_all_metrics(label,pred)