### Step 1: Data Viewing and Simple Preprocessing

In [1]:
import pandas as pd
import os
import json
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from transformers.data.processors.utils import InputExample
from split_utils import get_split,get_natural_split,get_fixed_split

In [2]:
# Load the raw dataset from the CSV file next to the notebook and preview its first rows.
raw_data = pd.read_csv('./raw_data.csv')
raw_data.head()

Unnamed: 0,text_comments,text_only,comments_only,label,count
0,"Breaking: At least 10 dead, 5 injured after tO...","Breaking: At least 10 dead, 5 injured after tO...",The religion of peace strikes again.\n[SEP]Hi ...,rumour,9
1,France: 10 people dead after shooting at HQ of...,France: 10 people dead after shooting at HQ of...,MT France: 10 dead after shooting at HQ of sat...,rumour,7
2,Ten killed in shooting at headquarters of Fren...,Ten killed in shooting at headquarters of Fren...,must be that peace loving religion again\n[SEP...,rumour,5
3,BREAKING: 10 dead in shooting at headquarters ...,BREAKING: 10 dead in shooting at headquarters ...,WTF &gt; BREAKING 10 dead in shooting at headq...,rumour,13
4,Reuters: 10 people shot dead at headquarters o...,Reuters: 10 people shot dead at headquarters o...,watch yourself in Paris bud\n[SEP]islamist ter...,rumour,16


In [3]:
# Experiment variant. Used as the suffix of the saved-model directory
# ("./output/classification_models_<model_path>") and as the key into batch_dict.
model_path = 'text_comments'

In [4]:
# Keep only the model input and the target, expose the input under the generic
# column name 'text', and replace the string labels with integer class ids.
raw_data = raw_data.loc[:, ['text_comments', 'label']].rename(columns={'text_comments': 'text'})
raw_data = raw_data.assign(label=LabelEncoder().fit_transform(raw_data['label']))

In [5]:
# Hold out 20% of the rows (fixed random_state, so the partition is repeatable)
# and give both parts a fresh 0..n-1 index.
train, val = (
    part.reset_index(drop=True)
    for part in train_test_split(raw_data, test_size=0.2, random_state=35)
)

In [6]:
# Sanity check: (rows, columns) of the training and validation partitions.
print(train.shape,val.shape)

(4641, 2) (1161, 2)


In [7]:
# Add a 'text_split' column holding get_split's result for every training text
# (presumably a list of text chunks -- it is iterated chunk by chunk further down).
train_tmp = train.assign(text_split=train['text'].apply(get_split))
train = train_tmp
train.head()

Unnamed: 0,text,label,text_split
0,Two came out front door of Lindt cafe. One out...,1,[Two came out front door of Lindt cafe. One ou...
1,Thank God he's not a Muslim! Otherwise the wor...,1,[Thank God he's not a Muslim! Otherwise the wo...
2,Wow RT State Senator asks the #Ferguson polic...,0,[Wow RT State Senator asks the #Ferguson polic...
3,"On Israeli TV, an Israeli woman speaks of her ...",1,"[On Israeli TV, an Israeli woman speaks of her..."
4,Anybody who wants to talk about what Charlie H...,0,[Anybody who wants to talk about what Charlie ...


In [8]:
# Add a 'text_split' column holding get_split's result for every validation text
# (presumably a list of text chunks -- it is iterated chunk by chunk further down).
val_tmp = val.assign(text_split=val['text'].apply(get_split))
val = val_tmp
val.head()

Unnamed: 0,text,label,text_split
0,Apparent hostage situation unfolding in Sydney...,1,[Apparent hostage situation unfolding in Sydne...
1,Thoughts go out to the people in Sydney today!...,0,[Thoughts go out to the people in Sydney today...
2,"The 2011 issue of Charlie Hebdo, whose cover f...",0,"[The 2011 issue of Charlie Hebdo, whose cover ..."
3,"Ottawa, you've been on my mind all day. Keepin...",0,"[Ottawa, you've been on my mind all day. Keepi..."
4,Let's keep in mind that no one associates the ...,0,[Let's keep in mind that no one associates the...


In [9]:
# Flatten the per-document chunk lists into three parallel lists:
#   train_l  - the text chunks
#   label_l  - the label of the document each chunk came from
#   index_l  - the index (in `train`) of the document each chunk came from
train_l, label_l, index_l = [], [], []
for idx, chunks, doc_label in zip(train.index, train['text_split'], train['label']):
    pieces = list(chunks)
    train_l.extend(pieces)
    label_l.extend([doc_label] * len(pieces))
    index_l.extend([idx] * len(pieces))
len(train_l), len(label_l), len(index_l)

(6850, 6850, 6850)

In [10]:
# Flatten the per-document chunk lists into three parallel lists:
#   val_l        - the text chunks
#   val_label_l  - the label of the document each chunk came from
#   val_index_l  - the index (in `val`) of the document each chunk came from
val_l, val_label_l, val_index_l = [], [], []
for idx, chunks, doc_label in zip(val.index, val['text_split'], val['label']):
    pieces = list(chunks)
    val_l.extend(pieces)
    val_label_l.extend([doc_label] * len(pieces))
    val_index_l.extend([idx] * len(pieces))
len(val_l), len(val_label_l), len(val_index_l)

(1687, 1687, 1687)

In [11]:
# Chunk-level frames: one row per text chunk, labelled with its source document's label.
train_df = pd.DataFrame({'text':train_l, 'label':label_l})
val_df = pd.DataFrame({'text':val_l, 'label':val_label_l})

train_df.head()

Unnamed: 0,text,label
0,Two came out front door of Lindt cafe. One out...,1
1,Thank God he's not a Muslim! Otherwise the wor...,1
2,Wow RT State Senator asks the #Ferguson police...,0
3,"On Israeli TV, an Israeli woman speaks of her ...",1
4,Anybody who wants to talk about what Charlie H...,0


In [12]:
def _as_input_example(row):
    """Wrap one (text, label) row as a single-segment InputExample without a guid."""
    return InputExample(guid=None, text_a=row['text'], text_b=None, label=row['label'])


train_InputExamples = train_df.apply(_as_input_example, axis=1)

val_InputExamples = val_df.apply(_as_input_example, axis=1)

### Step 2: Define the Model for BERT Classification

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

import torch.optim as optim
from torchtext.data import BucketIterator

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    BertweetTokenizer,
    AutoModel,
    AutoTokenizer
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers.data.processors.utils import InputExample, DataProcessor

import logging

logger=logging.getLogger(__name__)

In [None]:
# Maps a model-type key to its (config class, tokenizer class) pair.
MODEL_CLASSES={
    "bert":(BertConfig,BertTokenizer),
    "bertweet":(BertConfig,BertweetTokenizer)
}

# Class ids passed as `label_list` to convert_examples_to_features.
my_label_list=[0, 1]
# Passed as `max_length` to convert_examples_to_features.
MAX_SEQ_LENGTH=200

In [None]:
class BertForClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classification head.

    The number of labels is fixed to 2 in ``__init__``; the regression (MSE)
    branch in ``forward`` is therefore only taken if ``num_labels`` is changed
    to 1 after construction.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 2

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and the classification head.

        Returns:
            ``(logits, pooled_output, sequence_output)`` when ``labels`` is None,
            otherwise ``(loss, logits, pooled_output, sequence_output)``. The
            returned ``pooled_output`` is the value after dropout, i.e. exactly
            what is fed to the classifier.
        """
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        token_states, pooled = encoded[:2]

        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        result = (logits, pooled, token_states)

        # Without labels there is no loss to compute.
        if labels is None:
            return result

        if self.num_labels == 1:
            criterion = MSELoss()
            loss = criterion(logits.view(-1), labels.view(-1))
        else:
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result  # loss, logits, pooled_output, sequence_output

### Step 3: Load the Pre-trained Model & Prepare Training Data

In [16]:
# # Load Pre-training Models
# args={"model_name_or_path": "bert-base-uncased",
#     "config_name": "bert-base-uncased",
#     "tokenizer_name": "bert-base-uncased",
#       }

# config_class, tokenizer_class = MODEL_CLASSES["bert"]
# model_class=BertForClassification


# config = config_class.from_pretrained(
#     args["config_name"],
#     finetuning_task="", 
#     cache_dir=None,
# )
# tokenizer = tokenizer_class.from_pretrained(
#     args["tokenizer_name"],
#     do_lower_case=True,
#     cache_dir=None,
# )
# model = model_class.from_pretrained(
#     args["model_name_or_path"],
#     from_tf=bool(".ckpt" in args["model_name_or_path"]),
#     config=config,
#     cache_dir=None,
# )


# model.to("cuda")

In [17]:
# # Prepare Training Data
# train_features = convert_examples_to_features(train_InputExamples,tokenizer, label_list=my_label_list,  output_mode="classification", max_length=MAX_SEQ_LENGTH )

In [18]:
# input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
# attention_mask = torch.tensor([f.attention_mask for f in train_features], dtype=torch.long)
# token_type_ids = torch.tensor([f.token_type_ids for f in train_features], dtype=torch.long)
# the_labels = torch.tensor([f.label for f in train_features], dtype=torch.long)

# dataset = TensorDataset(input_ids, attention_mask, token_type_ids, the_labels)

In [19]:
# # Define Train Function For Bert Classification

# def train(train_dataset,model,tokenizer):
#     no_decay=["bias","LayerNorm.weight"]
#     optimizer_grouped_parameters=[
#         {
#             "params":[p for n,p in model.named_parameters() if not any(nd in n for nd in no_decay)],
#             "weight_decay":0.0,

#         },
#         {
#             "params": [p for n,p in model.named_parameters() if any(nd in n for nd in no_decay)],
#             "weight_decay":0.0
#         },
#     ]

    
#     t_total=len(train_dataset)// 5
#     optimizer=AdamW(optimizer_grouped_parameters,lr=2e-5,eps=1e-8)
    
#     scheduler=get_linear_schedule_with_warmup(
#         optimizer,num_warmup_steps=0,num_training_steps=t_total
#         )
    
    
    
#     # *********************
#     logger.info("*****Running training*****")
#     logger.info("  Num examples = %d", len(train_dataset))
#     logger.info("  Num Epochs = %d", 5)


#     epochs_trained=0
#     global_step=0
#     steps_trained_in_current_epoch=0

#     tr_loss,logging_loss=0.0,0.0
#     model.zero_grad()
#     train_iterator=trange(epochs_trained,5,desc="Epoch",disable=False)


#     for k in train_iterator: #5 epoch
    
#         train_sampler=RandomSampler(train_dataset)
#         train_dataloader=DataLoader(train_dataset,sampler=train_sampler,batch_size=16)
#         epoch_iterator=tqdm(train_dataloader,desc="Iteration",disable=False)

#         for step,batch in enumerate(epoch_iterator): 
#             if steps_trained_in_current_epoch>0:
#                 steps_trained_in_current_epoch-=1
#                 continue

#             model.train()
#             batch=tuple(t.to("cuda") for t in batch)
            
#             inputs={"input_ids": batch[0],"attention_mask": batch[1],"token_type_ids": batch[2], "labels": batch[3]}
#             outputs = model(**inputs)
#             loss=outputs[0]
 
#             loss.backward()

#             tr_loss+=loss.item()
#             if (step+1)%1==0:
#                 torch.nn.utils.clip_grad_norm_(model.parameters(),1.0)

#                 optimizer.step()
#                 scheduler.step()
#                 model.zero_grad()
#                 global_step+=1

#         logger.info("average loss:" +str(tr_loss/global_step))


#     return global_step,tr_loss/global_step

In [20]:
# # Start Training

# # torch.cuda.empty_cache()
# train(dataset,model,tokenizer)

In [21]:
# # Save Trained Model Parameters

# import os
# model.save_pretrained("./output/classification_models_" + model_path)
# tokenizer.save_pretrained("./output/classification_models_" + model_path)

# torch.save(args,os.path.join("./output/classification_models_" + model_path,"training_args.bin"))

### Step 4: Load the Trained Model & Evaluation

In [22]:
# Start Loading the trained model data

# Config, tokenizer and weights are all read from the same saved-model directory.
trained_dir = "./output/classification_models_" + model_path
args_eval = {
    key: trained_dir
    for key in ("model_name_or_path", "config_name", "tokenizer_name")
}

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class = BertForClassification

config = config_class.from_pretrained(args_eval["config_name"], finetuning_task="", cache_dir=None)
tokenizer = tokenizer_class.from_pretrained(args_eval["tokenizer_name"], do_lower_case=True, cache_dir=None)
model = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=".ckpt" in args_eval["model_name_or_path"],
    config=config,
    cache_dir=None,
)

# Last expression of the cell: moves the model to the GPU and displays its structure.
model.to("cuda")

BertForClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [23]:
# Convert the validation examples to model features (max_length=MAX_SEQ_LENGTH).
val_features = convert_examples_to_features(
    val_InputExamples,
    tokenizer,
    label_list=my_label_list,
    output_mode="classification",
    max_length=MAX_SEQ_LENGTH,
)

# One long tensor per feature field, in the order the dataset stores them.
val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels = (
    torch.tensor([getattr(feat, field) for feat in val_features], dtype=torch.long)
    for field in ("input_ids", "attention_mask", "token_type_ids", "label")
)

eval_dataset = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)



In [24]:
from sklearn.metrics import f1_score

def simple_accuracy(preds, labels):
    """Return the fraction of positions at which ``preds`` equals ``labels``.

    Both arguments are converted with ``np.asarray`` first, so plain Python
    sequences are accepted as well as NumPy arrays (comparing two lists with
    ``==`` yields a single bool, which has no ``.mean()``).
    """
    return (np.asarray(preds) == np.asarray(labels)).mean()

def acc_and_f1(preds, labels):
    """Return ``(accuracy, f1)`` for predicted class ids against the true labels."""
    return simple_accuracy(preds, labels), f1_score(y_true=labels, y_pred=preds)

In [25]:
def evaluate(model, tokenizer, eval_dataset):
    """Score ``model`` on ``eval_dataset`` and return ``(accuracy, f1, eval_loss)``.

    Args:
        model: called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword tensors; the first two items of its result are used as
            the loss and the logits.
        tokenizer: not used by this function; kept so existing calls keep working.
        eval_dataset: dataset whose items provide input ids at position 0,
            the attention mask at position 1 and the labels at position 3.

    Returns:
        Tuple ``(accuracy, f1, eval_loss)`` where ``eval_loss`` is the mean of
        the per-batch losses.

    Raises:
        ValueError: if ``eval_dataset`` yields no batches.
    """
    batch_size = 16

    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", batch_size)

    # Evaluation gains nothing from shuffling; a sequential pass keeps runs repeatable.
    eval_sampler = SequentialSampler(eval_dataset)
    eval_dataloader = DataLoader(eval_dataset, sampler=eval_sampler, batch_size=batch_size)

    eval_loss = 0.0
    nb_eval_steps = 0
    # Collect per-batch arrays and concatenate once at the end instead of
    # re-copying the accumulated array on every batch.
    logit_chunks = []
    label_chunks = []

    model.eval()
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to("cuda") for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs["labels"].detach().cpu().numpy())

    if nb_eval_steps == 0:
        raise ValueError("eval_dataset is empty; nothing to evaluate")

    eval_loss = eval_loss / nb_eval_steps

    # Predicted class id = index of the largest logit.
    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    accuracy, f1 = acc_and_f1(preds, out_label_ids)

    return accuracy, f1, eval_loss

In [26]:
# Score the reloaded classifier on the chunk-level validation set.
accuracy,f1 ,eval_loss = evaluate(model, tokenizer, eval_dataset)

print("Accuracy: ",accuracy, "F1 Score: ",f1,"Loss: ",eval_loss)

Evaluating: 100%|██████████████████████| 106/106 [01:31<00:00,  1.15it/s]

Accuracy:  0.8286899822169532 F1 Score:  0.7336405529953918 Loss:  0.6081837744645353





### Step 5: Get Text Embeddings & Combine Embeddings with Labels

In [27]:
def get_prediction(model, tokenizer, dataset):
    """Return the pooled embedding of every item in ``dataset``, in dataset order.

    Args:
        model: called with ``input_ids``, ``attention_mask`` and ``labels``
            keyword tensors; item 2 of its result is collected.
        tokenizer: not used by this function; kept so existing calls keep working.
        dataset: dataset whose items provide input ids at position 0,
            the attention mask at position 1 and the labels at position 3.

    Returns:
        A NumPy array with one row per dataset item, or ``None`` when the
        dataset yields no batches.
    """
    batch_size = 32

    logger.info("***** Running prediction  *****")
    logger.info("  Num examples = %d", len(dataset))
    # Report the batch size that is actually used by the loader below.
    logger.info("  Batch size = %d", batch_size)

    # Sequential order matters: callers align the returned rows with the dataset items.
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

    # Collect per-batch arrays and concatenate once at the end instead of
    # re-copying the accumulated array on every batch.
    pooled_chunks = []

    model.eval()
    for batch in tqdm(dataloader, desc="Evaluating"):
        batch = tuple(t.to("cuda") for t in batch)

        with torch.no_grad():
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            outputs = model(**inputs)
            # Labels are supplied, so BertForClassification (as defined in this
            # notebook) returns (loss, logits, pooled_output, ...): index 2 is
            # the pooled output.
            pooled_output = outputs[2]

        pooled_chunks.append(pooled_output.detach().cpu().numpy())

    if not pooled_chunks:
        return None
    return np.concatenate(pooled_chunks, axis=0)


In [28]:
# Config, tokenizer and weights are all read from the same saved-model directory.
trained_dir = "./output/classification_models_" + model_path
args_eval = {
    key: trained_dir
    for key in ("model_name_or_path", "config_name", "tokenizer_name")
}

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class = BertForClassification

config = config_class.from_pretrained(args_eval["config_name"], finetuning_task="", cache_dir=None)
tokenizer = tokenizer_class.from_pretrained(args_eval["tokenizer_name"], do_lower_case=True, cache_dir=None)
model = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    from_tf=".ckpt" in args_eval["model_name_or_path"],
    config=config,
    cache_dir=None,
)

# Last expression of the cell: moves the model to the GPU and displays its structure.
model.to("cuda")

BertForClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

In [29]:
# Convert the chunk-level training and validation examples to model features
# using the reloaded tokenizer (max_length=MAX_SEQ_LENGTH).
train_features = convert_examples_to_features(train_InputExamples,tokenizer, label_list=my_label_list, output_mode="classification", max_length=MAX_SEQ_LENGTH )

val_features = convert_examples_to_features(val_InputExamples, tokenizer, label_list=my_label_list, output_mode="classification",  max_length=MAX_SEQ_LENGTH )



In [30]:
def _long_columns(features):
    """Return (input_ids, attention_mask, token_type_ids, label) as long tensors."""
    return tuple(
        torch.tensor([getattr(feat, field) for feat in features], dtype=torch.long)
        for field in ("input_ids", "attention_mask", "token_type_ids", "label")
    )


train_input_ids, train_attention_mask, train_token_type_ids, train_the_labels = _long_columns(train_features)

train_dataset = TensorDataset(train_input_ids, train_attention_mask, train_token_type_ids, train_the_labels)

val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels = _long_columns(val_features)

val_dataset = TensorDataset(val_input_ids, val_attention_mask, val_token_type_ids, val_the_labels)

In [31]:
# One pooled embedding per text chunk, in the same order as the chunk lists built earlier.
train_pooled_outputs = get_prediction(model, tokenizer, train_dataset)
val_pooled_outputs = get_prediction(model, tokenizer, val_dataset)

Evaluating: 100%|██████████████████████| 215/215 [07:35<00:00,  2.12s/it]
Evaluating: 100%|████████████████████████| 53/53 [02:09<00:00,  2.44s/it]


In [32]:
# Method 1: Feature Concatenation
train_x = {}
# print(index_l)
for l, emb in zip(index_l, train_pooled_outputs):
    # print(l)
    if l in train_x.keys():
        # np.vstack on lists represents features concatenation 
        train_x[l]  =np.vstack([train_x[l], emb])
    else:
        train_x[l] = [emb]

train_l_final = []
label_l_final = []
for k in train_x.keys():
    train_l_final.append(train_x[k])
    label_l_final.append(train.loc[k]['label'])

df_train = pd.DataFrame({'emb': train_l_final, 'label': label_l_final})
df_train.head(10)

Unnamed: 0,emb,label
0,"[[-0.5683017, 0.55494165, -0.262464, 0.5690243...",1
1,"[[-0.9194449, 0.2533443, -0.5469475, 0.9035225...",1
2,"[[0.93448186, -0.18541528, 0.07126316, -0.8495...",0
3,"[[-0.85737556, -0.026393186, -0.249982, 0.7508...",1
4,"[[0.8568014, -0.4103867, -0.85764766, -0.35503...",0
5,"[[0.8761184, 0.0084083, -0.09476933, -0.677081...",0
6,"[[-0.95887554, 0.18726955, -0.69755906, 0.9342...",1
7,"[[-0.80294716, 0.47645772, 0.7817435, 0.714493...",1
8,"[[0.74793553, -0.21281175, -0.36879343, -0.268...",0
9,"[[0.72174776, -0.3264285, -0.048052114, -0.567...",0


In [33]:
# Method 1: Feature Concatenation
val_x = {}

for l, emb in zip(val_index_l, val_pooled_outputs):
    if l in val_x.keys():
        val_x[l]  =np.vstack([val_x[l], emb])
    else:
        val_x[l] = [emb]


val_l_final = []
vlabel_l_final = []
for k in val_x.keys():
    val_l_final.append(val_x[k])
    vlabel_l_final.append(val.loc[k]['label'])

df_val = pd.DataFrame({'emb': val_l_final, 'label': vlabel_l_final})
df_val.head(10)

Unnamed: 0,emb,label
0,"[[-0.4756549, -0.06352481, 0.09831903, 0.58946...",1
1,"[[0.5987283, -0.44336528, -0.7421001, -0.25325...",0
2,"[[0.72510386, -0.1340074, 0.65199375, -0.42390...",0
3,"[[0.7011646, -0.33346644, -0.39699608, -0.1982...",0
4,"[[0.7918278, -0.41761106, -0.96751857, -0.2365...",0
5,"[[0.8720753, -0.33874634, -0.36667407, -0.4368...",0
6,"[[0.120981894, 0.30234385, 0.35127854, -0.1262...",1
7,"[[0.72142196, -0.43539217, -0.5718047, -0.3053...",0
8,"[[0.64644927, -0.24333975, -0.347056, -0.18372...",0
9,"[[0.70024073, -0.19189566, 0.14235741, -0.4654...",0


In [34]:
# Split the held-out documents once more: 60% stay in df_val, 40% become df_test.
# NOTE(review): df_val is rebound here, so re-running this cell alone splits the
# already reduced frame again; re-run the cell that builds df_val first.
df_val, df_test = train_test_split(df_val, test_size=0.4, random_state=35)

In [35]:
# (rows, columns) of the three document-level frames; the row counts must match
# batch_size * batches_per_epoch in batch_dict below.
df_train.shape, df_val.shape, df_test.shape

((4641, 2), (696, 2), (465, 2))

In [36]:
# Per experiment variant: [batch_size, batches_per_epoch] for the train,
# validation and test generators (in that order). Each generator asserts that
# batch_size * batches_per_epoch equals the number of rows of its frame, e.g.
# 7 * 663 = 4641 training documents for 'text_comments'.
batch_dict = {
    'text_comments':[[7,663],[3,232],[5,93]],
    'text_only':[[7,663],[3,232],[5,93]],
    'comments_only':[[4,1088],[4,163],[4,109]],
    'comments_group1':[[4,387],[4,58],[5,31]],
    'comments_group2':[[4,398],[1,239],[4,40]],
    'comments_group3':[[5,300],[5,45],[1,151]],
    'natural_split':[[7,663],[3,232],[5,93]],
    'fixed_split':[[7,663],[3,232],[5,93]],
}

# Settings for the variant selected by model_path.
batches = batch_dict[model_path]

In [37]:
def train_generator(df, batch_size = batches[0][0], batches_per_epoch = batches[0][1]):
    """Yield padded ``(x, y)`` training batches from ``df`` in an endless loop.

    Args:
        df: frame with an ``emb`` column (one sequence of 768-dimensional
            vectors per row) and a ``label`` column.
        batch_size: number of rows per batch.
        batches_per_epoch: number of batches that make up one pass over ``df``;
            ``batch_size * batches_per_epoch`` must equal ``len(df)``.

    Yields:
        ``x`` of shape ``(batch_size, timesteps, 768)`` where ``timesteps`` is
        the longest sequence in the batch and shorter sequences are padded
        with -99., and ``y`` of shape ``(batch_size, 1)`` holding the labels.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()

    num_sequences = len(x_list)
    assert batch_size * batches_per_epoch == num_sequences
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch):
            start = b * batch_size
            batch_x = x_list[start:start + batch_size]
            # Pad to the longest sequence of this batch only.
            timesteps = max(len(seq) for seq in batch_x)
            x_train = np.full((batch_size, timesteps, num_features), -99.)
            y_train = np.zeros((batch_size,  1))
            for i, seq in enumerate(batch_x):
                x_train[i, 0:len(seq), :] = seq
                y_train[i] = y_list[start + i]
            yield x_train, y_train

def val_generator(df,batch_size_val=batches[1][0],batches_per_epoch_val=batches[1][1]):
    """Yield padded ``(x, y)`` validation batches from ``df`` in an endless loop.

    Args:
        df: frame with an ``emb`` column (one sequence of 768-dimensional
            vectors per row) and a ``label`` column.
        batch_size_val: number of rows per batch.
        batches_per_epoch_val: number of batches that make up one pass over
            ``df``; ``batch_size_val * batches_per_epoch_val`` must equal ``len(df)``.

    Yields:
        ``x`` of shape ``(batch_size_val, timesteps, 768)`` where ``timesteps``
        is the longest sequence in the batch and shorter sequences are padded
        with -99., and ``y`` of shape ``(batch_size_val, 1)`` holding the labels.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()

    num_sequences_val = len(x_list)
    assert batch_size_val * batches_per_epoch_val == num_sequences_val
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch_val):
            start = b * batch_size_val
            batch_x = x_list[start:start + batch_size_val]
            # Pad to the longest sequence of *this* batch. The window is derived
            # from batch_size_val rather than a fixed number of rows, so it is
            # correct for any batch size and never over-pads.
            timesteps = max(len(seq) for seq in batch_x)
            x_val = np.full((batch_size_val, timesteps, num_features), -99.)
            y_val = np.zeros((batch_size_val,  1))
            for i, seq in enumerate(batch_x):
                x_val[i, 0:len(seq), :] = seq
                y_val[i] = y_list[start + i]
            yield x_val, y_val

def test_generator(df,batch_size_test=batches[2][0],batches_per_epoch_test=batches[2][1]):
    """Yield padded ``(x, y)`` test batches from ``df`` in an endless loop.

    Args:
        df: frame with an ``emb`` column (one sequence of 768-dimensional
            vectors per row) and a ``label`` column.
        batch_size_test: number of rows per batch.
        batches_per_epoch_test: number of batches that make up one pass over
            ``df``; ``batch_size_test * batches_per_epoch_test`` must equal ``len(df)``.

    Yields:
        ``x`` of shape ``(batch_size_test, timesteps, 768)`` where ``timesteps``
        is the longest sequence in the batch and shorter sequences are padded
        with -99., and ``y`` of shape ``(batch_size_test, 1)`` holding the labels.
    """
    # Materialise the columns once; they do not change between batches.
    x_list = df['emb'].to_list()
    y_list = df.label.to_list()

    num_sequences_test = len(x_list)
    assert batch_size_test * batches_per_epoch_test == num_sequences_test
    num_features = 768

    # Generate batches
    while True:
        for b in range(batches_per_epoch_test):
            start = b * batch_size_test
            batch_x = x_list[start:start + batch_size_test]
            # Pad to the longest sequence of *this* batch. The window is derived
            # from batch_size_test rather than a fixed number of rows, so it is
            # correct for any batch size and never over-pads.
            timesteps = max(len(seq) for seq in batch_x)
            x_test = np.full((batch_size_test, timesteps, num_features), -99.)
            y_test = np.zeros((batch_size_test,  1))
            for i, seq in enumerate(batch_x):
                x_test[i, 0:len(seq), :] = seq
                y_test[i] = y_list[start + i]
            yield x_test, y_test

In [38]:
# Endless batch generators over the document-level frames, using the default
# batch sizes taken from `batches`.
train_data = train_generator(df_train)
val_data = val_generator(df_val)
test_data = test_generator(df_test)

In [39]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def cul_all_metrics(y_true, y_pred, pos_label=1):
    """Return accuracy, precision, recall and F1, each rounded to 5 decimals.

    Args:
        y_true: ground-truth class ids.
        y_pred: predicted class ids.
        pos_label: class treated as positive for precision, recall and F1.

    Returns:
        Dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"`` and
        ``"f1-score"``.
    """
    return {"accuracy": float("%.5f" % accuracy_score(y_true=y_true, y_pred=y_pred)),
            "precision": float("%.5f" % precision_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
            "recall": float("%.5f" % recall_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
            # F1 must use the same positive class as precision and recall.
            "f1-score": float("%.5f" % f1_score(y_true=y_true, y_pred=y_pred, pos_label=pos_label)),
           }

In [40]:
import tensorflow as tf
from tensorflow import keras
import h5py

# Sequence input: a variable number of timesteps, each a 768-dimensional vector.
text_input = keras.Input(shape=(None,768,), dtype='float32', name='text')

# Mask timesteps filled with -99., the padding value written by the batch generators.
l_mask = keras.layers.Masking(mask_value=-99.)(text_input) 

# Encode the whole sequence into a single 100-dimensional vector with an LSTM.
encoded_text = keras.layers.LSTM(100,)(l_mask)
out_dense = keras.layers.Dense(30, activation='relu')(encoded_text)
# And we add a two-way softmax classifier on top
out = keras.layers.Dense(2, activation='softmax')(out_dense)
# At model instantiation, we specify the input and the output.
# NOTE(review): this rebinds `model`, which until here referred to the BERT classifier.
model = keras.Model(text_input, out)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['acc'])
model.summary()

2022-03-10 21:59:24.034783: I tensorflow/core/platform/cpu_feature_guard.cc:142] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
2022-03-10 21:59:24.069169: I tensorflow/core/platform/profile_utils/cpu_utils.cc:94] CPU Frequency: 2099965000 Hz
2022-03-10 21:59:24.073236: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x562be6f2b0b0 executing computations on platform Host. Devices:
2022-03-10 21:59:24.073275: I tensorflow/compiler/xla/service/service.cc:175]   StreamExecutor device (0): Host, Default Version


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text (InputLayer)            [(None, None, 768)]       0         
_________________________________________________________________
masking (Masking)            (None, None, 768)         0         
_________________________________________________________________
lstm (LSTM)                  (None, 100)               347600    
_________________________________________________________________
dense (Dense)                (None, 30)                3030      
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 62        
Total params: 350,692
Trainable params: 350,692
Non-trainable params: 0
_________________________________________________________________


In [13]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

import torch.optim as optim
from torchtext.data import BucketIterator

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from torch.utils.data.distributed import DistributedSampler
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers.data.processors.utils import InputExample, DataProcessor

import logging

logger=logging.getLogger(__name__)

In [14]:
# Maps a model-type key to its (config class, tokenizer class) pair.
MODEL_CLASSES={
    "bert":(BertConfig,BertTokenizer),
}

# Class ids passed as `label_list` to convert_examples_to_features.
my_label_list=[0, 1]
# Passed as `max_length` to convert_examples_to_features.
MAX_SEQ_LENGTH=200

In [15]:
class BertForClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear classification head.

    The number of labels is fixed to 2 in ``__init__``; the regression (MSE)
    branch in ``forward`` is therefore only taken if ``num_labels`` is changed
    to 1 after construction.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 2

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and the classification head.

        Returns:
            ``(logits, pooled_output, sequence_output)`` when ``labels`` is None,
            otherwise ``(loss, logits, pooled_output, sequence_output)``. The
            returned ``pooled_output`` is the value after dropout, i.e. exactly
            what is fed to the classifier.
        """
        encoded = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )
        token_states, pooled = encoded[:2]

        pooled = self.dropout(pooled)
        logits = self.classifier(pooled)
        result = (logits, pooled, token_states)

        # Without labels there is no loss to compute.
        if labels is None:
            return result

        if self.num_labels == 1:
            criterion = MSELoss()
            loss = criterion(logits.view(-1), labels.view(-1))
        else:
            criterion = CrossEntropyLoss()
            loss = criterion(logits.view(-1, self.num_labels), labels.view(-1))

        return (loss,) + result  # loss, logits, pooled_output, sequence_output

In [41]:
# Learning-rate schedule: monitors val_acc and scales the learning rate by
# factor=0.95 (patience=3 epochs, min_delta=0.01).
call_reduce = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_acc', factor=0.95, patience=3, verbose=2,
                                mode='auto', min_delta=0.01, cooldown=0, min_lr=0)

In [42]:
# The generators never stop, so the number of batches per pass must be given explicitly.
batches_per_epoch = batches[0][1]

batches_per_epoch_val= batches[1][1]

# Train the LSTM classifier for 10 epochs on the document-level embedding sequences.
model.fit(train_data, steps_per_epoch=batches_per_epoch, epochs=10,
                    validation_data=val_data, validation_steps=batches_per_epoch_val, callbacks =[call_reduce] )

Train for 663 steps, validate for 232 steps
Epoch 1/10


2022-03-10 21:59:31.031346: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference___backward_cudnn_lstm_with_fallback_5844_7301' and '__inference___backward_standard_lstm_7406_8003_specialized_for_StatefulPartitionedCall_at___inference_distributed_function_8125' both implement 'lstm_bb6a4fdb-19a6-4957-b7dc-f50946ef6533' but their signatures do not match.




2022-03-10 21:59:42.121851: W tensorflow/core/grappler/optimizers/implementation_selector.cc:310] Skipping optimization due to error while loading function libraries: Invalid argument: Functions '__inference_standard_lstm_10344_specialized_for_model_lstm_StatefulPartitionedCall_at___inference_distributed_function_12215' and '__inference_standard_lstm_10344' both implement 'lstm_8b047d65-c41e-4c5e-a64c-c0871f3c00e0' but their signatures do not match.


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 00007: ReduceLROnPlateau reducing learning rate to 0.0009500000451225787.
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 00010: ReduceLROnPlateau reducing learning rate to 0.0009025000152178108.


<tensorflow.python.keras.callbacks.History at 0x7efbbdc40450>

In [43]:
# The test generator never stops, so the number of batches for one pass is given explicitly.
batches_per_epoch_test = batches[2][1]
# Model.predict accepts the generator directly (as Model.fit already does above);
# the separate predict_generator entry point is deprecated.
pred = model.predict(test_data, steps=batches_per_epoch_test)

In [44]:
# Collapse the softmax outputs to class ids and compare them with the test labels.
# NOTE(review): `pred` is rebound from the probability array to a list of class ids,
# so this cell must not be re-run without re-running the prediction cell first.
pred = np.argmax(pred,axis=1).tolist()
label = df_test.label.to_list()

cul_all_metrics(label,pred)

{'accuracy': 0.84946,
 'precision': 0.81579,
 'recall': 0.74699,
 'f1-score': 0.77987}