In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# Read the pre-split train and test CSVs in one tuple assignment.
train_data, test_data = (
    pd.read_csv("./data/processed_data/2_classes_medium/train_data.csv"),
    pd.read_csv("./data/processed_data/2_classes_medium/test_data.csv"),
)

In [None]:
# Keep only the text column and the binary label column in both splits.
train_data, test_data = (
    frame[["text_comments", "2_way_label"]] for frame in (train_data, test_data)
)

In [None]:
from transformers.data.processors.utils import InputExample

In [None]:
# Wrap every row in a single-sentence InputExample (no guid, no second text).
train_InputExamples = train_data.apply(
    lambda row: InputExample(
        guid=None,
        text_a=row["text_comments"],
        text_b=None,
        label=row["2_way_label"],
    ),
    axis=1,
)
test_InputExamples = test_data.apply(
    lambda row: InputExample(
        guid=None,
        text_a=row["text_comments"],
        text_b=None,
        label=row["2_way_label"],
    ),
    axis=1,
)

In [None]:
# Expose only GPU index 1 to this process. Kept ahead of the torch import in
# this cell, presumably so the setting is in place before CUDA initialises.
os.environ.update({"CUDA_VISIBLE_DEVICES": "1"})

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.nn import CrossEntropyLoss, MSELoss
import torch.nn.functional as F

import torch.optim as optim

from torch.utils.data import DataLoader, RandomSampler, SequentialSampler, TensorDataset
from tqdm import tqdm, trange

from transformers import (
    WEIGHTS_NAME,
    AdamW,
    get_linear_schedule_with_warmup,
    BertConfig,
    BertModel,
    BertPreTrainedModel,
    BertTokenizer,
    BertweetTokenizer,
)

from transformers import glue_convert_examples_to_features as convert_examples_to_features
from transformers import glue_output_modes as output_modes
from transformers import glue_processors as processors
from transformers.data.processors.utils import InputExample, DataProcessor

import logging

# Logger used by train() and evaluate() for progress messages.
logger = logging.getLogger(name=__name__)

In [None]:
# Maps a model-type key to its (config class, tokenizer class) pair.
MODEL_CLASSES = dict(
    bert=(BertConfig, BertTokenizer),
    bertweet=(BertConfig, BertweetTokenizer),
)

# The two class labels, 0 and 1, handed to convert_examples_to_features.
my_label_list = list(range(2))
# Passed as max_length when examples are converted to features.
MAX_SEQ_LENGTH = 200

In [None]:
class BertForClassification(BertPreTrainedModel):
    """BERT encoder followed by dropout and a linear head on the pooled output.

    The number of output units is taken from ``config.num_labels`` (falling
    back to 2 when the attribute is absent), so the head matches the
    configuration the model is built from. With a single label the model is
    treated as a regressor and trained with MSE; otherwise cross-entropy is
    used.
    """

    def __init__(self, config):
        super().__init__(config)
        # Read the label count from the config rather than hard-coding it, so
        # the regression branch in forward() is actually reachable.
        self.num_labels = getattr(config, "num_labels", 2)

        self.bert = BertModel(config)
        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        self.classifier = nn.Linear(config.hidden_size, self.num_labels)

        self.init_weights()

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        labels=None,
    ):
        """Run the encoder and the classification head.

        All arguments except ``labels`` are forwarded unchanged to the
        wrapped ``BertModel``.

        Returns:
            A tuple ``(logits, pooled_output, sequence_output)``, where
            ``pooled_output`` has already been passed through dropout. When
            ``labels`` is given, the loss is prepended:
            ``(loss, logits, pooled_output, sequence_output)``.
        """
        outputs = self.bert(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
        )

        # First two encoder outputs: per-token states and the pooled vector.
        sequence_output, pooled_output = outputs[:2]

        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)

        outputs = (logits, pooled_output, sequence_output,)

        if labels is not None:
            if self.num_labels == 1:
                # Regression: MSELoss requires targets of the same floating
                # dtype as the predictions, so cast integer labels.
                loss_fct = MSELoss()
                loss = loss_fct(logits.view(-1), labels.view(-1).to(logits.dtype))
            else:
                loss_fct = CrossEntropyLoss()
                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
            outputs = (loss,) + outputs

        return outputs  # (loss,) logits, pooled_output, sequence_output

In [None]:
# The same pretrained identifier is used for weights, config and tokenizer.
args = dict(
    model_name_or_path="bert-base-uncased",
    config_name="bert-base-uncased",
    tokenizer_name="bert-base-uncased",
)

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class = BertForClassification

config = config_class.from_pretrained(
    args["config_name"], finetuning_task="", cache_dir=None
)
tokenizer = tokenizer_class.from_pretrained(
    args["tokenizer_name"], do_lower_case=True, cache_dir=None
)
model = model_class.from_pretrained(
    args["model_name_or_path"],
    # The membership test already yields a bool.
    from_tf=".ckpt" in args["model_name_or_path"],
    config=config,
    cache_dir=None,
)

# Last expression of the cell: moves the model to the GPU and displays it.
model.to("cuda")

In [None]:
# Tokenise the training examples into model-ready features.
train_features = convert_examples_to_features(
    train_InputExamples,
    tokenizer,
    max_length=MAX_SEQ_LENGTH,
    label_list=my_label_list,
    output_mode="classification",
)

In [None]:
# Stack each feature field into one long tensor, in the order the
# TensorDataset (and therefore every batch) exposes them.
input_ids, attention_mask, token_type_ids, the_labels = (
    torch.tensor(
        [getattr(feature, field) for feature in train_features],
        dtype=torch.long,
    )
    for field in ("input_ids", "attention_mask", "token_type_ids", "label")
)

dataset = TensorDataset(input_ids, attention_mask, token_type_ids, the_labels)

In [None]:
def train(
    train_dataset,
    model,
    tokenizer,
    num_epochs=5,
    batch_size=16,
    learning_rate=2e-5,
    adam_epsilon=1e-8,
    weight_decay=0.0,
    warmup_steps=0,
    max_grad_norm=1.0,
):
    """Fine-tune ``model`` on ``train_dataset`` with AdamW and a linear schedule.

    Args:
        train_dataset: Dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, labels)`` tensors.
        model: Model called with those four keyword arguments; its first
            output must be the loss. Batches are moved to the device the
            model's parameters live on.
        tokenizer: Unused; kept so existing callers keep working.
        num_epochs: Number of passes over the dataset.
        batch_size: Examples per optimisation step.
        learning_rate: Initial AdamW learning rate.
        adam_epsilon: AdamW epsilon.
        weight_decay: Decay applied to every parameter except biases and
            LayerNorm weights. The default of 0.0 disables it.
        warmup_steps: Scheduler warm-up steps.
        max_grad_norm: Gradient-norm clipping threshold.

    Returns:
        ``(global_step, average_loss)``: the number of optimiser steps taken
        and the mean training loss per step.

    Raises:
        ValueError: If ``train_dataset`` is empty.
    """
    if len(train_dataset) == 0:
        raise ValueError("train_dataset is empty; nothing to train on")

    # Follow the model's device rather than assuming "cuda".
    device = next(model.parameters()).device

    # RandomSampler reshuffles on every iteration of the loader, so a single
    # DataLoader can be reused across epochs.
    train_dataloader = DataLoader(
        train_dataset,
        sampler=RandomSampler(train_dataset),
        batch_size=batch_size,
    )

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]

    # One scheduler step is taken per batch, so the schedule must span
    # batches-per-epoch times epochs; anything shorter would drive the
    # learning rate to zero before training finishes.
    t_total = len(train_dataloader) * num_epochs
    optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total
    )

    logger.info("*****Running training*****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", num_epochs)

    global_step = 0
    tr_loss = 0.0
    model.zero_grad()

    for _ in trange(0, num_epochs, desc="Epoch", disable=False):
        model.train()
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=False)

        for batch in epoch_iterator:
            batch = tuple(t.to(device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
            }
            loss = model(**inputs)[0]
            loss.backward()
            tr_loss += loss.item()

            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

        # Running mean over every step so far, not only this epoch.
        logger.info("average loss:%s", tr_loss / global_step)

    return global_step, tr_loss / global_step

In [None]:
# Cell output is the (global_step, average_loss) tuple returned by train().
train(train_dataset=dataset, model=model, tokenizer=tokenizer)

In [None]:
import os
# Suffix appended to "./trained_models/2_classes_medium/classification_models_"
# to form the directory the fine-tuned model is saved to and reloaded from.
model_path = "text_comments"


In [None]:
# Persist the fine-tuned weights and the tokenizer files side by side.
model.save_pretrained(
    "./trained_models/2_classes_medium/classification_models_" + model_path
)
tokenizer.save_pretrained(
    "./trained_models/2_classes_medium/classification_models_" + model_path
)

# Store the args dict used to load the base model next to the weights.
torch.save(
    obj=args,
    f=os.path.join(
        "./trained_models/2_classes_medium/classification_models_" + model_path,
        "training_args.bin",
    ),
)

In [None]:
# Reload the fine-tuned model, config and tokenizer from the save directory.

# All three entries point at the same directory.
args_eval = {
    key: "./trained_models/2_classes_medium/classification_models_" + model_path
    for key in ("model_name_or_path", "config_name", "tokenizer_name")
}

config_class, tokenizer_class = MODEL_CLASSES["bert"]
model_class = BertForClassification

config = config_class.from_pretrained(
    args_eval["config_name"], finetuning_task="", cache_dir=None
)
tokenizer = tokenizer_class.from_pretrained(
    args_eval["tokenizer_name"], do_lower_case=True, cache_dir=None
)
model = model_class.from_pretrained(
    args_eval["model_name_or_path"],
    # The membership test already yields a bool.
    from_tf=".ckpt" in args_eval["model_name_or_path"],
    config=config,
    cache_dir=None,
)

# Last expression of the cell: moves the model to the GPU and displays it.
model.to("cuda")

In [None]:
# Build the evaluation dataset from the held-out examples.

test_features = convert_examples_to_features(
    test_InputExamples,
    tokenizer,
    max_length=MAX_SEQ_LENGTH,
    label_list=my_label_list,
    output_mode="classification",
)

# Stack each feature field into one long tensor, in the same order as the
# training dataset.
test_input_ids, test_attention_mask, test_token_type_ids, test_the_labels = (
    torch.tensor(
        [getattr(feature, field) for feature in test_features],
        dtype=torch.long,
    )
    for field in ("input_ids", "attention_mask", "token_type_ids", "label")
)

test_dataset = TensorDataset(
    test_input_ids, test_attention_mask, test_token_type_ids, test_the_labels
)

In [None]:
from sklearn.metrics import f1_score

In [None]:
def evaluate(model, tokenizer, eval_dataset, batch_size=16):
    """Score ``model`` on ``eval_dataset`` and return accuracy, F1 and loss.

    Args:
        model: Model called with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels``; its first two outputs must be
            the loss and the logits. Batches are moved to the device the
            model's parameters live on.
        tokenizer: Unused; kept so existing callers keep working.
        eval_dataset: Dataset whose items are
            ``(input_ids, attention_mask, token_type_ids, labels)`` tensors.
        batch_size: Examples per forward pass.

    Returns:
        ``(accuracy, f1, eval_loss)`` where ``eval_loss`` is the mean of the
        per-batch losses.

    Raises:
        ValueError: If ``eval_dataset`` is empty.
    """
    logger.info("***** Running evaluation  *****")
    logger.info("  Num examples = %d", len(eval_dataset))
    logger.info("  Batch size = %d", batch_size)

    if len(eval_dataset) == 0:
        raise ValueError("eval_dataset is empty; nothing to evaluate")

    # Follow the model's device rather than assuming "cuda".
    device = next(model.parameters()).device

    # Fixed order keeps batch composition, and so the averaged loss,
    # reproducible between runs.
    eval_dataloader = DataLoader(eval_dataset, shuffle=False, batch_size=batch_size)

    # Switch to inference mode once; nothing in the loop changes it back.
    model.eval()

    eval_loss = 0.0
    nb_eval_steps = 0
    logit_chunks = []
    label_chunks = []

    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        batch = tuple(t.to(device) for t in batch)

        with torch.no_grad():
            # Same inputs as training, including token_type_ids.
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
            }
            outputs = model(**inputs)
            tmp_eval_loss, logits = outputs[:2]

        eval_loss += tmp_eval_loss.mean().item()
        nb_eval_steps += 1
        # Collect per-batch arrays and concatenate once at the end, instead
        # of re-copying the growing array on every batch.
        logit_chunks.append(logits.detach().cpu().numpy())
        label_chunks.append(inputs["labels"].detach().cpu().numpy())

    eval_loss = eval_loss / nb_eval_steps

    preds = np.argmax(np.concatenate(logit_chunks, axis=0), axis=1)
    out_label_ids = np.concatenate(label_chunks, axis=0)

    accuracy, f1 = acc_and_f1(preds, out_label_ids)

    return accuracy, f1, eval_loss

In [None]:
def simple_accuracy(preds, labels):
    """Return the fraction of positions at which ``preds`` equals ``labels``.

    Args:
        preds: Predicted labels, as an array or any array-like (e.g. a list).
        labels: Reference labels with the same shape as ``preds``.

    Returns:
        The mean of the element-wise equality, as a NumPy float.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    # Coerce first: comparing two plain lists with == gives a single bool,
    # which has no .mean().
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    # Reject mismatched shapes explicitly so they cannot broadcast into a
    # misleading score.
    if preds.shape != labels.shape:
        raise ValueError(
            "preds and labels must have the same shape, got "
            f"{preds.shape} and {labels.shape}"
        )
    return (preds == labels).mean()

def acc_and_f1(preds, labels):
    """Return ``(accuracy, f1)`` for predictions against reference labels."""
    # Tuple elements are evaluated left to right: accuracy first, then F1.
    return (
        simple_accuracy(preds, labels),
        f1_score(y_true=labels, y_pred=preds),
    )

In [None]:
# Score the reloaded model on the held-out set and report the three metrics.
accuracy, f1, eval_loss = evaluate(
    model=model, tokenizer=tokenizer, eval_dataset=test_dataset
)

print("Accuracy: ", accuracy, "F1 Score: ", f1, "Loss: ", eval_loss)