In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load
!pip install transformers==4.21
import wandb
# %env WANDB_PROJECT=OPT_Text_Classification
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input mount and echo the full path of every file.
for folder, _, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train / test CSVs from the attached Kaggle dataset.
train = pd.read_csv("../input/bemas-project/train.csv")
test_full = pd.read_csv("../input/bemas-project/test.csv")

# Carve a validation split out of the test file: the first VAL_SIZE rows become
# `val`, every remaining row stays in `test`. The second slice has to start at
# VAL_SIZE (not VAL_SIZE + 1); otherwise the row at position VAL_SIZE is
# silently dropped from both splits.
VAL_SIZE = 100
val = test_full.iloc[:VAL_SIZE]
test = test_full.iloc[VAL_SIZE:]

# The two slices must partition the original test file exactly.
assert len(val) + len(test) == len(test_full)


def encode_labels(raw_labels):
    """Convert raw label values into binary integer targets.

    Args:
        raw_labels: iterable of raw values taken from the "labels" column.

    Returns:
        list[int]: 1 for every value equal to the string "Yes" (exact match),
        0 for anything else.
    """
    return [1 if raw_label == "Yes" else 0 for raw_label in raw_labels]


# Message strings for each split, as plain Python lists.
train_texts = train["message"].values.tolist()
val_texts = val["message"].values.tolist()
test_texts = test["message"].values.tolist()

# Binary targets for each split (1 == "Yes", 0 == everything else).
train_labels = encode_labels(train["labels"].values.tolist())
val_labels = encode_labels(val["labels"].values.tolist())
test_labels = encode_labels(test["labels"].values.tolist())
# print(train_texts)

In [None]:
# Import the class that is actually used below. The previous import named
# `BERTTokenizer`, which transformers does not export, and left
# `GPT2Tokenizer` undefined.
from transformers import GPT2Tokenizer

# Tokenizer matching the pretrained "gpt2" checkpoint.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Pad on the RIGHT. GPT2ForSequenceClassification classifies from the last
# non-padding token of each row and locates it by counting non-pad tokens, so
# that index is only correct when padding follows the text. GPT-2 also uses
# absolute position embeddings, so left padding would shift the position of
# every real token.
tokenizer.padding_side = "right"

# GPT-2 has no dedicated PAD token; reuse EOS (id 50256) as PAD.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Tokenize each split with identical settings: truncate over-long inputs and
# pad every sequence within a split to that split's longest sequence.
train_encodings, val_encodings, test_encodings = (
    tokenizer(split_texts, truncation=True, padding=True)
    for split_texts in (train_texts, val_texts, test_texts)
)

In [None]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with per-example labels.

    Args:
        encodings: mapping from field name to a per-example indexable
            collection (anything exposing ``.items()``).
        labels: indexable collection of labels, one per example; its length
            defines the dataset length.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Return the number of examples, i.e. the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors.

        The dict holds one tensor per encoding field plus a ``'labels'``
        tensor built from the label at the same index.
        """
        sample = {}
        for field, column in self.encodings.items():
            sample[field] = torch.tensor(column[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels in a torch Dataset.
train_dataset, val_dataset, test_dataset = (
    IMDbDataset(split_encodings, split_labels)
    for split_encodings, split_labels in (
        (train_encodings, train_labels),
        (val_encodings, val_labels),
        (test_encodings, test_labels),
    )
)

In [None]:
# !pip install evaluate
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [None]:
# !pip install evaluate

In [None]:
# !transformers-cli env
# from transformers import OPTForCausalLM

from transformers import GPT2ForSequenceClassification, Trainer, TrainingArguments

In [None]:
# trainer.evaluate()
# deepcopy is used in on_epoch_end below; it was previously never imported,
# which raised NameError the first time the evaluation branch ran.
from copy import deepcopy

from transformers import TrainerCallback


class CustomCallback(TrainerCallback):
    """Callback that additionally evaluates on the training set at epoch end.

    Whenever an evaluation is scheduled at the end of an epoch, the wrapped
    trainer is evaluated on its own ``train_dataset`` and the resulting
    metrics are reported under the ``"train"`` key prefix.

    Args:
        trainer: the Trainer this callback is attached to; its ``evaluate``
            method and ``train_dataset`` attribute are used.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training set if an evaluation is due this epoch.

        Returns:
            A copy of ``control`` taken before the extra evaluation when
            ``control.should_evaluate`` is set, otherwise ``None``.
        """
        if control.should_evaluate:
            # Snapshot the control flags first: the nested evaluate() call
            # presumably resets should_evaluate on the shared control object,
            # and returning the snapshot keeps the regular evaluation scheduled.
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [None]:
from datasets import load_metric

# metric = load_metric("accuracy")

# Names of the metrics to report, and a cache of the loaded metric objects so
# they are loaded once instead of on every evaluation call.
_METRIC_NAMES = ("accuracy", "recall", "precision", "f1")
_LOADED_METRICS = {}


def compute_metrics(eval_pred):
    """Compute accuracy, recall, precision and F1 for one evaluation pass.

    Args:
        eval_pred: a ``(logits, labels)`` pair; the predicted class is the
            argmax of ``logits`` over the last axis.

    Returns:
        dict: metric name -> value, with the keys "accuracy", "recall",
        "precision" and "f1".
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    results = {}
    for name in _METRIC_NAMES:
        # Load each metric lazily on first use, then reuse the cached object.
        if name not in _LOADED_METRICS:
            _LOADED_METRICS[name] = load_metric(name)
        scores = _LOADED_METRICS[name].compute(predictions=predictions, references=labels)
        results[name] = scores[name]
    return results

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

# CustomCallback.on_epoch_end calls deepcopy() once an evaluation is scheduled;
# bind the name in the notebook namespace before training starts so that the
# per-epoch evaluation enabled below cannot fail on a missing import.
from copy import deepcopy

# Pretrained GPT-2 with a sequence-classification head.
model = GPT2ForSequenceClassification.from_pretrained("gpt2")

# Keep the embedding matrix in sync with the tokenizer vocabulary size.
model.resize_token_embeddings(len(tokenizer))

# The model needs a pad token id to locate the last real token of each row;
# use EOS, matching the tokenizer's pad token.
model.config.pad_token_id = model.config.eos_token_id

training_args = TrainingArguments(
    output_dir='./gpt2_text_class',  # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,   # batch size per device during training
    per_device_eval_batch_size=2,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    # Evaluate at the end of every epoch. Without this the strategy stays at
    # its default ("no"), no evaluation is ever scheduled during training and
    # CustomCallback never runs its train-set evaluation.
    evaluation_strategy="epoch",
    report_to="wandb",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
)
# Also report metrics on the training set whenever an evaluation is scheduled.
trainer.add_callback(CustomCallback(trainer))
trainer.train()

# Final evaluation on the validation split.
# NOTE(review): test_dataset is not evaluated anywhere in this cell.
trainer.evaluate()
wandb.finish()

In [None]:
# # resize model embedding to match new tokenizer
# model.resize_token_embeddings(len(tokenizer))

# # fix model padding token id
# model.config.pad_token_id = model.config.eos_token_id