In [16]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
# !pip install transformers==4.21
import wandb
%env WANDB_PROJECT=OPT_Text_Classification
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

env: WANDB_PROJECT=OPT_Text_Classification
/kaggle/input/bemas-project/train.csv
/kaggle/input/bemas-project/test.csv


In [17]:
train = pd.read_csv("../input/bemas-project/train.csv")
test = pd.read_csv("../input/bemas-project/test.csv")
val = test.iloc[0:100]
test = test.iloc[101:]


train_texts=train["message"].values.tolist()
val_texts = val["message"].values.tolist()
train_labels=train["labels"].values.tolist()
val_labels=val["labels"].values.tolist()
test_texts=test["message"].values.tolist()
test_labels=test["labels"].values.tolist()


l1 = []

for i in train_labels:
    if i=="Yes":
        l1.append(1)
    else:
        l1.append(0)

l2 = []
for i in val_labels:
    if i=="Yes":
        l2.append(1)
    else:
        l2.append(0)
    

l3 = []
for i in test_labels:
    if i=="Yes":
        l3.append(1)
    else:
        l3.append(0)

train_labels = l1.copy()
val_labels = l2.copy()
test_labels = l3.copy()
# print(train_texts)

In [18]:
from transformers import GPT2Tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('facebook/opt-125m', batched=True)

loading file https://huggingface.co/facebook/opt-125m/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/a047788adc333d1c9ea27f0685a699665269b5b28c818d27bc5c10e9406491c6.647b4548b6d9ea817e82e7a9231a320231a1c9ea24053cc9e758f3fe68216f05
loading file https://huggingface.co/facebook/opt-125m/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/c20b2086e29c267013167e7a833dd17832de9ee6d724a4f4165962b005e4cd68.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/facebook/opt-125m/resolve/main/added_tokens.json from cache at None
loading file https://huggingface.co/facebook/opt-125m/resolve/main/special_tokens_map.json from cache at /root/.cache/huggingface/transformers/45eb88dfd61e3579b84a15ad3cf2636d01e121b5474ad8944761aae4a66c62ea.c7cc7d24e97c79eaf304e87679fffb4f36cf739d549738da5cc604bf047de6ce
loading file https://huggingface.co/facebook/opt-125m/resolve/main/tokenizer_config.json from cache at 

In [19]:
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [20]:
import torch

class IMDbDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

In [21]:
# !pip install evaluate
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

In [22]:
# !pip install evaluate

In [23]:
# !transformers-cli env
from transformers import OPTForCausalLM

from transformers import OPTForSequenceClassification, Trainer, TrainingArguments

In [24]:
# trainer.evaluate()
from transformers import TrainerCallback

class CustomCallback(TrainerCallback):
    
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

In [26]:
from datasets import load_metric

# metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    metrics = ["accuracy", "recall", "precision", "f1"] #List of metrics to return
    metric={}
    for met in metrics:
       metric[met] = load_metric(met)
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    metric_res={}
    for met in metrics:
       metric_res[met]=metric[met].compute(predictions=predictions, references=labels)[met]
    return metric_res

# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(predictions=predictions, references=labels)

model = OPTForSequenceClassification.from_pretrained("facebook/opt-125m")
# training_args = TrainingArguments("test")
training_args = TrainingArguments(
    output_dir='./opt_distilbert_text_class',          # output directory
    num_train_epochs=3,              # total number of training epochs
    per_device_train_batch_size=2,  # batch size per device during training
    per_device_eval_batch_size=2,   # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    report_to="wandb",
)

# training_args = TrainingArguments("test", evaluation_strategy="no")
trainer       = Trainer(
    model         = model, 
    args          = training_args, 
    train_dataset = train_dataset, 
    eval_dataset  = val_dataset,
    compute_metrics = compute_metrics,
)
trainer.add_callback(CustomCallback(trainer)) 
trainer.train()
trainer.evaluate()
wandb.finish()

loading configuration file https://huggingface.co/facebook/opt-125m/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/8cc575ca4de298d186c89e2901ab76d16d44c199423b0759d658328abcbdd0cd.2a47ada990dde0d9edec3a77f0bfa418a1b0b20381f5d809d00b8d5306d9a09d
Model config OPTConfig {
  "_name_or_path": "facebook/opt-125m",
  "_remove_final_layer_norm": false,
  "activation_dropout": 0.0,
  "activation_function": "relu",
  "architectures": [
    "OPTForCausalLM"
  ],
  "attention_dropout": 0.0,
  "bos_token_id": 2,
  "do_layer_norm_before": true,
  "dropout": 0.1,
  "eos_token_id": 2,
  "ffn_dim": 3072,
  "hidden_size": 768,
  "init_std": 0.02,
  "layerdrop": 0.0,
  "max_position_embeddings": 2048,
  "model_type": "opt",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "prefix": "</s>",
  "torch_dtype": "float16",
  "transformers_version": "4.21.0",
  "use_cache": true,
  "vocab_size": 50272,
  "word_embed_proj_dim": 768
}

loading weights fi

Step,Training Loss
10,0.9256
20,0.6594
30,0.6896
40,0.6998
50,0.6594
60,0.768
70,0.6412
80,0.4932
90,1.0318
100,0.7562


Saving model checkpoint to ./opt/checkpoint-500
Configuration saved in ./opt/checkpoint-500/config.json
Model weights saved in ./opt/checkpoint-500/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 100
  Batch size = 2


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
eval/accuracy,▁██
eval/f1,▁██
eval/loss,▁▂█
eval/precision,▁██
eval/recall,▁▁▁
eval/runtime,█▁▂
eval/samples_per_second,▁█▇
eval/steps_per_second,▁▂█
train/epoch,▁▂▂▃▃▃▄▅▅▆█▁▂▂▂▃▄▄▅▅▆▆▇▇█▁▁▂▂▃▄▄▄▅▆▆▇▇██
train/global_step,▁▂▂▃▃▃▄▅▅▆▃▁▂▂▃▃▄▄▅▅▆▆▇▇█▁▁▂▃▃▄▄▄▅▆▆▇▇▇█

0,1
eval/accuracy,0.8
eval/f1,0.66667
eval/loss,1.20018
eval/precision,0.8
eval/recall,0.57143
eval/runtime,2.6476
eval/samples_per_second,37.77
eval/steps_per_second,18.885
train/epoch,3.0
train/global_step,657.0
