## Importing Libraries

In [17]:
import pandas as pd
import numpy as np
from sklearn import metrics
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
import transformers
from transformers import BertTokenizer, BertModel, BertConfig
from transformers import AutoModelForSequenceClassification
from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import evaluate

## Instantiating StableLM Model - 3B

In [2]:
model = "stabilityai/stablelm-base-alpha-3b-v2"
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForCausalLM.from_pretrained(
  model,
  trust_remote_code=True,
  torch_dtype="auto",
  problem_type="multi_label_classification",
  num_labels=6
)
model.cuda()

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.61it/s]


StableLMAlphaForCausalLM(
  (transformer): StableLMAlphaModel(
    (embed): Embedding(50432, 2560)
    (layers): ModuleList(
      (0-31): 32 x DecoderLayer(
        (norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
        (attention): Attention(
          (qkv_proj): Linear(in_features=2560, out_features=7680, bias=False)
          (out_proj): Linear(in_features=2560, out_features=2560, bias=False)
          (rotary_emb): RotaryEmbedding()
        )
        (mlp): MLP(
          (gate_proj): Linear(in_features=2560, out_features=13824, bias=False)
          (out_proj): Linear(in_features=6912, out_features=2560, bias=False)
          (act): SiLU()
        )
      )
    )
    (final_norm): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=2560, out_features=50432, bias=False)
)

## Importing Toxicity Dataset

In [7]:
train_data = pd.read_csv('/workspaces/LLM-Experimentation-Capstone/00_source_data/jigsaw_toxicity/train.csv')
test_data = pd.read_csv('/workspaces/LLM-Experimentation-Capstone/00_source_data/jigsaw_toxicity/test.csv')
test_data_labels = pd.read_csv('/workspaces/LLM-Experimentation-Capstone/00_source_data/jigsaw_toxicity/test_labels.csv')
train_labels = [[float(x), float(y), float(z), float(a), float(b), float(c)] for x, y, z, a, b, c in zip(train_data['toxic'], train_data['severe_toxic'], train_data['obscene'], 
                                                   train_data['threat'], train_data['insult'], train_data['identity_hate'])]
test_labels = [[float(x), float(y), float(z), float(a), float(b), float(c)] for x, y, z, a, b, c in zip(test_data_labels['toxic'], test_data_labels['severe_toxic'], test_data_labels['obscene'], 
                                                   test_data_labels['threat'], test_data_labels['insult'], test_data_labels['identity_hate'])]
print("Length of training data: ", len(train_data))
print("Length of test data: ", len(test_data))

Length of training data:  159571
Length of test data:  153164


In [14]:
class TextClassifierDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

train_encodings = tokenizer([str(i) for i in train_data['comment_text'].values], padding="max_length", truncation=True, max_length=512)
eval_encodings = tokenizer([str(i) for i in test_data['comment_text'].values], padding="max_length", truncation=True, max_length=512)

train_dataset = TextClassifierDataset(train_encodings, train_labels)
eval_dataset = TextClassifierDataset(eval_encodings, test_labels)

Using pad_token, but it is not set yet.


## Performing Inference on Baseline Model

In [15]:
from transformers import AutoModelForCausalLM, AutoTokenizer
inputs = tokenizer("The weather is always wonderful", return_tensors="pt").to("cuda")
tokens = model.generate(
  **inputs,
  max_new_tokens=64,
  temperature=0.75,
  top_p=0.95,
  do_sample=True,
)
print(tokenizer.decode(tokens[0], skip_special_tokens=True))

Setting `pad_token_id` to `eos_token_id`:0 for open-end generation.


The weather is always wonderful here at the lake."
"Oh, yes. I've lived here my entire life and we have had our share of bad weather."
"Well, you can tell me more about the weather if you have time. I've never lived in such a large city. I've never been able to see the whole


## Fine-Tuning Baseline Model

In [18]:
# hyperparameters - obtained from the original paper
training_args = TrainingArguments(
    output_dir=".",
    evaluation_strategy="epoch",
    per_device_train_batch_size=8192,
    per_device_eval_batch_size=512,
    num_train_epochs=40,
    learning_rate = 0.0001
)

def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mrpc")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [19]:
trainer.train()



OutOfMemoryError: CUDA out of memory. Tried to allocate 40.00 GiB (GPU 0; 15.77 GiB total capacity; 10.62 GiB already allocated; 4.77 GiB free; 10.63 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF