In [1]:
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install scikit
!pip install numpy
!pip install pandas

Collecting accelerate
  Using cached accelerate-0.28.0-py3-none-any.whl (290 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch>=1.10.0->accelerate)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cufft_cu12-11

In [77]:
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.datasets import load_iris

import numpy as np
import pandas as pd

In [78]:
# TODO: Add train test split - stratified - later on

## 1. Data preprocessing

### 1.1 Loading dataset

In [79]:
iris_data = load_iris()

In [80]:
iris_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [81]:
iris_data["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [82]:
iris_data["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [83]:
iris_df = pd.DataFrame(data = iris_data["data"], columns = iris_data["feature_names"])

In [84]:
iris_df["label"] = iris_data["target"]

In [85]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [86]:
iris_df["label_decoded"] = iris_df["label"].apply(lambda label_idx: iris_data["target_names"][label_idx])

In [87]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [88]:
iris_df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica
149,5.9,3.0,5.1,1.8,2,virginica


Let's now transform this dataset into HuggingFace dataset format.

First we need to transform data itself.

We will be performing classification task. The numerical labels are fine to achieve this goal.

However we need to transform all the individual four numerical features into one textual as LLM input. We will do this by concatenating all four values row-wise.

In [89]:
iris_df["text"] = iris_df.apply(lambda row:
                                str(row["sepal length (cm)"]) + " " +
                                str(row["sepal width (cm)"]) + " " +
                                str(row["petal length (cm)"]) + " " +
                                str(row["petal width (cm)"]), axis=1)

Finally we will need to transform the DataFrame into Hugging Face Dataset format.

In [90]:
dataset = Dataset.from_pandas(iris_df)

In [91]:
dataset

Dataset({
    features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
    num_rows: 150
})

In [92]:
dataset[0]

{'sepal length (cm)': 5.1,
 'sepal width (cm)': 3.5,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.1 3.5 1.4 0.2'}

To be able to train the model and later properly evaluate it we will not create separate train, validation and test data splits. Train split will contain 80% of data examples and validation and test splits 10% each. We want to reproduce the categories balance of the original dataset in each of the split so we will use a stratify split function.

In [93]:
# TODO: Prepare validation and test splits

In [94]:
# column we want to stratify with
stratify_column_name = "label"

# create class label column and stratify
t = dataset.class_encode_column(
    stratify_column_name
)

Stringifying the column:   0%|          | 0/150 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/150 [00:00<?, ? examples/s]

In [95]:
# column we want to stratify first needs to be transformed into categorical variable type
dataset_testvalid = dataset.class_encode_column("label"
                                                ).train_test_split(test_size=0.2, shuffle=True, stratify_by_column="label")

test_valid = dataset_testvalid["test"].train_test_split(test_size=0.5, shuffle=True, stratify_by_column="label")

dataset = DatasetDict({
    "train": dataset_testvalid["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]

})

Stringifying the column:   0%|          | 0/150 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/150 [00:00<?, ? examples/s]

In [96]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 120
    })
    validation: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 15
    })
    test: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 15
    })
})

In [114]:
dataset["train"][0]

{'sepal length (cm)': 5.5,
 'sepal width (cm)': 4.2,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.5 4.2 1.4 0.2'}

In [115]:
# TODO: Create reference baseline model
# 2. Simple classification model baseline
# We can train a baseline logistic regression model to see how well it performs on Iris dataset and compare it to the LLM performance.
# Pytorch feed forward NN on numerical data

### 1.2 Tokenization

For simplicity of this experiment we will use BERT model.

In [116]:
MODEL_CHECKPOINT = "bert-base-uncased"

In [117]:
# TODO: Prepare data not as single string but as list of individual tokens
# use tokenizer with a is tokenized flag.

In [118]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [119]:
# We do not need padding or truncation - all values are of the same lengths.
def preprocess_data(example):
    return tokenizer(example["text"])

In [120]:
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [121]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
    validation: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
    test: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
})

In [122]:
tokenized_dataset["train"][0]

{'sepal length (cm)': 5.5,
 'sepal width (cm)': 4.2,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.5 4.2 1.4 0.2',
 'input_ids': [101,
  1019,
  1012,
  1019,
  1018,
  1012,
  1016,
  1015,
  1012,
  1018,
  1014,
  1012,
  1016,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [123]:
# TODO: Do I need colator - all my examples are of the same length
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 2. Model

### 2.1 Metrics

In [124]:
metric = load_metric("accuracy")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [125]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, references=labels)

In [126]:
# TODO: More metrics
# https://stackoverflow.com/questions/72367324/calculate-precision-recall-f1-score-for-custom-dataset-for-multiclass-classifi

# def custom_metrics(eval_pred):
#     metric1 = load_metric("precision")
#     metric2 = load_metric("recall")
#     metric3 = load_metric("f1")
#     metric4 = load_metric("accuracy")

#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)

#     precision = metric1.compute(predictions=predictions, references=labels, average="micro")["precision"]
#     recall = metric2.compute(predictions=predictions, references=labels, average="micro")["recall"]
#     f1 = metric3.compute(predictions=predictions, references=labels, average="micro")["f1"]
#     accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]

#     return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}


# trainer = Trainer(model, args, train_dataset=dds['train'], eval_dataset=dds['test'],
#                           tokenizer=tokenizer, compute_metrics=custom_metrics)

# trainer.train()



### 2.2 Model training

We can now load the base model with the sequence classification head that we will be fine-tuning here.

In [127]:
MODEL_CHECKPOINT

'bert-base-uncased'

In [128]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [129]:
training_args = TrainingArguments("bert-base-uncased-iris",
                                  num_train_epochs=30,
                                  logging_steps=50,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  push_to_hub=False)

In [133]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
    validation: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
    test: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
})

In [130]:
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"]
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator, # TODO: Remove collator?
    tokenizer=tokenizer # TODO: Remove tokenizer?
)

SyntaxError: invalid syntax. Perhaps you forgot a comma? (<ipython-input-130-241235509b0a>, line 5)

In [131]:
trainer.train()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [132]:
trainer.evaluate()

AttributeError: 'AcceleratorState' object has no attribute 'distributed_type'

In [58]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [59]:
trainer.push_to_hub(commit_message="Training complete")

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

events.out.tfevents.1710953062.d3ed05720702.5451.0:   0%|          | 0.00/7.39k [00:00<?, ?B/s]

events.out.tfevents.1710953571.d3ed05720702.5451.2:   0%|          | 0.00/411 [00:00<?, ?B/s]

events.out.tfevents.1710953265.d3ed05720702.5451.1:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/msznajder/bert-base-uncased-iris/commit/217ecde097d7e5042c77866c0db4ef2fee089a9f', commit_message='Training complete', commit_description='', oid='217ecde097d7e5042c77866c0db4ef2fee089a9f', pr_url=None, pr_revision=None, pr_num=None)

## 3. Evaluation

In [61]:
# TODO: Implement human evaluation

In [None]:
from transformers import pipeline

model_checkpoint = "msznajder/distilbert-base-uncased-imdb2"
sequence_classifier = pipeline("text-classification", model=model_checkpoint)
sequence_classifier("This was a very good movie.")

In [None]:
sequence_classifier("This was a bad movie.")

In [None]:
sequence_classifier("This was a good movie.")