In [1]:
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install scikit
!pip install numpy
!pip install pandas

Collecting datasets
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m14.1 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m10.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xxhash, dill, multiprocess, datasets
Successfully installed dataset

In [61]:
from datasets import Dataset, DatasetDict, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer

from sklearn.datasets import load_iris

import numpy as np
import pandas as pd

## 1. Data preprocessing

### 1.1 Loading dataset

In [4]:
iris_data = load_iris()

In [5]:
iris_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [6]:
iris_data["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [7]:
iris_data["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [8]:
iris_df = pd.DataFrame(data = iris_data["data"], columns = iris_data["feature_names"])

In [9]:
iris_df["label"] = iris_data["target"]

In [10]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
iris_df["label_decoded"] = iris_df["label"].apply(lambda label_idx: iris_data["target_names"][label_idx])

In [12]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa



We will be performing classification task. The numerical labels are fine to achieve this goal.

However we need to transform all the individual four numerical features into one textual as LLM input. We will do this by concatenating all four values row-wise.

In [14]:
iris_df["text"] = iris_df.apply(lambda row:
                                str(row["sepal length (cm)"]) + " " +
                                str(row["sepal width (cm)"]) + " " +
                                str(row["petal length (cm)"]) + " " +
                                str(row["petal width (cm)"]), axis=1)

In [159]:
iris_df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded,text
0,5.1,3.5,1.4,0.2,0,setosa,5.1 3.5 1.4 0.2
1,4.9,3.0,1.4,0.2,0,setosa,4.9 3.0 1.4 0.2
2,4.7,3.2,1.3,0.2,0,setosa,4.7 3.2 1.3 0.2
3,4.6,3.1,1.5,0.2,0,setosa,4.6 3.1 1.5 0.2
4,5.0,3.6,1.4,0.2,0,setosa,5.0 3.6 1.4 0.2
...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2,virginica,6.7 3.0 5.2 2.3
146,6.3,2.5,5.0,1.9,2,virginica,6.3 2.5 5.0 1.9
147,6.5,3.0,5.2,2.0,2,virginica,6.5 3.0 5.2 2.0
148,6.2,3.4,5.4,2.3,2,virginica,6.2 3.4 5.4 2.3


To be able to train the model and later properly evaluate it we will not create separate train, validation and test data splits. Train split will contain 80% of data examples and validation and test splits 10% each. We want to reproduce the categories balance of the original dataset in each of the split so we will use a stratify split function.

Finally we will need to transform the DataFrame into Hugging Face Dataset format.

In [161]:
# TODO: Change labels to text and see how it works - better do it before transforming data to the Dataset object - to be able to run baseline model

In [15]:
dataset = Dataset.from_pandas(iris_df)

In [16]:
dataset

Dataset({
    features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
    num_rows: 150
})

In [17]:
dataset[0]

{'sepal length (cm)': 5.1,
 'sepal width (cm)': 3.5,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.1 3.5 1.4 0.2'}

To be able to train the model and later properly evaluate it we will not create separate train, validation and test data splits. Train split will contain 80% of data examples and validation and test splits 10% each. We want to reproduce the categories balance of the original dataset in each of the split so we will use a stratify split function.

In [19]:
# column we want to stratify with
stratify_column_name = "label"

# create class label column and stratify
t = dataset.class_encode_column(
    stratify_column_name
)

Stringifying the column:   0%|          | 0/150 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/150 [00:00<?, ? examples/s]

In [20]:
# column we want to stratify first needs to be transformed into categorical variable type
dataset_testvalid = dataset.class_encode_column("label"
                                                ).train_test_split(test_size=0.2, shuffle=True, stratify_by_column="label")

test_valid = dataset_testvalid["test"].train_test_split(test_size=0.5, shuffle=True, stratify_by_column="label")

dataset = DatasetDict({
    "train": dataset_testvalid["train"],
    "validation": test_valid["train"],
    "test": test_valid["test"]

})

Stringifying the column:   0%|          | 0/150 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/150 [00:00<?, ? examples/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 120
    })
    validation: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 15
    })
    test: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
        num_rows: 15
    })
})

In [22]:
dataset["train"][0]

{'sepal length (cm)': 4.9,
 'sepal width (cm)': 2.5,
 'petal length (cm)': 4.5,
 'petal width (cm)': 1.7,
 'label': 2,
 'label_decoded': 'virginica',
 'text': '4.9 2.5 4.5 1.7'}

In [23]:
# TODO: Create reference baseline model
# 2. Simple classification model baseline
# We can train a baseline logistic regression model to see how well it performs on Iris dataset and compare it to the LLM performance.
# Pytorch feed forward NN on numerical data

### 1.2 Tokenization

For simplicity of this experiment we will use BERT model.

In [74]:
MODEL_CHECKPOINT = "bert-base-uncased"

In [75]:
# TODO: Prepare data not as single string but as list of individual tokens
# use tokenizer with a is tokenized flag.

In [76]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [77]:
# We do not need padding or truncation - all values are of the same lengths.
def preprocess_data(example):
    return tokenizer(example["text"])

In [78]:
tokenized_dataset = dataset.map(preprocess_data, batched=True)

Map:   0%|          | 0/120 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

Map:   0%|          | 0/15 [00:00<?, ? examples/s]

In [79]:
tokenized_dataset

DatasetDict({
    train: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 120
    })
    validation: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
    test: Dataset({
        features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 15
    })
})

In [80]:
tokenized_dataset["train"][0]

{'sepal length (cm)': 4.9,
 'sepal width (cm)': 2.5,
 'petal length (cm)': 4.5,
 'petal width (cm)': 1.7,
 'label': 2,
 'label_decoded': 'virginica',
 'text': '4.9 2.5 4.5 1.7',
 'input_ids': [101,
  1018,
  1012,
  1023,
  1016,
  1012,
  1019,
  1018,
  1012,
  1019,
  1015,
  1012,
  1021,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [81]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

## 2. Model

### 2.1 Metrics

In [110]:
metric = load_metric("accuracy")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [132]:
# def compute_metrics(eval_pred):
#     logits, labels = eval_pred
#     predictions = np.argmax(logits, axis=-1)
#     return metric.compute(
#         predictions=predictions, references=labels)

In [133]:
def compute_metrics(eval_pred):
    metric1 = load_metric("precision")
    metric2 = load_metric("recall")
    metric3 = load_metric("f1")
    metric4 = load_metric("accuracy")

    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    precision = metric1.compute(predictions=predictions, references=labels, average="micro")["precision"]
    recall = metric2.compute(predictions=predictions, references=labels, average="micro")["recall"]
    f1 = metric3.compute(predictions=predictions, references=labels, average="micro")["f1"]
    accuracy = metric4.compute(predictions=predictions, references=labels)["accuracy"]

    return {"precision": precision, "recall": recall, "f1": f1, "accuracy": accuracy}

### 2.2 Model training

We can now load the base model with the sequence classification head that we will be fine-tuning here.

In [134]:
MODEL_CHECKPOINT

'bert-base-uncased'

In [135]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [136]:
training_args = TrainingArguments("bert-base-uncased-iris",
                                  num_train_epochs=15,
                                  logging_steps=50,
                                  evaluation_strategy="epoch",
                                  save_strategy="epoch",
                                  learning_rate=2e-5,
                                  weight_decay=0.01,
                                  push_to_hub=False)

In [137]:
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [138]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.840449,0.666667,0.666667,0.666667,0.666667
2,No log,0.669608,0.733333,0.733333,0.733333,0.733333
3,No log,0.490276,0.733333,0.733333,0.733333,0.733333
4,0.680100,0.239627,0.933333,0.933333,0.933333,0.933333
5,0.680100,0.149324,0.933333,0.933333,0.933333,0.933333
6,0.680100,0.05744,1.0,1.0,1.0,1.0
7,0.198900,0.041158,1.0,1.0,1.0,1.0
8,0.198900,0.018783,1.0,1.0,1.0,1.0
9,0.198900,0.024765,1.0,1.0,1.0,1.0
10,0.069800,0.012778,1.0,1.0,1.0,1.0


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
Checkpoint destination directory bert-base-uncased-iris/checkpoint-15 already exists and is non-empty. Saving will proceed but saved results may be invalid.
You can avoid this message in future by

TrainOutput(global_step=225, training_loss=0.22498465220133462, metrics={'train_runtime': 245.4119, 'train_samples_per_second': 7.335, 'train_steps_per_second': 0.917, 'total_flos': 12950113528800.0, 'train_loss': 0.22498465220133462, 'epoch': 15.0})

## 3. Evaluation

In [139]:
trainer.evaluate()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'eval_loss': 0.014550279825925827,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_accuracy': 1.0,
 'eval_runtime': 1.6103,
 'eval_samples_per_second': 9.315,
 'eval_steps_per_second': 1.242,
 'epoch': 15.0}

In [140]:
prediction = trainer.predict(tokenized_dataset["test"])

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [141]:
prediction[2]

{'test_loss': 0.34309566020965576,
 'test_precision': 0.9333333333333333,
 'test_recall': 0.9333333333333333,
 'test_f1': 0.9333333333333333,
 'test_accuracy': 0.9333333333333333,
 'test_runtime': 1.3399,
 'test_samples_per_second': 11.195,
 'test_steps_per_second': 1.493}

In [142]:
trainer.push_to_hub(commit_message="Training complete")

events.out.tfevents.1711026650.f81b7377183e.1062.12:   0%|          | 0.00/12.9k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

events.out.tfevents.1711026897.f81b7377183e.1062.13:   0%|          | 0.00/560 [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/msznajder/bert-base-uncased-iris/commit/b4ca160a47961a32c13c0926e1b52bfbb4bbbc32', commit_message='Training complete', commit_description='', oid='b4ca160a47961a32c13c0926e1b52bfbb4bbbc32', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# TODO: Compare to baseline.

In [143]:
from transformers import pipeline

model_checkpoint = "msznajder/bert-base-uncased-iris"
iris_sequence_classifier = pipeline("text-classification", model=model_checkpoint)

In [144]:
iris_df.iloc[0]

sepal length (cm)                5.1
sepal width (cm)                 3.5
petal length (cm)                1.4
petal width (cm)                 0.2
label                              0
label_decoded                 setosa
text                 5.1 3.5 1.4 0.2
Name: 0, dtype: object

In [145]:
iris_sequence_classifier("5.1 3.5 1.4 0.2")

[{'label': 'LABEL_0', 'score': 0.9951192140579224}]

In [146]:
iris_df.iloc[100]

sepal length (cm)                6.3
sepal width (cm)                 3.3
petal length (cm)                6.0
petal width (cm)                 2.5
label                              2
label_decoded              virginica
text                 6.3 3.3 6.0 2.5
Name: 100, dtype: object

In [147]:
iris_sequence_classifier("6.3 3.3 6.0 2.5")

[{'label': 'LABEL_2', 'score': 0.9905140399932861}]

In [148]:
iris_sequence_classifier("6.3 30.3 6.0 82.5")

[{'label': 'LABEL_2', 'score': 0.982669472694397}]

In [156]:
iris_sequence_classifier("10000.1 1700.8 10.9 107.1")

[{'label': 'LABEL_2', 'score': 0.6642792820930481}]

In [157]:
iris_sequence_classifier("10000.1 1700.8 1088.9 1777707.1")

[{'label': 'LABEL_2', 'score': 0.7526283860206604}]

In [158]:
iris_sequence_classifier("100.1 100.1 2000.1 2000.1")

[{'label': 'LABEL_2', 'score': 0.827082097530365}]

In [149]:
# TODO: Compare to baseline.

Overall, it pretty much behaves like a numerical classifier.