In [1]:
!pip install datasets
!pip install accelerate
!pip install transformers
!pip install scikit
!pip install numpy
!pip install pandas

[31mERROR: Could not find a version that satisfies the requirement scikit (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for scikit[0m[31m


In [2]:
!pip install accelerate



In [3]:
from datasets import Dataset, load_metric
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments

from sklearn.datasets import load_iris

import numpy as np
import pandas as pd

In [4]:
# TODO: Add train test split - stratified - later on

## 1. Data preprocessing

In [5]:
iris_data = load_iris()

In [6]:
iris_data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [7]:
iris_data["feature_names"]

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [8]:
iris_data["target_names"]

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [9]:
iris_df = pd.DataFrame(data = iris_data["data"], columns = iris_data["feature_names"])

In [10]:
iris_df["label"] = iris_data["target"]

In [11]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [12]:
iris_df["label_decoded"] = iris_df["label"].apply(lambda label_idx: iris_data["target_names"][label_idx])

In [13]:
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded
0,5.1,3.5,1.4,0.2,0,setosa
1,4.9,3.0,1.4,0.2,0,setosa
2,4.7,3.2,1.3,0.2,0,setosa
3,4.6,3.1,1.5,0.2,0,setosa
4,5.0,3.6,1.4,0.2,0,setosa


In [14]:
iris_df.tail()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),label,label_decoded
145,6.7,3.0,5.2,2.3,2,virginica
146,6.3,2.5,5.0,1.9,2,virginica
147,6.5,3.0,5.2,2.0,2,virginica
148,6.2,3.4,5.4,2.3,2,virginica
149,5.9,3.0,5.1,1.8,2,virginica


Let's now transform this dataset into HuggingFace dataset format.

First we need to transform data itself.

We will be performing classification task. The numerical labels are fine to achieve this goal.

However we need to transform all the individual four numerical features into one textual as LLM input. We will do this by concatenating all four values row-wise.

In [15]:
iris_df["text"] = iris_df.apply(lambda row:
                                str(row["sepal length (cm)"]) + " " +
                                str(row["sepal width (cm)"]) + " " +
                                str(row["petal length (cm)"]) + " " +
                                str(row["petal width (cm)"]), axis=1)

## 2. Simple classification model baseline

We can train a baseline logistic regression model to see how well it performs on Iris dataset and compare it to the LLM performance.

In [15]:
# LR

In [45]:
# Pytorch feed forward NN on numerical data

## 3. LLM based classification model

### 3.1 Tokenization

First we will need to transform the DataFrame into Hugging Face Dataset format.

In [16]:
dataset = Dataset.from_pandas(iris_df)

In [17]:
dataset

Dataset({
    features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text'],
    num_rows: 150
})

In [18]:
dataset[0]

{'sepal length (cm)': 5.1,
 'sepal width (cm)': 3.5,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.1 3.5 1.4 0.2'}

For simplicity of this experiment we will use BERT model.

In [19]:
MODEL_CHECKPOINT = "bert-base-uncased"

In [20]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_CHECKPOINT)

In [21]:
# We do not need padding or truncation - all values are of the same lengths.
def preprocess_data(example):
    return tokenizer(example["text"])

In [22]:
tokenized_dataset = dataset.map(preprocess_data)

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [23]:
tokenized_dataset

Dataset({
    features: ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)', 'label', 'label_decoded', 'text', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 150
})

In [24]:
tokenized_dataset[0]

{'sepal length (cm)': 5.1,
 'sepal width (cm)': 3.5,
 'petal length (cm)': 1.4,
 'petal width (cm)': 0.2,
 'label': 0,
 'label_decoded': 'setosa',
 'text': '5.1 3.5 1.4 0.2',
 'input_ids': [101,
  1019,
  1012,
  1015,
  1017,
  1012,
  1019,
  1015,
  1012,
  1018,
  1014,
  1012,
  1016,
  102],
 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

In [25]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### 3.2 Model training

We can now load the base model with the sequence classification head that we will be fine-tuning here.

In [26]:
MODEL_CHECKPOINT

'bert-base-uncased'

In [27]:
model = AutoModelForSequenceClassification.from_pretrained(MODEL_CHECKPOINT, num_labels=3)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [34]:
training_args = TrainingArguments("bert-base-uncased-iris",
                                  num_train_epochs=30,
                                  logging_steps=50,
                                  push_to_hub=False)

In [35]:
metric = load_metric("accuracy")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [40]:
metric = load_metric("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(
        predictions=predictions, references=labels)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [41]:
trainer = Trainer(
    model,
    training_args,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset, # train_dataset=tokenized_dataset["train"]
    eval_dataset=tokenized_dataset, # eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [42]:
trainer.train()

Step,Training Loss
50,0.0797
100,0.1063
150,0.0154
200,0.0001
250,0.0434
300,0.0251
350,0.0099
400,0.017
450,0.0002
500,0.0001


Checkpoint destination directory bert-base-uncased-iris/checkpoint-500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=570, training_loss=0.026071341001232595, metrics={'train_runtime': 39.7888, 'train_samples_per_second': 113.097, 'train_steps_per_second': 14.326, 'total_flos': 32375283822000.0, 'train_loss': 0.026071341001232595, 'epoch': 30.0})

In [43]:
trainer.evaluate()

{'eval_loss': 3.600795389502309e-05,
 'eval_accuracy': 1.0,
 'eval_runtime': 0.2942,
 'eval_samples_per_second': 509.773,
 'eval_steps_per_second': 64.571,
 'epoch': 30.0}

In [44]:
model.eval()

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12,

In [None]:
# trainer.push_to_hub(commit_message="Training complete")

## 4. Prediction

In [None]:
from transformers import pipeline

model_checkpoint = "msznajder/distilbert-base-uncased-imdb2"
sequence_classifier = pipeline("text-classification", model=model_checkpoint)
sequence_classifier("This was a very good movie.")

In [None]:
sequence_classifier("This was a bad movie.")

In [None]:
sequence_classifier("This was a good movie.")