[Iris dataset competition](https://www.kaggle.com/competitions/iris-dataset-competition)

In [49]:
import os

# Locations of the competition files, relative to the notebook's working directory.
PATH = "data"
TRAIN_CSV = os.path.join(PATH, "iris_train.csv")
TEST_CSV = os.path.join(PATH, "iris_test.csv")

# Fail fast if anything is missing. The order matters for which error is
# reported first: data directory, then the training CSV, then the test CSV.
for _required_path in (PATH, TRAIN_CSV, TEST_CSV):
    if not os.path.exists(_required_path):
        raise FileNotFoundError(f"{_required_path} not found")


In [50]:
import pandas as pd
import numpy as np


def drop_unnamed_columns(df):
    """Remove, in place, every column whose label contains "unnamed".

    The match is a case-insensitive literal substring test, so it catches the
    "Unnamed: 0" column pandas creates when a CSV was saved with its index.

    Args:
        df: DataFrame to clean. It is mutated; nothing is returned.
    """
    # Column labels are not guaranteed to be strings (e.g. integer labels).
    # The `.str` accessor rejects all-integer labels and produces NaN for the
    # non-string entries of a mixed index, so compare on a string view instead.
    labels_as_text = df.columns.astype(str)
    is_unnamed = labels_as_text.str.contains("unnamed", case=False, regex=False)
    df.drop(columns=df.columns[is_unnamed], inplace=True)


# Read both competition splits from disk.
train_df = pd.read_csv(TRAIN_CSV)
test_df = pd.read_csv(TEST_CSV)

# Give the test split an all-zero int64 "target" placeholder column
# (presumably so both frames share the same column layout).
test_df["target"] = np.zeros(len(test_df), dtype=np.int64)

# drop_unnamed_columns mutates each frame in place.
for _frame in (train_df, test_df):
    drop_unnamed_columns(_frame)


In [51]:
# Preview the first rows of the training frame (features plus "target").
train_df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.7,2.9,4.2,1.3,1.0
1,7.6,3.0,6.6,2.1,2.0
2,5.6,3.0,4.5,1.5,1.0
3,5.1,3.5,1.4,0.2,0.0
4,7.7,2.8,6.7,2.0,2.0


In [52]:
# Preview the first rows of the test frame; its "target" column is the
# all-zero placeholder added when the frame was loaded.
test_df.head()


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,6.1,2.8,4.7,1.2,0
1,5.7,3.8,1.7,0.3,0
2,7.7,2.6,6.9,2.3,0
3,6.0,2.9,4.5,1.5,0
4,6.8,2.8,4.8,1.4,0


In [53]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np

RANDOM_SEED = 42  # makes the split reproducible
TEST_SIZE = 0.15  # fraction of rows held out

# Features are every column EXCEPT the label. Taking `train_df.values` as-is
# would copy "target" into X and leak the answer into the model inputs.
X = train_df.drop(columns=["target"]).to_numpy(dtype=np.float64)
Y = train_df["target"].astype(np.int64)

# stratify=Y asks for the same class proportions in both parts of the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=TEST_SIZE,
    random_state=RANDOM_SEED,
    stratify=Y,
)

# Fit the scaler on the training part only, then reuse its statistics for the
# held-out part so no information from the held-out rows reaches the scaler.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [54]:
import torch


class IrisDS(torch.utils.data.Dataset):
    def __init__(
        self,
        df: pd.DataFrame,
        has_labels: bool = True,
        mean: torch.Tensor | None = None,
        std: torch.Tensor | None = None,
        target_col: str = "target",
    ):
        self.has_labels = has_labels

        X = df.drop(columns=[target_col], errors="ignore").to_numpy(dtype=np.float32)
        self.X = torch.from_numpy(X)

        if has_labels:
            y = pd.Categorical(df[target_col]).codes.astype(np.int64)
            self.Y = torch.from_numpy(y)

        if mean is None or std is None:
            self.mean = self.X.mean(0, keepdim=True)
            self.std = self.X.std(0, unbiased=False, keepdim=True).clamp_min(1e-6)
        else:
            self.mean = torch.as_tensor(mean, dtype=self.X.dtype).view(1, -1)
            self.std = torch.as_tensor(std, dtype=self.X.dtype).view(1, -1)

        self.X = (self.X - self.mean) / self.std

        if has_labels and len(self.X) != len(self.Y):
            raise ValueError(
                f"Length of X ({len(self.X)}) and Y ({len(self.Y)}) must be equal"
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        x = self.X[idx]
        if self.has_labels:
            y = self.Y[idx]
            return x, y
        return x


In [55]:
from torch import optim, nn
import pytorch_lightning as pl


class IrisModel(pl.LightningModule):
    """Small fully connected classifier trained with cross-entropy loss.

    Args:
        iris_columns: number of input features per sample.
        species_num: number of output classes (length of the logit vector).
        lr: initial learning rate for Adam.
        lr_decay: amount subtracted from the learning rate at every scheduler
            step (linear decay, floored at zero).
        weight_decay: weight decay passed to Adam.
    """

    def __init__(
        self, iris_columns=4, species_num=3, lr=1e-3, lr_decay=1e-7, weight_decay=1e-4
    ):
        super().__init__()
        # ReLU + Dropout(0.2) after every hidden layer; the last layer emits raw
        # logits because CrossEntropyLoss applies log-softmax itself.
        self.net = nn.Sequential(
            nn.Linear(iris_columns, 5),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(5, 25),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(25, 35),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(35, 5),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(5, species_num),
        )
        self.loss_fn = nn.CrossEntropyLoss()
        self.lr = lr
        self.lr_decay = lr_decay
        self.weight_decay = weight_decay

    def forward(self, x):
        """Return class logits for a batch of feature rows."""
        return self.net(x)

    def _shared_step(self, batch, stage):
        """Compute loss and accuracy for one labeled batch and log both.

        Metrics are logged as ``{stage}_loss`` and ``{stage}_acc``.
        """
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        preds = logits.argmax(dim=1)
        acc = (preds == y).float().mean()
        self.log(f"{stage}_loss", loss, prog_bar=True)
        self.log(f"{stage}_acc", acc, prog_bar=True)
        return loss

    def training_step(self, batch, _):
        """Return the training loss for backpropagation."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, _):
        """Log validation metrics; nothing is returned."""
        self._shared_step(batch, "val")

    def test_step(self, batch, _):
        """Log test metrics; nothing is returned."""
        self._shared_step(batch, "test")

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        """Return logits for a batch that may or may not carry labels."""
        # Labeled loaders yield (x, y) pairs; unlabeled ones yield x alone.
        x = batch[0] if isinstance(batch, (list, tuple)) else batch
        return self(x)

    def configure_optimizers(self):
        """Adam plus a per-step linear learning-rate decay floored at zero."""
        optimizer = optim.Adam(
            self.parameters(), lr=self.lr, weight_decay=self.weight_decay
        )

        def handle_lr(step):
            # LambdaLR expects a multiplicative factor relative to the base lr,
            # hence the division by self.lr.
            if self.lr == 0:
                # Base lr is already zero: any factor yields zero, and this
                # avoids a ZeroDivisionError when the scheduler is constructed.
                return 1.0
            new_lr = self.lr - self.lr_decay * step
            return max(new_lr, 0.0) / self.lr

        scheduler = optim.lr_scheduler.LambdaLR(
            optimizer=optimizer,
            lr_lambda=handle_lr,
        )

        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                # Step the scheduler after every optimizer step, not per epoch.
                "interval": "step",
            },
        }


In [56]:
MAX_EPOCHS = 100
ENABLE_CHECKPOINTING = False
LOGGER = False
BATCH_SIZE = 8

# Weight initialization, batch shuffling and dropout are all random; seed the
# Python, NumPy and PyTorch generators so a re-run reproduces the same model.
pl.seed_everything(RANDOM_SEED, workers=True)

# Train on the full labeled frame. No validation loader is passed, so
# Lightning skips the validation loop.
ds = IrisDS(train_df)
loader = torch.utils.data.DataLoader(
    ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
)

model = IrisModel()
trainer = pl.Trainer(
    max_epochs=MAX_EPOCHS,
    enable_checkpointing=ENABLE_CHECKPOINTING,
    logger=LOGGER,
)

trainer.fit(model, loader)


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\0\anaconda3\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\configuration_validator.py:70: You defined a `validation_step` but have no `val_dataloader`. Skipping val loop.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type             | Params | Mode 
-----------------------------------------------------
0 | net     | Sequential       | 1.3 K  | train
1 | loss_fn | CrossEntropyLoss | 0      | train
-----------------------------------------------------
1.3 K     Trainable params
0         Non-trainable params
1.3 K     Total params
0.005     Total estimated model params size (MB)
15        Modules in train mode
0         Modules in eval mode
c:\Users\0\anaconda3\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasi

Epoch 99: 100%|██████████| 13/13 [00:00<00:00, 262.83it/s, train_loss=0.014, train_acc=1.000] 

`Trainer.fit` stopped: `max_epochs=100` reached.


Epoch 99: 100%|██████████| 13/13 [00:00<00:00, 250.15it/s, train_loss=0.014, train_acc=1.000]


In [57]:
# Build the training dataset again to obtain its normalization statistics, then
# apply those same statistics to the unlabeled test rows.
train_ds = IrisDS(train_df)
test_ds = IrisDS(test_df, has_labels=False, mean=train_ds.mean, std=train_ds.std)
test_loader = torch.utils.data.DataLoader(test_ds, batch_size=128, shuffle=False)

# One tensor of logits per batch; shuffle=False keeps them in row order.
pred_batches = trainer.predict(model, dataloaders=test_loader)

logits = torch.cat(
    [batch_logits.detach().cpu() for batch_logits in pred_batches], dim=0
)
pred_idx = logits.argmax(dim=1).numpy()

# Map predicted class indices back to the label values written to the file.
classes = [0, 1, 2]
pred_labels = [classes[class_idx] for class_idx in pred_idx]

# NOTE(review): IDs are positional (0..n-1) — confirm this matches the ID
# column the competition expects.
submission = pd.DataFrame({"ID": range(len(pred_labels)), "target": pred_labels})
submission.to_csv("submission.csv", index=False)


LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
c:\Users\0\anaconda3\envs\pytorch\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:433: The 'predict_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=23` in the `DataLoader` to improve performance.


Predicting DataLoader 0: 100%|██████████| 1/1 [00:00<00:00, 333.38it/s]
