## 1.1. Libraries

In [78]:
# Utils
from dataclasses import dataclass
import os
import psutil
from tqdm.notebook import tqdm
from typing import List
import time

# EDA & DATA
import imageio.v3 as imageio
import pandas as pd
import matplotlib.pyplot as plt
import cv2

# ML
import albumentations as A
from albumentations.pytorch import ToTensorV2
import timm
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchmetrics
from xgboost import XGBRegressor
import xgboost as xgb


tqdm.pandas()

In [2]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE

'cuda'

# 2. Data

# 2.1. Read files

In [3]:
DATAPATH = '/kaggle/input/planttraits2024'

In [5]:
train = pd.read_csv(DATAPATH + '/train.csv')
train['file_path'] = train['id'].apply(lambda s: DATAPATH + f'/train_images/{s}.jpeg')
train['jpeg_bytes'] = train['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())

  0%|          | 0/55489 [00:00<?, ?it/s]

In [7]:
test = pd.read_csv(DATAPATH + '/test.csv', nrows=0)

In [63]:
all_test_cols = list(set(train.columns).difference(test.columns))
train_cols = [col for col in train.columns if col not in ['id'] + all_test_cols]
test_cols = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']

## 2.2. Clean outliers

In [9]:
print(f'Before cleaning df has {train.shape[0]} rows')
for col in test_cols:
    q_low = train[col].quantile(0.01)
    q_hi  = train[col].quantile(0.99)
    rows = train.shape[0]
    train = train[(train[col] < q_hi) & (train[col] > q_low)]
    print(f'\tCleaning: {col} removed {rows - train.shape[0]} rows')
print(f'After cleaning df has {train.shape[0]} rows')

Before cleaning df has 55489 rows
	Cleaning: X4_mean removed 1113 rows
	Cleaning: X11_mean removed 1090 rows
	Cleaning: X18_mean removed 1067 rows
	Cleaning: X50_mean removed 1047 rows
	Cleaning: X26_mean removed 1026 rows
	Cleaning: X3112_mean removed 1007 rows
After cleaning df has 49139 rows


## 2.4. Create datasets & dataloaders

In [10]:
class CustomDataset(Dataset):
    def __init__(self, df, y_cols: List[str], transforms=None):
        self.ys = df[y_cols].values
        self.img_col = 'jpeg_bytes'
        self.imgs = df[self.img_col].values
        
        self.row_cols = [col for col in train_cols if col not in ('id', 'file_path', self.img_col)]
        self.rows = df[self.row_cols].values
        
        self.transforms = transforms
        
        
    def __len__(self):
        return len(self.imgs)

    def __getitem__(self, ix):

        row = self.rows[ix]
        img = imageio.imread(self.imgs[ix])
        if self.transforms:
            img = self.transforms(image=img)['image']
        else:
            img = torch.from_numpy(img)
        y = self.ys[ix]

        return row, img, y

# 3. Model

In [12]:
@dataclass
class CONFIG:
    BACKBONE = 'swin_large_patch4_window12_384.ms_in22k_ft_in1k'
    TARGET_COLUMNS = ['X4_mean', 'X11_mean', 'X18_mean', 'X50_mean', 'X26_mean', 'X3112_mean']
    N_TARGETS = len(TARGET_COLUMNS)
    BATCH_SIZE = 10
    LR_MAX = 1e-4
    WEIGHT_DECAY = 0.01
    N_EPOCHS = 1
    TRAIN_MODEL = True
    IS_INTERACTIVE = os.environ['KAGGLE_KERNEL_RUN_TYPE'] == 'Interactive'
    
    N_TRAIN_SAMPLES = len(train)
    N_STEPS_PER_EPOCH = (N_TRAIN_SAMPLES // BATCH_SIZE)
    N_STEPS = N_STEPS_PER_EPOCH * N_EPOCHS + 1

In [13]:
data_config = timm.data.resolve_model_data_config(CONFIG.BACKBONE)
transforms = timm.data.create_transform(**data_config, is_training=False)

In [14]:
CONFIG.IMAGE_SIZE = 384 #data_config['input_size'][1]
CONFIG.IMAGE_MEAN = data_config['mean']
CONFIG.IMAGE_STD = data_config['std']
CONFIG.IMAGE_INTERPOLATION = data_config['interpolation']

In [83]:
XGBOOST_PARAMS = {'random_state': 0,
                  'multi_strategy' : "one_output_per_tree",
                  'tree_method': 'hist', #'exact',
                  'booster' : "gbtree",
                  'eval_metric' : "rmse",
                  'objective': 'reg:squarederror',
                  'colsample_bynode': 0.8274131159915409,
                  'colsample_bytree': 0.6807728406101965,
                  'gamma': 0.033759641187317335,
                  'learning_rate': 0.057359495247975095,
                  'max_depth': 9,
                  'min_child_weight': 57,
                  'reg_alpha': 1.9190306595715692,
                  'reg_lambda': 10.477908255864408,
                  'subsample': 0.7708761325192125,
         }

In [17]:
class ContinuesModel(nn.Module):
    def __init__(self):
        super().__init__()
        
        self.backbone = timm.create_model(
            CONFIG.BACKBONE,
            num_classes=CONFIG.N_TARGETS,
            pretrained=True)
        self._init_xgboost()
        self.weighted_join = nn.Linear(2*CONFIG.N_TARGETS, CONFIG.N_TARGETS)

    def _init_xgboost(self):
        sample_df = train.sample(n=100)
        self.xgboost = xgb.train(XGBOOST_PARAMS,
                         dtrain = xgb.DMatrix(sample_df[train_cols].values, sample_df[test_cols].values), num_boost_round=261
                        )
        add_params = {'updater':'refresh','process_type': 'update', 'refresh_leaf': True,}
        self.params = dict(XGBOOST_PARAMS.items() + add_params.items())
        
    def forward(self, row, img, y):
        if self.training:
            self.xgboost = xgb.train(self.params, dtrain=xgb.DMatrix(row, y), xgb_model=self.xgboost)
            row_path = self.xgboost.predict(xgb.DMatrix(row))
        else:
            row_path = self.xgboost.predict(xgb.DMatrix(row))
                    
        cat_path = torch.cat([img_path, row_path])
        out = self.weighted_join(cat_path)
        return out

In [81]:
# model.train()

# train_dataloader = DataLoader(
#         train_dataset,
#         batch_size=1000,
#         shuffle=True,
#         drop_last=True,
# #         num_workers=psutil.cpu_count(),
# )

# for step, (row_batch, img_batch, y_true) in enumerate(train_dataloader):
# #     img_batch = img_batch.to(DEVICE)
#     y_true = y_true#.to(DEVICE)
#     if model.training:
#         test_xgboost = xgb.train(params,
#                                  dtrain = xgb.DMatrix(row_batch, y_true.cpu()), 
#                                  num_boost_round=100,
#                                  xgb_model=test_xgboost)
#         row_path = test_xgboost.predict(xgb.DMatrix(row_batch))
#         r2_ = r2_score(y_true, row_path)
#         print(f'step:{step} {r2_}')
#     if step == 20:
#         break

In [None]:
class ImageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.backbone = timm.create_model(
                CONFIG.BACKBONE,
                num_classes=CONFIG.N_TARGETS,
                pretrained=True)
        
    def forward(self, inputs):
        return self.backbone(inputs)


class RegressionModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.model = XGBRegressor(**XGBOOST_PARAMS,
                                  device=DEVICE, )

    def forward(self, X_train, y_train):
        return  model.fit(cp.array(X_train), cp.array(y_train))


class StackedModel(nn.Module):
    def __init__(self, img_model, reg_model):
        self.img_model = img_model
        self.reg_model = reg_model
        self.weighted_join = nn.Linear(2*CONFIG.N_TARGETS, CONFIG.N_TARGETS)

    def forward(self, x):
        out = torch.cat([
            self.img_model(x),
            self.r_model(x)
        ])
        out = self.weighted_join(out)
        return out

## 2.3. Define Albumentations

In [80]:
data_config

{'input_size': (3, 224, 224),
 'interpolation': 'bicubic',
 'mean': (0.485, 0.456, 0.406),
 'std': (0.229, 0.224, 0.225),
 'crop_pct': 0.875,
 'crop_mode': 'center'}

In [79]:
transforms

Compose(
    Resize(size=256, interpolation=bicubic, max_size=None, antialias=warn)
    CenterCrop(size=(224, 224))
    ToTensor()
    Normalize(mean=tensor([0.4850, 0.4560, 0.4060]), std=tensor([0.2290, 0.2240, 0.2250]))
)

In [18]:
TRAIN_TRANSFORMS = A.Compose([
        A.HorizontalFlip(p=0.5),
        A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=0.25),
        A.ImageCompression(quality_lower=85, quality_upper=100, p=0.25),
        A.RandomSizedCrop(
                [448, 512],
                CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, w2h_ratio=1.0, p=0.75),
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE),
        A.ToFloat(),
        A.Normalize(mean=CONFIG.IMAGE_MEAN, std=CONFIG.IMAGE_STD, max_pixel_value=1),
        ToTensorV2(),
    ])

TEST_TRANSFORMS = A.Compose([
        A.Resize(CONFIG.IMAGE_SIZE, CONFIG.IMAGE_SIZE, interpolation=cv2.INTER_CUBIC),
        A.ToFloat(),
        A.Normalize(mean=CONFIG.IMAGE_MEAN, std=CONFIG.IMAGE_STD, max_pixel_value=1),
        ToTensorV2(),
    ])

## 3.1. Define architecture

## 3.2. Define training

## 3.3. Trainig...

In [21]:
train_dataset = CustomDataset(train, test_cols, TRAIN_TRANSFORMS)

train_dataloader = DataLoader(
        train_dataset,
        batch_size=CONFIG.BATCH_SIZE,
        shuffle=True,
        drop_last=True,
#         num_workers=psutil.cpu_count(),
)


In [20]:
model = Model().to(DEVICE)

class AverageMeter(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val):
        self.sum += val.sum()
        self.count += val.numel()
        self.avg = self.sum / self.count

MAE = torchmetrics.regression.MeanAbsoluteError().to(DEVICE)
R2 = torchmetrics.regression.R2Score(num_outputs=CONFIG.N_TARGETS, multioutput='uniform_average').to(DEVICE)
LOSS = AverageMeter()

LOSS_FN = nn.SmoothL1Loss() # r2_loss

optimizer = torch.optim.AdamW(
    params=model.parameters(),
    lr=CONFIG.LR_MAX,
    weight_decay=CONFIG.WEIGHT_DECAY,
)

LR_SCHEDULER = torch.optim.lr_scheduler.OneCycleLR(
        optimizer=optimizer,
        max_lr=CONFIG.LR_MAX,
        total_steps=CONFIG.N_STEPS,
        pct_start=0.1,
        anneal_strategy='cos',
        div_factor=1e1,
        final_div_factor=1e1,)

model.safetensors:   0%|          | 0.00/801M [00:00<?, ?B/s]

In [22]:
print("Start Training:")
for epoch in range(CONFIG.N_EPOCHS):
    model.train()
        
    for step, (row_batch, img_batch, y_true) in enumerate(train_dataloader):
        img_batch = img_batch.to(DEVICE)
        y_true = y_true.to(DEVICE)
        t_start = time.perf_counter_ns()
        y_pred = model(row_batch, img_batch)
        loss = LOSS_FN(y_pred, y_true)
        LOSS.update(loss)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        LR_SCHEDULER.step()
        
        print(f'Step" {step}/{CONFIG.N_STEPS_PER_EPOCH} finished')

torch.save(model, 'model.pth')

Start Training:
Step" 0/4913 finished
Step" 1/4913 finished
Step" 2/4913 finished
Step" 3/4913 finished
Step" 4/4913 finished
Step" 5/4913 finished
Step" 6/4913 finished
Step" 7/4913 finished
Step" 8/4913 finished
Step" 9/4913 finished
Step" 10/4913 finished
Step" 11/4913 finished
Step" 12/4913 finished
Step" 13/4913 finished
Step" 14/4913 finished
Step" 15/4913 finished
Step" 16/4913 finished
Step" 17/4913 finished
Step" 18/4913 finished
Step" 19/4913 finished
Step" 20/4913 finished
Step" 21/4913 finished
Step" 22/4913 finished
Step" 23/4913 finished
Step" 24/4913 finished
Step" 25/4913 finished
Step" 26/4913 finished
Step" 27/4913 finished
Step" 28/4913 finished
Step" 29/4913 finished
Step" 30/4913 finished
Step" 31/4913 finished
Step" 32/4913 finished
Step" 33/4913 finished
Step" 34/4913 finished
Step" 35/4913 finished
Step" 36/4913 finished
Step" 37/4913 finished
Step" 38/4913 finished
Step" 39/4913 finished
Step" 40/4913 finished
Step" 41/4913 finished
Step" 42/4913 finished
Step"

KeyboardInterrupt: 

In [85]:
#train_dataset[:100000]

# 4. Submission

In [None]:
test = pd.read_csv(DATAPATH + '/test.csv')
test['file_path'] = test['id'].apply(lambda s: DATAPATH + f'/test_images/{s}.jpeg')
test['jpeg_bytes'] = test['file_path'].progress_apply(lambda fp: open(fp, 'rb').read())

In [None]:
test_dataset = CustomDataset(test, ['id'])

## 4.1. Run interfarance