In [None]:
!pip install -Uq lightgbm fastai timm xgboost

In [None]:
import warnings
# Suppress every Python warning for the remainder of the notebook.
# NOTE(review): this also hides deprecation and convergence warnings — consider narrowing the filter.
warnings.filterwarnings('ignore')

In [None]:
# Set this to True to train the vision models and generate fresh validation
# and test predictions. Set it to False to skip building/training the learners
# and load the previously saved prediction files instead.
fine_tune = True

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt

## Training / Validation / Test Setup

In [None]:
from pathlib import Path
# Root folder of the competition data (Kaggle input directory layout).
competition_path = Path('../input/cs-480-2024-spring/data')

# Raw train/test tables; the image-path variants are derived from these later.
original_training_df, original_testing_df = (
    pd.read_csv(competition_path / csv_name) for csv_name in ('train.csv', 'test.csv')
)

In [None]:
# Copy of the training table whose `id` column holds the path to each training image.
training_df_presplit = original_training_df.assign(
    id=original_training_df.id.map(
        lambda image_id: competition_path / 'train_images' / f"{image_id}.jpeg"
    )
)

In [None]:
# Copy of the test table whose `id` column holds the path to each test image.
testing_df = original_testing_df.assign(
    id=original_testing_df.id.map(
        lambda image_id: competition_path / 'test_images' / f"{image_id}.jpeg"
    )
)

In [None]:
from sklearn.model_selection import train_test_split

# Seeded 80/20 split: `validation_df` is the hold-out set used to score and stack all models.
training_df, validation_df = train_test_split(
    training_df_presplit,
    test_size=0.2,
    random_state=20877679,
)

In [None]:
# Column bookkeeping shared by every model in the notebook.
id_col = "id"
y_columns = ['X4', 'X11', 'X18', 'X26', 'X50', 'X3112']  # names used in the submission file
y_columns_mean = [f"{trait}_mean" for trait in y_columns]  # target columns in the training table
# Tabular features are every test-table column except the image id/path.
x_columns = [name for name in testing_df.columns if name != id_col]

In [None]:
# Hold-out features and targets (targets stay on their original scale).
validation_X_df, validation_Y_df = validation_df[x_columns], validation_df[y_columns_mean]

In [None]:
# Tabular features of the test set, in the same column order as the training features.
testing_X_df = testing_df[x_columns]

## LightGBM Library

In [None]:
import lightgbm as lgb

In [None]:
# Inner split of the training portion; `valid_df` is used only for LightGBM early stopping.
# Seeded (same seed as the outer split) so the boosters are reproducible on Restart & Run All.
train_df, valid_df = train_test_split(training_df, test_size=0.2, random_state=20877679)

In [None]:
# Feature / target frames for the LightGBM fit and early-stopping sets.
train_X_df_lgb = train_df[x_columns]
train_Y_df_lgb = train_df[y_columns_mean]
valid_X_df_lgb = valid_df[x_columns]
valid_Y_df_lgb = valid_df[y_columns_mean]

In [None]:
# One LightGBM Dataset per target column: the features are shared, only the label differs.
lgb_train_datasets = {}
for y_column in y_columns_mean:
    lgb_train_datasets[y_column] = lgb.Dataset(train_X_df_lgb, label=train_Y_df_lgb[y_column])

# Each early-stopping Dataset references the training Dataset of the same target.
lgb_valid_datasets = {}
for y_column in y_columns_mean:
    lgb_valid_datasets[y_column] = lgb.Dataset(
        valid_X_df_lgb,
        label=valid_Y_df_lgb[y_column],
        reference=lgb_train_datasets[y_column],
    )

In [None]:
from sklearn.metrics import r2_score

def r2_score_eval(y_pred, data):
    """Custom LightGBM evaluation function reporting R².

    Args:
        y_pred: predictions for the evaluated dataset.
        data: dataset object exposing ``get_label()``.

    Returns:
        ``('r2_score', value, True)``; the trailing ``True`` marks the metric as higher-is-better.
    """
    score = r2_score(data.get_label(), y_pred)
    return 'r2_score', score, True

In [None]:
# LightGBM training parameters. The built-in metric is disabled ("None") because
# evaluation is done through the custom R² function passed as `feval`.
params = dict(
    objective='mse',
    metric="None",
    verbosity=0,
    # device_type="gpu",
    learning_rate=0.05,
    num_leaves=64,
)

In [None]:
# Train one booster per target, stopping once validation R² has not improved for 10 rounds.
lgb_boosters = {}

for y_column in y_columns_mean:
    booster = lgb.train(
        params,
        num_boost_round=1000,
        train_set=lgb_train_datasets[y_column],
        valid_sets=lgb_valid_datasets[y_column],
        feval=r2_score_eval,
        # Callbacks are built per target so early-stopping state is never shared between runs.
        callbacks=[
            lgb.log_evaluation(period=20),
            lgb.early_stopping(stopping_rounds=10),
        ],
    )
    lgb_boosters[y_column] = booster
    del booster

In [None]:
def predict_lgb(df):
    """Predict every target with its LightGBM booster.

    Args:
        df: feature table passed unchanged to each booster's ``predict``.

    Returns:
        Array with one row per sample and one column per entry of ``y_columns_mean``
        (same column order).
    """
    per_target = [lgb_boosters[target].predict(df) for target in y_columns_mean]
    # Stacked as (targets, samples); transpose to (samples, targets).
    return np.array(per_target).T

In [None]:
# Score LightGBM on the hold-out validation split.
validation_actual = validation_Y_df.values
validation_preds_lgb = predict_lgb(validation_X_df)

r2_score(validation_actual, validation_preds_lgb)

In [None]:
# LightGBM predictions for the test set, kept for the final ensemble.
test_preds_lgb = predict_lgb(testing_X_df)

## XGBoost

In [None]:
import xgboost as xgb

In [None]:
# Inner split of the training portion; `valid_df_xgb` is used only for XGBoost early stopping.
# Seeded (same seed as the outer split) so the boosters are reproducible on Restart & Run All.
train_df_xgb, valid_df_xgb = train_test_split(training_df, test_size=0.2, random_state=20877679)

In [None]:
# Feature / target frames for the XGBoost fit and early-stopping sets.
train_X_df_xgb = train_df_xgb[x_columns]
train_Y_df_xgb = train_df_xgb[y_columns_mean]
valid_X_df_xgb = valid_df_xgb[x_columns]
valid_Y_df_xgb = valid_df_xgb[y_columns_mean]

In [None]:
# One DMatrix per target column: the features are shared, only the label differs.
xgb_train_datasets = {}
for y_column in y_columns_mean:
    xgb_train_datasets[y_column] = xgb.DMatrix(train_X_df_xgb, label=train_Y_df_xgb[y_column])

xgb_valid_datasets = {}
for y_column in y_columns_mean:
    xgb_valid_datasets[y_column] = xgb.DMatrix(valid_X_df_xgb, label=valid_Y_df_xgb[y_column])

In [None]:
from sklearn.metrics import r2_score

def r2_score_xgb(y_pred, dtrain):
    """Custom XGBoost metric reporting R².

    Args:
        y_pred: predictions for the evaluated matrix.
        dtrain: matrix object exposing ``get_label()``.

    Returns:
        ``("r2_score", value)``.
    """
    score = r2_score(dtrain.get_label(), y_pred)
    return "r2_score", score

In [None]:
# XGBoost parameters shared by the boosters of all targets.
common_params = dict(
    booster="gbtree",
    # device="cuda",
    objective="reg:squarederror",
    verbosity=0,
    max_depth=4,
    eta=0.1,
)

In [None]:
# Train one booster per target; R² on the early-stopping set is maximised and
# training stops after 30 rounds without improvement.
xgb_boosters = {}

for y_column in y_columns_mean:
    print(y_column)
    booster = xgb.train(
        common_params,
        xgb_train_datasets[y_column],
        evals=[(xgb_valid_datasets[y_column], "valid")],
        custom_metric=r2_score_xgb,
        maximize=True,
        num_boost_round=1000,
        early_stopping_rounds=30,
        verbose_eval=30,
    )
    xgb_boosters[y_column] = booster
    del booster

In [None]:
def predict_xgb(df):
    """Predict every target with its XGBoost booster.

    Args:
        df: feature table; it is wrapped in a single ``xgb.DMatrix`` shared by all boosters.

    Returns:
        Array with one row per sample and one column per entry of ``y_columns_mean``
        (same column order).
    """
    features = xgb.DMatrix(df)
    per_target = [xgb_boosters[target].predict(features) for target in y_columns_mean]
    # Stacked as (targets, samples); transpose to (samples, targets).
    return np.array(per_target).T

In [None]:
# Score XGBoost on the hold-out validation split.
validation_actual = validation_Y_df.values
validation_preds_xgb = predict_xgb(validation_X_df)

r2_score(validation_actual, validation_preds_xgb)

In [None]:
# XGBoost predictions for the test set (currently commented out of the ensemble lists below).
test_preds_xgb = predict_xgb(testing_X_df)

## Tabular data preprocessing

In [None]:
# Independent copies of the training table; their target columns are overwritten
# with scaled values in the following cells.
training_df_standard, training_df_minmax = training_df.copy(), training_df.copy()

In [None]:
from sklearn.preprocessing import MinMaxScaler, FunctionTransformer, PowerTransformer, StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Standardise the targets; the scaler is fitted on the training portion only and is
# reused later to map network outputs back to the original scale.
y_standard_scaler = StandardScaler().fit(training_df[y_columns_mean])

y_scaled = y_standard_scaler.transform(training_df[y_columns_mean])
training_df_standard[y_columns_mean] = y_scaled

In [None]:
# Min-max scale the targets; the scaler is fitted on the training portion only and is
# reused later to map network outputs back to the original scale.
y_minmax_scaler = MinMaxScaler().fit(training_df[y_columns_mean])

y_scaled = y_minmax_scaler.transform(training_df[y_columns_mean])
training_df_minmax[y_columns_mean] = y_scaled

In [None]:
from fastai.metrics import AccumMetric, mse
from sklearn.metrics import r2_score

# scikit-learn's r2_score wrapped as a fastai metric; the training callbacks monitor it
# under the name 'r2_score'.
# NOTE(review): invert_arg=True presumably makes fastai call r2_score(targets, preds) — confirm.
r2_score_fastai = AccumMetric(r2_score, to_np=True, invert_arg=True, flatten=False)

## Convolutional Neural Nets

In [None]:
from fastai.vision.all import *

In [None]:
import gc

def cleanup():
    """Release cached GPU memory held by PyTorch, then run Python's garbage collector."""
    # Step 1: hand PyTorch's cached CUDA blocks back to the driver.
    torch.cuda.empty_cache()
    # Step 2: collect unreachable Python objects.
    gc.collect()

In [None]:
# Augmentations handed to the DataLoaders as `batch_tfms` by PlantVisionLearner:
# a dihedral transform applied with probability 1.0, plus contrast / saturation /
# brightness jitter, each applied with probability 0.75.
image_augmentations = [
    DihedralItem(p=1.0),
    Contrast(max_lighting=0.5, p=0.75), # TODO: Pick better max_lighting
    Saturation(max_lighting=0.5, p=0.75),
    Brightness(max_lighting=0.5, p=0.75),
]

In [None]:
# Notes on PlantVisionLearner: set the notebook-level `fine_tune` flag to True on the
# first run, because that run creates the saved model and prediction files.
# On subsequent runs, set `fine_tune` to False to reuse the existing models and
# predictions, or keep it True to regenerate them.

class PlantVisionLearner:
    """Wrapper around a fastai ``vision_learner`` that regresses ``y_columns_mean`` from images.

    The network is trained on scaled targets (``training_df_standard`` or
    ``training_df_minmax``); predictions are mapped back to the original target
    scale with the matching fitted scaler before being returned.
    """

    def __init__(self, arch_name, scaler, arch_nickname=None, accum=1, resize=None, bs=64, patience=3, \
                 saved_model=False, saved_preds=False, saved_tta=False, model_load=True, opt_func=Adam,):
        """Configure the learner and, unless ``model_load`` is False, build it.

        Args:
            arch_name: architecture name passed to ``vision_learner``.
            scaler: ``"standard"`` or ``"minmax"``; selects the target scaler and the
                matching scaled training table.
            arch_nickname: name used in saved file names; defaults to ``arch_name``.
            accum: gradient-accumulation factor; the DataLoader batch size is ``bs // accum``.
            resize: if given, every image is resized with ``Resize(resize)``.
            bs: effective batch size (samples accumulated per optimizer step when ``accum > 1``).
            patience: early-stopping patience on the monitored ``r2_score``.
            saved_model: load weights from ``plant_<scaler>_<nickname>.pth`` after building.
            saved_preds: make ``get_valid_test_preds`` read its ``.npz`` file instead of predicting.
            saved_tta: make ``get_valid_test_tta`` read its ``.npz`` file instead of predicting.
            model_load: when False, only the names/flags are set; no DataLoaders or model are built.
            opt_func: optimizer factory passed to ``vision_learner``.

        Raises:
            ValueError: if ``scaler`` is not ``"standard"`` or ``"minmax"``.
        """
        scaler_choices = {
            "standard": (y_standard_scaler, training_df_standard),
            "minmax": (y_minmax_scaler, training_df_minmax),
        }
        # Fail early: an unknown name previously left `self.scaler` and `df` unset.
        if scaler not in scaler_choices:
            raise ValueError(f"scaler must be one of {sorted(scaler_choices)}, got {scaler!r}")
        self.scaler, df = scaler_choices[scaler]

        self.bs = bs
        self.arch_name = arch_name
        self.arch_nickname = arch_name if arch_nickname is None else arch_nickname

        self.saved_preds = saved_preds
        self.saved_tta = saved_tta

        self.saved_model_name = f"plant_{scaler}_{self.arch_nickname}"
        self.saved_preds_name = f"preds_{scaler}_{self.arch_nickname}"
        self.saved_tta_name = f"tta_{scaler}_{self.arch_nickname}"

        # Without a model only the cached-prediction getters are usable.
        if not model_load:
            return

        item_tfms = Resize(resize) if resize is not None else []

        self.training_images_dls = ImageDataLoaders.from_df(
            df, path=".",
            label_col=y_columns_mean,
            y_block=RegressionBlock(len(y_columns_mean)),
            item_tfms=item_tfms,
            batch_tfms=image_augmentations,
            bs=bs // accum
        )

        # Accumulate gradients until `bs` samples have been seen, so the effective
        # batch size stays `bs` although each step only loads `bs // accum` images.
        cbs = GradientAccumulation(self.bs) if accum > 1 else []

        self.learner = vision_learner(
            self.training_images_dls, arch_name, loss_func=mse,
            metrics=r2_score_fastai, n_out=len(y_columns_mean), cbs=cbs,
            opt_func=opt_func, model_dir=".",
        )

        # Stop when r2_score stalls and keep the best checkpoint on disk.
        self.fit_cbs = [
            EarlyStoppingCallback(monitor='r2_score', comp=np.greater, min_delta=0.0, patience=patience),
            SaveModelCallback(monitor='r2_score', comp=np.greater, min_delta=0.0,
                with_opt=True, fname=self.saved_model_name
            ),
        ]

        if saved_model:
            # NOTE(review): the device is hard-coded, so this assumes a CUDA device is available.
            load_model(f"{self.saved_model_name}.pth", self.learner, self.learner.opt, device="cuda:0")

    def fine_tune(self, *args, **kwargs):
        """Call ``learner.fine_tune`` with the early-stopping and checkpoint callbacks.

        Any ``cbs`` passed by the caller is replaced by ``self.fit_cbs``.
        """
        self.learner.fine_tune(*args, **dict(kwargs, cbs=self.fit_cbs))

    def predict(self, df, bs=64):
        """Return predictions for ``df`` on the original target scale.

        Args:
            df: table passed to ``learner.dls.test_dl``.
            bs: accepted for interface compatibility; currently unused, the test
                DataLoader is created without an explicit batch size.
        """
        dl = self.learner.dls.test_dl(df)
        preds_scaled, _ = self.learner.get_preds(dl=dl)
        preds = self.scaler.inverse_transform(preds_scaled)

        return preds

    def tta(self, df, bs=64):
        """Return test-time-augmentation (TTA) predictions for ``df`` on the original target scale.

        Args:
            df: table passed to ``learner.dls.test_dl``.
            bs: accepted for interface compatibility; currently unused.
        """
        dl = self.learner.dls.test_dl(df)
        tta_scaled, _ = self.learner.tta(dl=dl)
        # Fixed: this previously referenced the undefined name `preds_scaled`.
        tta = self.scaler.inverse_transform(tta_scaled)

        return tta

    def get_valid_test_preds(self, save_values=True, bs=64):
        """Return ``(valid, test)`` predictions for ``validation_df`` and ``testing_df``.

        When ``self.saved_preds`` is set, both arrays are read from
        ``<saved_preds_name>.npz``; otherwise they are computed with ``predict`` and,
        if ``save_values`` is True, written to that file.
        """
        if self.saved_preds:
            loaded = np.load(f"{self.saved_preds_name}.npz")
            valid = loaded["valid"]
            test = loaded["test"]
        else:
            valid = self.predict(validation_df, bs)
            test = self.predict(testing_df, bs)

            if save_values:
                np.savez(self.saved_preds_name,
                         valid=valid,
                         test=test)
        return valid, test

    def get_valid_test_tta(self, save_values=True, bs=64):
        """Return ``(valid, test)`` TTA predictions for ``validation_df`` and ``testing_df``.

        When ``self.saved_tta`` is set, both arrays are read from
        ``<saved_tta_name>.npz``; otherwise they are computed with ``tta`` and,
        if ``save_values`` is True, written to that file.
        """
        if self.saved_tta:
            loaded = np.load(f"{self.saved_tta_name}.npz")
            valid = loaded["valid"]
            test = loaded["test"]
        else:
            # Forward the caller's `bs` instead of a hard-coded 64.
            valid = self.tta(validation_df, bs=bs)
            test = self.tta(testing_df, bs=bs)

            if save_values:
                np.savez(self.saved_tta_name,
                         valid=valid,
                         test=test)
        return valid, test

    def save(self, *args, **kwargs):
        """Forward to ``learner.save``."""
        self.learner.save(*args, **kwargs)

## convnext_base.fb_in22k_ft_in1k (StandardScaler)

In [None]:
# ConvNeXt-Base on standardised targets. The model is built only when training;
# otherwise the saved predictions are reused.
convnext_base_in22k_ft_in1k_standard_vlearner = PlantVisionLearner(
    arch_name="convnext_base.fb_in22k_ft_in1k",
    scaler="standard",
    arch_nickname="convnext_base_in22k_ft_in1k",
    patience=4,
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only).
# NOTE(review): the training cell that follows passes a fixed 0.001 rather than lr_valley — confirm this is intentional.
if fine_tune:
    lr_valley, = convnext_base_in22k_ft_in1k_standard_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (18, 0.001); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    convnext_base_in22k_ft_in1k_standard_vlearner.fine_tune(18, 0.001)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_convnext_base_in22k_ft_in1k_standard, test_preds_convnext_base_in22k_ft_in1k_standard = \
    convnext_base_in22k_ft_in1k_standard_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df.values, validation_preds_convnext_base_in22k_ft_in1k_standard)

## convnext_large_in22k (StandardScaler)

In [None]:
# ConvNeXt-Large on standardised targets. The model is built only when training;
# otherwise the saved predictions are reused.
convnext_large_in22k_standard_vlearner = PlantVisionLearner(
    arch_name="convnext_large_in22k",
    scaler="standard",
    patience=4,
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = convnext_large_in22k_standard_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (18, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    convnext_large_in22k_standard_vlearner.fine_tune(18, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_convnext_large_in22k_standard, test_preds_convnext_large_in22k_standard = \
    convnext_large_in22k_standard_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_convnext_large_in22k_standard)

## convnext_large_in22k (MinMaxScaler)

In [None]:
# ConvNeXt-Large on min-max scaled targets.
# `saved_model` / `saved_preds` now follow the `fine_tune` flag like every other
# model section. They were hard-coded to True, which loaded an existing checkpoint
# and returned stale cached predictions even on a fresh training run.
convnext_large_in22k_minmax_vlearner = PlantVisionLearner(
    "convnext_large_in22k",
    "minmax",
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = convnext_large_in22k_minmax_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    convnext_large_in22k_minmax_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_convnext_large_in22k_minmax, test_preds_convnext_large_in22k_minmax = \
    convnext_large_in22k_minmax_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_convnext_large_in22k_minmax)

## vit_large_patch16_224 (StandardScaler)

In [None]:
# ViT-Large/16 on standardised targets; images are resized to 224 and gradients
# are accumulated over 2 steps (DataLoader batch size = bs // 2).
vit_large_patch16_224_standard_vlearner = PlantVisionLearner(
    arch_name="vit_large_patch16_224",
    scaler="standard",
    resize=224,
    accum=2,
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = vit_large_patch16_224_standard_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    vit_large_patch16_224_standard_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_vit_large_patch16_224_standard, test_preds_vit_large_patch16_224_standard = \
    vit_large_patch16_224_standard_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_vit_large_patch16_224_standard)

## vit_large_patch16_224 (MinMaxScaler)

In [None]:
# ViT-Large/16 on min-max scaled targets; images are resized to 224 and gradients
# are accumulated over 2 steps (DataLoader batch size = bs // 2).
vit_large_patch16_224_minmax_vlearner = PlantVisionLearner(
    arch_name="vit_large_patch16_224",
    scaler="minmax",
    resize=224,
    accum=2,
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = vit_large_patch16_224_minmax_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    vit_large_patch16_224_minmax_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_vit_large_patch16_224_minmax, test_preds_vit_large_patch16_224_minmax = \
    vit_large_patch16_224_minmax_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_vit_large_patch16_224_minmax)

## convnext_xlarge_in22k (StandardScaler)

In [None]:
# ConvNeXt-XLarge on standardised targets. The model is built only when training;
# otherwise the saved predictions are reused.
convnext_xlarge_in22k_standard_vlearner = PlantVisionLearner(
    arch_name="convnext_xlarge_in22k",
    scaler="standard",
    saved_preds=not fine_tune,
    saved_model=not fine_tune,
    model_load=fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = convnext_xlarge_in22k_standard_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    convnext_xlarge_in22k_standard_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_convnext_xlarge_in22k_standard, test_preds_convnext_xlarge_in22k_standard = \
    convnext_xlarge_in22k_standard_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_convnext_xlarge_in22k_standard)

## convnext_xlarge_in22k (MinMaxScaler)

In [None]:
# ConvNeXt-XLarge on min-max scaled targets. The model is built only when training;
# otherwise the saved predictions are reused.
convnext_xlarge_in22k_minmax_vlearner = PlantVisionLearner(
    arch_name="convnext_xlarge_in22k",
    scaler="minmax",
    saved_preds=not fine_tune,
    saved_model=not fine_tune,
    model_load=fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = convnext_xlarge_in22k_minmax_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    convnext_xlarge_in22k_minmax_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_convnext_xlarge_in22k_minmax, test_preds_convnext_xlarge_in22k_minmax = \
    convnext_xlarge_in22k_minmax_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_convnext_xlarge_in22k_minmax)

## mixer_l16_224.goog_in21k (MinMaxScaler)

In [None]:
# MLP-Mixer L/16 on min-max scaled targets, images resized to 224.
# With the default bs=64, accum=64 makes the DataLoader batch size 64 // 64 = 1.
mixer_l16_224_goog_in21k_minmax_vlearner = PlantVisionLearner(
    arch_name="hf_hub:timm/mixer_l16_224.goog_in21k",
    scaler="minmax",
    arch_nickname="mixer_l16_224_goog_in21k",
    resize=224,
    accum=64,
    model_load=fine_tune,
    saved_model=not fine_tune,
    saved_preds=not fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = mixer_l16_224_goog_in21k_minmax_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    mixer_l16_224_goog_in21k_minmax_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_mixer_l16_224_goog_in21k_minmax, test_preds_mixer_l16_224_goog_in21k_minmax = \
    mixer_l16_224_goog_in21k_minmax_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_mixer_l16_224_goog_in21k_minmax)

## mixer_l16_224.goog_in21k (StandardScaler)

In [None]:
# MLP-Mixer L/16 on standardised targets, images resized to 224 (no gradient accumulation).
mixer_l16_224_goog_in21k_standard_vlearner = PlantVisionLearner(
    arch_name="hf_hub:timm/mixer_l16_224.goog_in21k",
    scaler="standard",
    arch_nickname="mixer_l16_224_goog_in21k",
    resize=224,
    saved_preds=not fine_tune,
    saved_model=not fine_tune,
    model_load=fine_tune,
)

In [None]:
# Run the learning-rate finder (training runs only); its suggestion is used by the next cell.
if fine_tune:
    lr_valley, = mixer_l16_224_goog_in21k_standard_vlearner.learner.lr_find()

In [None]:
# Fine-tune with arguments (12, lr_valley); the wrapper adds early stopping and best-checkpoint saving.
if fine_tune:
    mixer_l16_224_goog_in21k_standard_vlearner.fine_tune(12, lr_valley)

In [None]:
# Validation and test predictions (computed and saved to .npz, or loaded when saved_preds is set).
validation_preds_mixer_l16_224_goog_in21k_standard, test_preds_mixer_l16_224_goog_in21k_standard = \
    mixer_l16_224_goog_in21k_standard_vlearner.get_valid_test_preds(save_values=True)

In [None]:
# R² of this model on the hold-out validation split.
r2_score(validation_Y_df, validation_preds_mixer_l16_224_goog_in21k_standard)

## And now, combine the predictions (linear stacking and weighted averaging)

In [None]:
# Validation predictions of the models included in the ensemble (commented entries are excluded).
# The order must match the `test_preds` list in the "Test submission" section, because
# the stacking model is fitted on these columns and then applied to the test columns.
validation_preds = [
    validation_preds_lgb,
    # validation_preds_xgb,
    validation_preds_convnext_base_in22k_ft_in1k_standard,
    validation_preds_convnext_large_in22k_standard,
    # validation_preds_convnext_large_in22k_minmax,
    validation_preds_vit_large_patch16_224_standard,
    validation_preds_vit_large_patch16_224_minmax,
    validation_preds_convnext_xlarge_in22k_standard,
    validation_preds_convnext_xlarge_in22k_minmax,
    # validation_preds_mixer_l16_224_goog_in21k_standard,
    validation_preds_mixer_l16_224_goog_in21k_minmax,
]

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.metrics import r2_score
import numpy as np

# Stacking: every model's per-target predictions become input features of a linear
# regression that is fitted against the true validation targets.
validation_preds_colstack = np.column_stack(validation_preds)

# Final stacker, fitted on the whole validation set.
linear_model = LinearRegression().fit(validation_preds_colstack, validation_Y_df.values)

# 5-fold cross-validation of the same estimator, to gauge how well the stacker generalises.
cv_results = cross_validate(
    LinearRegression(),
    validation_preds_colstack,
    validation_Y_df.values,
    cv=5,
)

cv_results

In [None]:
# Stacked predictions on the same validation data the stacker was fitted on.
linreg_validation_preds = linear_model.predict(validation_preds_colstack)

In [None]:
# In-sample R² of the stacker (it was fitted on these rows); see cv_results for out-of-fold scores.
r2_score(validation_Y_df.values, linreg_validation_preds)

In [None]:
import numpy as np
from sklearn.metrics import r2_score
from scipy.optimize import minimize

# Alternative ensemble: one scalar weight per model, shared by all targets.
validation_preds_stack = np.stack(validation_preds, axis=0)

def weighted_average_maximize(weights):
    """Objective for ``scipy.optimize.minimize``: negative R² of the weighted combination.

    Args:
        weights: one weight per model, in the order of ``validation_preds``.
            The weights are unconstrained (not normalised, may be negative).

    Returns:
        ``-r2_score`` of the weighted sum of model predictions against the
        validation targets, so that minimising it maximises R².
    """
    weighted_avg_pred = np.tensordot(np.asarray(weights), validation_preds_stack, axes=(0, 0))

    return -r2_score(validation_Y_df.values, weighted_avg_pred)

# Seeded generator so the starting point, and hence the optimised weights,
# are reproducible across kernel restarts (same seed as the outer data split).
init_weights = np.random.default_rng(20877679).uniform(
    -4, 4, size=validation_preds_stack.shape[0]
)

result = minimize(weighted_average_maximize, init_weights)

weights = result["x"]

weights

In [None]:
# Combine the model predictions along the model axis with the optimised weights.
weighted_validation_preds = np.tensordot(weights, validation_preds_stack, axes=(0, 0))

In [None]:
# In-sample R² of the weighted combination (the weights were optimised on these rows).
r2_score(validation_Y_df.values, weighted_validation_preds)

In [None]:
from sklearn.metrics import r2_score
import matplotlib.pyplot as plt

# Per-target R² and actual-vs-predicted scatter for the linear stacker.
# The targets are read from validation_Y_df directly, so this cell does not depend on
# `validation_actual` having been defined by one of the model sections.
for i, col in enumerate(y_columns_mean):
    actual = validation_Y_df.values[:, i]
    predicted = linreg_validation_preds[:, i]

    print(col, r2_score(actual, predicted))

    fig, ax = plt.subplots()
    ax.scatter(actual, predicted)
    ax.set(title=f"{col}: actual vs. stacked prediction", xlabel="Actual", ylabel="Predicted")
    plt.show()

## Test submission

In [None]:
# Test predictions of the models included in the ensemble (commented entries are excluded).
# The order must match the `validation_preds` list, because the stacking model fitted on
# the validation columns is applied to these columns.
test_preds = [
    test_preds_lgb,
    # test_preds_xgb,
    test_preds_convnext_base_in22k_ft_in1k_standard,
    test_preds_convnext_large_in22k_standard,
    # test_preds_convnext_large_in22k_minmax,
    test_preds_vit_large_patch16_224_standard,
    test_preds_vit_large_patch16_224_minmax,
    test_preds_convnext_xlarge_in22k_standard,
    test_preds_convnext_xlarge_in22k_minmax,
    # test_preds_mixer_l16_224_goog_in21k_standard,
    test_preds_mixer_l16_224_goog_in21k_minmax,
]

In [None]:
test_preds_colstack = np.column_stack(test_preds)

test_preds = linear_model.predict(test_preds_colstack)

In [None]:
X_test_id = original_testing_df.id

In [None]:
submit_df = pd.DataFrame(test_preds, columns=y_columns)
submit_df["id"] = X_test_id
submit_df = submit_df[["id", *y_columns]]
submit_df.to_csv('submission.csv', index=False)

submit_df