In [None]:
import os
os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'python'

import gc
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tqdm.auto import tqdm
import warnings

# Suppress every warning category for the rest of the session. This keeps cell
# output short but also hides deprecation / numerical warnings from all libraries.
warnings.filterwarnings('ignore')

# Filesystem locations: competition data, writable output directory, and the
# SigLIP model files that are loaded later in the notebook.
DATA_DIR = '/kaggle/input/csiro-biomass'
WORKING_DIR = '/kaggle/working'
SIGLIP_PATH = '/kaggle/input/siglip/keras/siglip_so400m_patch14_384/1'

# Seed NumPy's global RNG. SEED is also passed explicitly to PCA, KFold and the
# regressors further down.
SEED = 42
np.random.seed(SEED)

# Regression targets. The order matters: prediction matrices use one column per
# entry in this order, and post_process_predictions addresses those columns by
# position (0 = clover, 1 = dead, 2 = green, 3 = total, 4 = GDM).
TARGET_COLS = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']

print('imports done')

In [None]:
# Read the long-format competition tables (one row per image/target pair).
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

# Long -> wide: one row per image and one column per target name. When an
# (image, target) pair occurs more than once the first value is kept.
pivot = train_df.pivot_table(
    index='image_path',
    columns='target_name',
    values='target',
    aggfunc='first',
).reset_index()

# Make sure every expected target column exists (NaN when the target never
# appears in train.csv), then pin the column order to TARGET_COLS.
absent_targets = [name for name in TARGET_COLS if name not in pivot.columns]
for name in absent_targets:
    pivot[name] = np.nan
pivot = pivot.loc[:, ['image_path', *TARGET_COLS]]

# Per-image metadata: every column except the per-sample ones, taking the
# first value seen for each image.
per_sample_cols = ('sample_id', 'target_name', 'target')
meta_cols = [col for col in train_df.columns if col not in per_sample_cols]
meta_first = train_df.groupby('image_path', as_index=False)[meta_cols].first()

# Metadata + wide targets, plus the absolute path of each image file.
train_data = pd.merge(meta_first, pivot, on='image_path', how='left')
train_data['full_image_path'] = [
    os.path.join(DATA_DIR, rel_path) for rel_path in train_data['image_path']
]

# One row per distinct test image, again with its absolute path.
test_images = (
    test_df
    .groupby('image_path')
    .first()
    .reset_index()
)
test_images['full_image_path'] = [
    os.path.join(DATA_DIR, rel_path) for rel_path in test_images['image_path']
]

print('train_data', train_data.shape)
print('test_images', test_images.shape)

In [None]:
# keras_hub is only referenced below for its version string.
# NOTE(review): presumably importing it also makes the KerasHub model classes
# available to load_model when it deserializes the SigLIP checkpoint - confirm.
import keras_hub
import keras

# Record the library versions in the cell output for reproducibility.
print('keras version:', keras.__version__)
print('keras_hub version:', keras_hub.__version__)

# Load the saved SigLIP model from SIGLIP_PATH. compile=False skips restoring
# any optimizer/loss state; the model is only used for inference here.
print('\nLoading SigLIP model...')
siglip_model = keras.saving.load_model(SIGLIP_PATH, compile=False)
print('SigLIP loaded successfully')
print('Model type:', type(siglip_model))

In [None]:
# Side length (pixels) that every image is resized to before it is fed to the model.
IMG_SIZE = 384
# Per-channel (R, G, B) mean / std applied to images after scaling to [0, 1].
# NOTE(review): these are the ImageNet statistics; SigLIP checkpoints are
# normally preprocessed with mean = std = 0.5 per channel - confirm that the
# ImageNet values are intended for this model.
IMAGENET_MEAN = np.array([0.485, 0.456, 0.406])
IMAGENET_STD = np.array([0.229, 0.224, 0.225])

def preprocess_image(image_path):
    """Read an image file and convert it into a normalized one-image batch.

    The file is loaded with OpenCV, converted from BGR to RGB, resized to
    ``IMG_SIZE`` x ``IMG_SIZE``, scaled to [0, 1] and standardized per channel
    with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.

    Args:
        image_path: Path of the image file to read.

    Returns:
        A float32 array with a leading batch axis of size 1. When OpenCV
        cannot read the file, an all-zero array of shape
        ``(1, IMG_SIZE, IMG_SIZE, 3)`` is returned instead of raising.
    """
    bgr = cv2.imread(image_path)
    if bgr is not None:
        rgb = cv2.resize(cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB), (IMG_SIZE, IMG_SIZE))
        unit_range = rgb.astype(np.float32) / 255.0
        # The mean/std arrays are float64, so cast back to float32 at the end.
        standardized = (unit_range - IMAGENET_MEAN) / IMAGENET_STD
        return standardized[np.newaxis, ...].astype(np.float32)

    # Unreadable file: fall back to a blank batch of the expected shape.
    return np.zeros((1, IMG_SIZE, IMG_SIZE, 3), dtype=np.float32)

def extract_embedding(image_path, model):
    """Compute a 1-D float32 embedding for a single image.

    The image is preprocessed with ``preprocess_image`` and passed through
    ``model.vision_encoder`` when the model has that attribute, otherwise
    through ``model`` itself (both called with ``training=False``).

    Output handling:
      * dict outputs: ``'image_embedding'`` is preferred, then
        ``'pooled_output'``, otherwise the first value of the dict;
      * 4-D arrays are averaged over axes 1 and 2, 3-D arrays contribute
        their ``[0, 0]`` vector, 2-D arrays their first row, and anything
        else is flattened.

    Args:
        image_path: Path of the image file to embed.
        model: Callable model, optionally exposing a ``vision_encoder``.

    Returns:
        A float32 NumPy vector, or ``None`` if any step raised. In that case
        the error and its traceback are printed rather than propagated.
    """
    try:
        batch = preprocess_image(image_path)

        # Use the dedicated vision tower when the model exposes one.
        encoder = model.vision_encoder if hasattr(model, 'vision_encoder') else model
        raw = encoder(batch, training=False)

        if not isinstance(raw, dict):
            feats = raw
        else:
            for key in ('image_embedding', 'pooled_output'):
                if key in raw:
                    feats = raw[key]
                    break
            else:
                # No known key: fall back to whatever the model lists first.
                feats = list(raw.values())[0]

        # Framework tensors expose .numpy(); anything else goes through np.array.
        feats = feats.numpy() if hasattr(feats, 'numpy') else np.array(feats)

        rank = len(feats.shape)
        if rank == 4:
            vector = np.mean(feats, axis=(1, 2))[0]
        elif rank == 3:
            vector = feats[0, 0]
        elif rank == 2:
            vector = feats[0]
        else:
            vector = feats.flatten()

        return vector.astype(np.float32)
    except Exception as exc:
        print(f'Error processing {image_path}: {exc}')
        import traceback
        traceback.print_exc()
        return None

# Smoke test: embed the first training image to confirm the pipeline works and
# to learn the embedding width (EMB_DIM) used for zero-filled fallbacks later.
print('Testing embedding extraction...')
first_image_path = train_data['full_image_path'].values[0]
test_emb = extract_embedding(first_image_path, siglip_model)

# Abort early: nothing downstream can run without a working extractor.
if test_emb is None:
    print('Embedding extraction failed!')
    raise RuntimeError('Could not extract embeddings')

print(f'Embedding shape: {test_emb.shape}')
EMB_DIM = test_emb.shape[0]

In [None]:
def _embed_paths(paths, model, emb_dim):
    """Embed every image in ``paths`` and stack the results row by row.

    Args:
        paths: Iterable of image file paths.
        model: Model forwarded to ``extract_embedding``.
        emb_dim: Length of the zero vector substituted when an image fails.

    Returns:
        A 2-D array with one embedding per path, in input order.

    Raises:
        ValueError: From ``np.stack`` if ``paths`` is empty.
    """
    vectors = []
    n_failed = 0
    for path in tqdm(paths):
        emb = extract_embedding(path, model)
        if emb is None:
            # Best effort: keep rows aligned with the dataframe by substituting
            # zeros, but count the failures so they do not go unnoticed.
            emb = np.zeros(emb_dim, dtype=np.float32)
            n_failed += 1
        vectors.append(emb)
    if n_failed:
        print(f'WARNING: {n_failed} of {len(vectors)} images failed; zero embeddings used')
    return np.stack(vectors)


print('Extracting train embeddings...')
train_embeddings = _embed_paths(train_data['full_image_path'].values, siglip_model, EMB_DIM)
print('train_embeddings', train_embeddings.shape)

print('Extracting test embeddings...')
test_embeddings = _embed_paths(test_images['full_image_path'].values, siglip_model, EMB_DIM)
print('test_embeddings', test_embeddings.shape)

# The encoder is not needed past this point; drop the reference and collect so
# its memory can be reclaimed. Re-running this cell afterwards requires
# reloading the model first.
del siglip_model
gc.collect()

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import KFold
from sklearn.ensemble import HistGradientBoostingRegressor

# CatBoost and LightGBM are optional: when either cannot be imported the
# ensemble simply runs without it. `except Exception` (rather than a bare
# `except`) keeps that best-effort behaviour for import and shared-library
# errors while letting KeyboardInterrupt / SystemExit propagate.
try:
    from catboost import CatBoostRegressor
    HAS_CATBOOST = True
except Exception:
    HAS_CATBOOST = False

try:
    from lightgbm import LGBMRegressor
    HAS_LGBM = True
except Exception:
    HAS_LGBM = False

print(f'CatBoost: {HAS_CATBOOST}, LightGBM: {HAS_LGBM}')

# Standardize each embedding dimension. The statistics are fit on the training
# embeddings only and then re-used for the test embeddings.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(train_embeddings)
X_test_scaled = scaler.transform(test_embeddings)

# Reduce dimensionality with PCA. A float n_components asks scikit-learn to keep
# enough components to explain that fraction (95%) of the training variance.
# The projection is fit on train and applied unchanged to test.
pca = PCA(n_components=0.95, random_state=SEED)
X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca = pca.transform(X_test_scaled)
print(f'PCA components: {X_train_pca.shape[1]}')

# Target matrix: one row per training image, one column per entry of TARGET_COLS.
# NOTE(review): targets missing from train.csv were filled with NaN when the
# pivot table was built, so this matrix may contain NaN - confirm.
y_train = train_data[TARGET_COLS].values.astype(np.float32)
print('y_train', y_train.shape)

In [None]:
# Shared 5-fold shuffled splitter with a fixed seed; train_predict_target reads
# this module-level object to build its per-fold training sets.
kfold = KFold(n_splits=5, shuffle=True, random_state=SEED)

def train_predict_target(X_train, y_train, X_test, target_idx, target_name):
    """Fit a per-fold model ensemble for one target and average its test predictions.

    For every split of the module-level ``kfold`` splitter, a
    ``HistGradientBoostingRegressor`` (plus CatBoost and LightGBM when
    ``HAS_CATBOOST`` / ``HAS_LGBM`` are set) is trained on that fold's training
    rows and used to predict ``X_test``. Predictions are averaged across the
    models of a fold and then across folds. The held-out rows of each fold are
    not scored; the folds only vary the training subset.

    Rows whose target value is not finite (NaN / inf) are removed from each
    fold's training set so that missing labels cannot break the fit.

    Args:
        X_train: 2-D feature array, one row per training sample.
        y_train: 2-D target array, one column per target.
        X_test: 2-D feature array to predict.
        target_idx: Column of ``y_train`` to model.
        target_name: Human-readable target name, used in error messages.

    Returns:
        1-D array with one averaged prediction per row of ``X_test``.

    Raises:
        ValueError: If a fold has no training row with a finite target.
    """
    y_target = np.asarray(y_train)[:, target_idx]
    has_label = np.isfinite(y_target)

    fold_preds = []

    for fold, (tr_idx, _) in enumerate(kfold.split(X_train)):
        # Keep only training rows that actually carry a label for this target.
        tr_idx = tr_idx[has_label[tr_idx]]
        if len(tr_idx) == 0:
            raise ValueError(
                f'No finite training targets for {target_name!r} in fold {fold}'
            )
        X_tr, y_tr = X_train[tr_idx], y_target[tr_idx]

        model_preds = []

        hist = HistGradientBoostingRegressor(
            max_iter=500, learning_rate=0.05, max_depth=6,
            l2_regularization=0.5, random_state=SEED + fold
        )
        hist.fit(X_tr, y_tr)
        model_preds.append(hist.predict(X_test))

        if HAS_CATBOOST:
            cat = CatBoostRegressor(
                iterations=800, learning_rate=0.05, depth=6,
                l2_leaf_reg=0.5, random_seed=SEED + fold, verbose=0
            )
            cat.fit(X_tr, y_tr)
            model_preds.append(cat.predict(X_test))

        if HAS_LGBM:
            lgbm = LGBMRegressor(
                n_estimators=800, learning_rate=0.05, max_depth=6,
                reg_lambda=0.5, random_state=SEED + fold, verbose=-1
            )
            lgbm.fit(X_tr, y_tr)
            model_preds.append(lgbm.predict(X_test))

        # Equal-weight blend of whichever models are available for this fold.
        fold_preds.append(np.mean(model_preds, axis=0))

    # Average the per-fold blends into the final prediction.
    return np.mean(fold_preds, axis=0)

print('Training models for each target...')

# Test-set prediction matrix: one row per test image, one float32 column per
# target, filled column by column in TARGET_COLS order.
n_test_rows, n_targets = len(X_test_pca), len(TARGET_COLS)
all_preds = np.zeros((n_test_rows, n_targets), dtype=np.float32)

for i, target_name in enumerate(TARGET_COLS):
    print(f'  {target_name}...')
    all_preds[:, i] = train_predict_target(
        X_train_pca, y_train, X_test_pca, i, target_name
    )

print('Training complete')

In [None]:
def post_process_predictions(preds):
    """Clip predictions to be non-negative and derive the dependent columns.

    Columns are addressed by position: 0 = clover, 1 = dead, 2 = green,
    3 = total, 4 = GDM. After clipping at zero, clover is forced to 0, GDM is
    set equal to green, and total is set to green + dead; the model outputs
    for those three columns are discarded.

    Args:
        preds: 2-D array with at least five columns.

    Returns:
        A new array of the same shape; the input is left untouched.
    """
    clover_col, dead_col, green_col, total_col, gdm_col = range(5)

    # np.maximum allocates a fresh array, so the caller's data is not modified.
    cleaned = np.maximum(preds, 0.0)

    green_g = cleaned[:, green_col]
    dead_g = cleaned[:, dead_col]

    cleaned[:, total_col] = green_g + dead_g
    cleaned[:, gdm_col] = green_g
    cleaned[:, clover_col] = 0.0

    return cleaned

# Apply the non-negativity / consistency rules to the raw model outputs.
final_preds = post_process_predictions(all_preds)

# Wide per-image prediction table: image_path plus one column per target.
preds_df = pd.DataFrame(
    {
        'image_path': test_images['image_path'].values,
        **{c: final_preds[:, i] for i, c in enumerate(TARGET_COLS)},
    }
)

# Attach each image's predictions to every (sample_id, target_name) row of test.csv.
merged = test_df.merge(preds_df, on='image_path', how='left')

# For every row, select the prediction column named by that row's target_name.
# This is done with one vectorized lookup instead of a Python-level row loop.
target_pos = merged['target_name'].map({c: i for i, c in enumerate(TARGET_COLS)})
unknown_targets = merged.loc[target_pos.isna(), 'target_name'].unique()
if len(unknown_targets):
    raise KeyError(f'Unexpected target_name values in test.csv: {list(unknown_targets)}')

wide_preds = merged[TARGET_COLS].to_numpy(dtype=np.float64)
row_pos = np.arange(len(merged))
picked = wide_preds[row_pos, target_pos.to_numpy(dtype=np.int64)]

# Building the frame from columns keeps the 'sample_id' / 'target' header even
# when test.csv has no rows. np.fmax clamps at zero and maps NaN to 0.0, which
# is what max(0.0, value) does for a NaN value.
sub = pd.DataFrame(
    {
        'sample_id': merged['sample_id'].to_numpy(),
        'target': np.fmax(picked, 0.0),
    }
)
sub.to_csv(f'{WORKING_DIR}/submission.csv', index=False)
print(sub.head())
print('saved', f'{WORKING_DIR}/submission.csv')