In [None]:
import os
import gc
import sys
import numpy as np
import pandas as pd
import cv2
from PIL import Image
from tqdm.auto import tqdm
import warnings

from google.protobuf import message_factory
# Compatibility shim: if the installed protobuf's MessageFactory has no
# `GetPrototype` method, install one that simply forwards to `GetMessageClass`.
# NOTE(review): presumably needed because a downstream import still calls the
# old `GetPrototype` name -- confirm which package requires it. The patch is
# applied before `torch` / `transformers` are imported below.
if not hasattr(message_factory.MessageFactory, 'GetPrototype'):
    def _GetPrototype(self, descriptor):
        # Delegate to the method that is available on this protobuf version.
        return self.GetMessageClass(descriptor)
    message_factory.MessageFactory.GetPrototype = _GetPrototype

import torch
from transformers import AutoModel, AutoImageProcessor, AutoTokenizer

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
# NOTE(review): presumably silences the HF tokenizers fork/parallelism notice -- confirm.
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Kaggle dataset / output locations and the local SigLIP checkpoint directory.
DATA_DIR = '/kaggle/input/csiro-biomass'
WORKING_DIR = '/kaggle/working'
SIGLIP_PATH = '/kaggle/input/google-siglip-so400m-patch14-384/transformers/default/1'

# Seed NumPy's global RNG and PyTorch for repeatable runs.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Names of the five regression targets; the order is reused wherever
# predictions are stored as an (n_samples, 5) array.
TARGET_NAMES = ['Dry_Clover_g', 'Dry_Dead_g', 'Dry_Green_g', 'Dry_Total_g', 'GDM_g']
# Per-target scale constants: training targets are divided by these values and
# predictions are multiplied back by them.
# NOTE(review): presumably the observed maximum of each target -- confirm
# against the training data.
TARGET_MAX = {
    'Dry_Clover_g': 71.7865,
    'Dry_Dead_g': 83.8407,
    'Dry_Green_g': 157.9836,
    'Dry_Total_g': 185.70,
    'GDM_g': 157.9836,
}

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('device:', device)

In [None]:
def pivot_table(df):
    """Reshape a long table (one row per image/target) into one row per image.

    Parameters
    ----------
    df : pandas.DataFrame
        Long-format table with at least ``image_path`` and ``target_name``.
        When a ``target`` column is present the frame must also carry
        ``Sampling_Date``, ``State``, ``Species``, ``Pre_GSHH_NDVI`` and
        ``Height_Ave_cm``; these are kept as identifying columns.

    Returns
    -------
    pandas.DataFrame
        Wide table with one column per distinct ``target_name``. For labelled
        input the cells hold the mean ``target``; for unlabelled input they
        hold placeholder zeros.

    Notes
    -----
    The input frame is never modified: the unlabelled branch pivots a copy
    that carries the placeholder ``target`` column instead of writing that
    column into the caller's DataFrame.
    """
    if 'target' in df.columns:
        return pd.pivot_table(
            df, values='target',
            index=['image_path', 'Sampling_Date', 'State', 'Species', 'Pre_GSHH_NDVI', 'Height_Ave_cm'],
            columns='target_name', aggfunc='mean'
        ).reset_index()

    # No labels available: pivot a zero-filled copy so the wide frame still
    # has one column per target name.
    placeholder = df.assign(target=0)
    return pd.pivot_table(
        placeholder, values='target', index='image_path',
        columns='target_name', aggfunc='mean'
    ).reset_index()

# --- Load the long-format CSVs and reshape them to one row per image ---
train_long = pd.read_csv(f'{DATA_DIR}/train.csv')
test_long = pd.read_csv(f'{DATA_DIR}/test.csv')
train_df, test_df = pivot_table(train_long), pivot_table(test_long)

# Prefix every relative image path with the dataset root.
train_df['image_path'] = train_df['image_path'].map(lambda rel: os.path.join(DATA_DIR, rel))
test_df['image_path'] = test_df['image_path'].map(lambda rel: os.path.join(DATA_DIR, rel))

# --- Assign cross-validation folds ---
# Folds are stratified on `State` and grouped on `Sampling_Date`, so all rows
# that share a sampling date land in the same fold.
from sklearn.model_selection import StratifiedGroupKFold
sgkf = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=SEED)
train_df['fold'] = -1
fold_splits = sgkf.split(train_df, train_df['State'], groups=train_df['Sampling_Date'])
for fold, (_, val_idx) in enumerate(fold_splits):
    # val_idx holds row positions; train_df carries a fresh 0..n-1 index after
    # the pivot's reset_index, so label-based .loc addresses the same rows.
    train_df.loc[val_idx, 'fold'] = fold

print('train_df', train_df.shape)
print('test_df', test_df.shape)
print('Folds:', train_df['fold'].value_counts().sort_index().tolist())

In [None]:
print('Loading SigLIP')
# Load the checkpoint from the local directory only, then put the network in
# inference mode on the selected device.
model = AutoModel.from_pretrained(SIGLIP_PATH, local_files_only=True)
model = model.eval().to(device)
# Matching image pre-processor for the same checkpoint.
processor = AutoImageProcessor.from_pretrained(SIGLIP_PATH)
print('SigLIP loaded')

In [None]:
def split_image(image, patch_size=520, overlap=16):
    """Cut an H x W x C array into square tiles on a regular grid.

    Tiles start every ``patch_size - overlap`` pixels along both axes. A tile
    that runs past the bottom or right border is reflect-padded back up to
    ``patch_size`` x ``patch_size``.

    Returns
    -------
    (patches, coords)
        ``patches`` is a list of ``patch_size`` x ``patch_size`` x C arrays in
        row-major grid order. ``coords`` is the matching list of
        ``(y1, x1, y2, x2)`` tuples, where ``y2``/``x2`` are the *unclipped*
        ends and may therefore exceed the image size.
    """
    height, width, _ = image.shape
    step = patch_size - overlap
    patches, coords = [], []
    for top in range(0, height, step):
        bottom = top + patch_size
        for left in range(0, width, step):
            right = left + patch_size
            tile = image[top:bottom, left:right, :]
            # Slicing silently truncates at the border; work out how much is missing.
            missing_rows = patch_size - tile.shape[0]
            missing_cols = patch_size - tile.shape[1]
            if missing_rows > 0 or missing_cols > 0:
                tile = np.pad(
                    tile,
                    ((0, missing_rows), (0, missing_cols), (0, 0)),
                    mode='reflect',
                )
            patches.append(tile)
            coords.append((top, left, bottom, right))
    return patches, coords

def compute_embeddings(df, patch_size=520):
    """Append one mean-pooled SigLIP image embedding per row of ``df``.

    Each image is read from ``df['image_path']``, tiled with ``split_image``,
    every tile is encoded with the module-level ``model`` / ``processor`` on
    ``device``, and the tile features are averaged into a single vector.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain an ``image_path`` column.
    patch_size : int, default 520
        Tile edge length forwarded to ``split_image``.

    Returns
    -------
    pandas.DataFrame
        ``df`` left-merged on ``image_path`` with columns ``emb1`` .. ``embN``.

    Notes
    -----
    * OpenCV decodes images in BGR channel order, while PIL and the image
      processor treat the array as RGB, so the image is converted to RGB
      before tiling; otherwise red and blue are swapped.
    * An unreadable image is replaced by a black 1000 x 2000 frame so the run
      can continue; the path is printed so the substitution is visible.
    """
    image_paths, pooled = [], []
    for img_path in tqdm(df['image_path'], total=len(df)):
        img = cv2.imread(img_path, cv2.IMREAD_COLOR)
        if img is None:
            # Best-effort fallback, but make it visible instead of silent.
            print(f'[compute_embeddings] could not read {img_path}; using a blank image')
            img = np.zeros((1000, 2000, 3), dtype=np.uint8)
        else:
            # cv2.imread yields BGR; everything downstream expects RGB.
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        patches, _ = split_image(img, patch_size=patch_size)
        images = [Image.fromarray(p).convert('RGB') for p in patches]
        inputs = processor(images=images, return_tensors='pt').to(device)
        with torch.no_grad():
            features = model.get_image_features(**inputs)
        # Average over the tile axis to get one vector per image.
        pooled.append(features.mean(dim=0).detach().cpu().numpy())
        image_paths.append(img_path)

    embeddings = np.stack(pooled, axis=0)
    emb_columns = [f'emb{i+1}' for i in range(embeddings.shape[1])]
    emb_df = pd.DataFrame(embeddings, columns=emb_columns)
    emb_df['image_path'] = image_paths
    return df.merge(emb_df, on='image_path', how='left')

# Replace train_df / test_df with versions that carry the emb* columns.
# NOTE(review): both names are rebound to the merged result, so re-running
# this cell without re-creating the frames would merge a second set of emb*
# columns onto frames that already have them -- confirm it is only run once
# per kernel session.
print('Computing train embeddings...')
train_df = compute_embeddings(train_df, patch_size=520)
print('Computing test embeddings...')
test_df = compute_embeddings(test_df, patch_size=520)
print('Done')

In [None]:
# Build "semantic" features: similarity of every image embedding to a handful
# of text concepts encoded with the same SigLIP checkpoint, plus a few ratios.
print('Generating semantic features...')
tokenizer = AutoTokenizer.from_pretrained(SIGLIP_PATH)

# Concept name -> list of text prompts describing that concept.
concept_groups = {
    'bare': ['bare soil', 'dirt ground', 'sparse vegetation', 'exposed earth'],
    'sparse': ['low density pasture', 'thin grass', 'short clipped grass'],
    'medium': ['average pasture cover', 'medium height grass', 'grazed pasture'],
    'dense': ['dense tall pasture', 'thick grassy volume', 'high biomass', 'overgrown vegetation'],
    'green': ['lush green vibrant pasture', 'photosynthesizing leaves', 'fresh growth'],
    'dead': ['dry brown dead grass', 'yellow straw', 'senesced material', 'standing hay'],
    'clover': ['white clover', 'trifolium repens', 'broadleaf legume', 'clover flowers'],
    'grass': ['ryegrass', 'blade-like leaves', 'fescue', 'grassy sward'],
    'weeds': ['broadleaf weeds', 'thistles', 'non-pasture vegetation']
}

# One (1, dim) vector per concept: each prompt embedding is L2-normalised and
# the normalised vectors are averaged (the mean itself is not re-normalised).
concept_vectors = {}
with torch.no_grad():
    for name, prompts in concept_groups.items():
        # NOTE(review): padding='max_length' is presumably required to match
        # how the SigLIP text tower was trained -- confirm in the model docs.
        inputs = tokenizer(prompts, padding='max_length', return_tensors='pt').to(device)
        emb = model.get_text_features(**inputs)
        emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
        concept_vectors[name] = emb.mean(dim=0, keepdim=True)

# Stack train rows first, then test rows, so the result can be split back by
# position at n_train below.
emb_cols = [c for c in train_df.columns if c.startswith('emb')]
X_all_emb = np.vstack([train_df[emb_cols].values, test_df[emb_cols].values])
img_tensor = torch.tensor(X_all_emb, dtype=torch.float32).to(device)
# L2-normalise every image embedding (row-wise).
img_tensor = img_tensor / img_tensor.norm(p=2, dim=-1, keepdim=True)

# Dot product of each normalised image embedding with each concept vector.
scores = {}
for name, vec in concept_vectors.items():
    scores[name] = torch.matmul(img_tensor, vec.T).cpu().numpy().flatten()

# Column order here (9 concept scores followed by the 4 derived columns)
# defines the layout of the semantic feature matrix.
df_scores = pd.DataFrame(scores)
# The 1e-6 term only guards against an exactly-zero denominator.
# NOTE(review): the scores are dot products and are not guaranteed to be
# positive, so these denominators can be near zero or negative -- confirm the
# ratios behave sensibly on the real data.
df_scores['ratio_greenness'] = df_scores['green'] / (df_scores['green'] + df_scores['dead'] + 1e-6)
df_scores['ratio_clover'] = df_scores['clover'] / (df_scores['clover'] + df_scores['grass'] + 1e-6)
df_scores['ratio_cover'] = (df_scores['dense'] + df_scores['medium']) / (df_scores['bare'] + df_scores['sparse'] + 1e-6)
df_scores['max_density'] = df_scores[['bare', 'sparse', 'medium', 'dense']].max(axis=1)

# Split the stacked matrix back into its train and test parts by row position.
sem_all = df_scores.values
n_train = len(train_df)
sem_train = sem_all[:n_train]
sem_test = sem_all[n_train:]
print('Semantic features:', sem_train.shape)

In [None]:
# Release the SigLIP objects now that all embeddings have been computed.
del model, processor, tokenizer
# Collect first so that objects only reachable through reference cycles are
# actually freed; empty_cache() can only hand back memory that no live tensor
# occupies, so it runs afterwards.
gc.collect()
torch.cuda.empty_cache()
print('Cleaned up')

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import GradientBoostingRegressor, HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from copy import deepcopy

class SupervisedEmbeddingEngine:
    """Turn raw image embeddings into a compact feature matrix.

    The output of :meth:`transform` is the horizontal concatenation of:

    1. PCA scores of the standardised embeddings,
    2. PLS scores (only if targets were supplied to :meth:`fit`),
    3. Gaussian-mixture posterior probabilities,
    4. standardised semantic features (only if ``X_semantic`` is passed).

    Parameters
    ----------
    n_pca : float or int, default 0.80
        Passed to ``PCA(n_components=...)``.
    n_pls : int, default 8
        Number of PLS components.
    n_gmm : int, default 6
        Number of diagonal-covariance mixture components.
    random_state : int, default 42
        Seed for PCA and the mixture model.
    """

    def __init__(self, n_pca=0.80, n_pls=8, n_gmm=6, random_state=42):
        self.n_pca = n_pca
        self.n_pls = n_pls
        self.n_gmm = n_gmm
        self.random_state = random_state
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=n_pca, random_state=random_state)
        self.pls = PLSRegression(n_components=n_pls, scale=False)
        self.gmm = GaussianMixture(n_components=n_gmm, covariance_type='diag', random_state=random_state)
        self.pls_fitted_ = False
        # Standardisation statistics for the semantic block, learned in fit().
        self.sem_mean_ = None
        self.sem_scale_ = None

    def fit(self, X, y=None, X_semantic=None):
        """Fit scaler, PCA, GMM and (when ``y`` is given) PLS on ``X``.

        When ``X_semantic`` is supplied, its column-wise mean and standard
        deviation are stored so that :meth:`transform` scales every later
        batch with the *training* statistics rather than with the statistics
        of whichever batch happens to be transformed.

        Returns ``self``.
        """
        X_scaled = self.scaler.fit_transform(X)
        self.pca.fit(X_scaled)
        self.gmm.fit(X_scaled)

        if y is not None:
            y_clean = y.values if hasattr(y, 'values') else y
            self.pls.fit(X_scaled, y_clean)
        # Reflect the state of *this* fit; a refit without targets must not
        # keep using a PLS model trained on an earlier call's data.
        self.pls_fitted_ = y is not None

        if X_semantic is not None:
            sem = np.asarray(X_semantic, dtype=float)
            self.sem_mean_ = sem.mean(axis=0)
            self.sem_scale_ = sem.std(axis=0) + 1e-6
        else:
            self.sem_mean_ = None
            self.sem_scale_ = None
        return self

    def transform(self, X, X_semantic=None):
        """Return the concatenated feature matrix for ``X``.

        ``X_semantic`` (optional) is standardised with the statistics stored
        by :meth:`fit`. If none were stored (``fit`` was called without
        ``X_semantic``), the batch's own mean/std are used as a fallback.
        """
        X_scaled = self.scaler.transform(X)
        features = [self.pca.transform(X_scaled)]
        if self.pls_fitted_:
            features.append(self.pls.transform(X_scaled))
        features.append(self.gmm.predict_proba(X_scaled))

        if X_semantic is not None:
            sem = np.asarray(X_semantic, dtype=float)
            # getattr keeps instances created before these attributes existed usable.
            mean = getattr(self, 'sem_mean_', None)
            scale = getattr(self, 'sem_scale_', None)
            if mean is None or scale is None:
                mean = sem.mean(axis=0)
                scale = sem.std(axis=0) + 1e-6
            features.append((sem - mean) / scale)
        return np.hstack(features)

print('Engine ready')

In [None]:
# Scale constants as an array in TARGET_NAMES order, so it broadcasts against
# (n_samples, n_targets) target matrices.
target_max_arr = np.array([TARGET_MAX[t] for t in TARGET_NAMES], dtype=float)
# Names of the embedding columns (every train_df column starting with 'emb').
emb_cols = [c for c in train_df.columns if c.startswith('emb')]

def cross_validate(model_class, train_data, test_data, feat_engine, sem_train, sem_test):
    """Run fold-wise training and return out-of-fold and test predictions.

    For every fold value ``0 .. n_splits-1`` in ``train_data['fold']`` a fresh
    copy of ``feat_engine`` is fitted on the training part, one copy of
    ``model_class`` is trained per target on the scaled targets
    (``y / target_max_arr``), and predictions are scaled back.

    Parameters
    ----------
    model_class : estimator instance
        Unfitted regressor with ``fit`` / ``predict``; deep-copied per target.
    train_data, test_data : pandas.DataFrame
        Must contain the ``emb_cols`` columns; ``train_data`` additionally
        needs ``fold`` and the ``TARGET_NAMES`` columns.
    feat_engine : object
        Feature builder with ``fit(X, y=, X_semantic=)`` and
        ``transform(X, X_semantic=)``; deep-copied per fold.
    sem_train, sem_test : array-like or None
        Semantic features row-aligned with ``train_data`` / ``test_data``.

    Returns
    -------
    (oof, test_pred)
        ``oof`` has shape ``(len(train_data), n_targets)`` and holds the
        validation predictions; ``test_pred`` has shape
        ``(len(test_data), n_targets)`` and is the mean over folds.

    Notes
    -----
    Relies on the module-level ``emb_cols``, ``TARGET_NAMES`` and
    ``target_max_arr``.
    """
    n_splits = train_data['fold'].nunique()
    n_targets = len(TARGET_NAMES)
    y_pred = pd.DataFrame(0.0, index=train_data.index, columns=TARGET_NAMES)
    y_pred_test = np.zeros((len(test_data), n_targets), dtype=float)

    # Loop-invariant matrices: extract them once rather than on every fold.
    X_all_raw = train_data[emb_cols].values
    y_all = train_data[TARGET_NAMES].values
    X_test_raw = test_data[emb_cols].values
    fold_ids = train_data['fold'].values

    for fold in range(n_splits):
        valid_mask = fold_ids == fold
        train_mask = ~valid_mask
        val_idx = train_data.index[valid_mask]

        X_train_raw = X_all_raw[train_mask]
        X_valid_raw = X_all_raw[valid_mask]

        sem_train_fold = sem_train[train_mask] if sem_train is not None else None
        sem_valid_fold = sem_train[valid_mask] if sem_train is not None else None

        y_train_proc = y_all[train_mask] / target_max_arr

        engine = deepcopy(feat_engine)
        engine.fit(X_train_raw, y=y_train_proc, X_semantic=sem_train_fold)

        x_train_eng = engine.transform(X_train_raw, X_semantic=sem_train_fold)
        x_valid_eng = engine.transform(X_valid_raw, X_semantic=sem_valid_fold)
        x_test_eng = engine.transform(X_test_raw, X_semantic=sem_test)

        # Explicit float buffers: taking the dtype from the target columns
        # would silently truncate predictions if the targets were integers.
        fold_valid_pred = np.zeros((int(valid_mask.sum()), n_targets), dtype=float)
        fold_test_pred = np.zeros((len(test_data), n_targets), dtype=float)

        for k in range(n_targets):
            regr = deepcopy(model_class)
            regr.fit(x_train_eng, y_train_proc[:, k])
            # Undo the target scaling.
            fold_valid_pred[:, k] = regr.predict(x_valid_eng) * target_max_arr[k]
            fold_test_pred[:, k] = regr.predict(x_test_eng) * target_max_arr[k]

        y_pred.loc[val_idx] = fold_valid_pred
        y_pred_test += fold_test_pred / n_splits

    return y_pred.values, y_pred_test

# Shared feature-engine template; cross_validate deep-copies it for each fold.
feat_engine = SupervisedEmbeddingEngine(n_pca=0.80, n_pls=8, n_gmm=6)

# Train four regressors on the same folds and features. Each call returns
# (out-of-fold predictions, fold-averaged test predictions); the pred_* arrays
# are combined in the next cell.
print('Training GradientBoosting...')
oof_gb, pred_gb = cross_validate(GradientBoostingRegressor(), train_df, test_df, feat_engine, sem_train, sem_test)

print('Training HistGradientBoosting...')
oof_hb, pred_hb = cross_validate(HistGradientBoostingRegressor(), train_df, test_df, feat_engine, sem_train, sem_test)

print('Training CatBoost...')
oof_cat, pred_cat = cross_validate(CatBoostRegressor(verbose=0), train_df, test_df, feat_engine, sem_train, sem_test)

print('Training LightGBM...')
oof_lgbm, pred_lgbm = cross_validate(LGBMRegressor(verbose=-1), train_df, test_df, feat_engine, sem_train, sem_test)

print('Training complete')

In [None]:
# Equal-weight average of the four models' test predictions.
pred_test = (pred_gb + pred_hb + pred_cat + pred_lgbm) / 4.0

def post_process_biomass(df_preds):
    """Reconcile the five biomass predictions, then clip them at zero.

    Each row is orthogonally projected onto the subspace defined by

    * ``Dry_Green_g + Dry_Clover_g - GDM_g = 0``
    * ``Dry_Dead_g + GDM_g - Dry_Total_g = 0``

    and negative entries are afterwards set to 0. Because the clipping happens
    after the projection, rows that had negative entries may no longer satisfy
    the two equalities exactly.

    Parameters
    ----------
    df_preds : pandas.DataFrame
        Must contain the five target columns; other columns pass through.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df_preds`` with the five target columns replaced.
    """
    ordered_cols = ['Dry_Green_g', 'Dry_Clover_g', 'Dry_Dead_g', 'GDM_g', 'Dry_Total_g']
    constraints = np.array([
        [1, 1, 0, -1, 0],   # Green + Clover - GDM = 0
        [0, 0, 1, 1, -1],   # Dead + GDM - Total = 0
    ])
    # Projector onto the null space of the constraint matrix: I - C^T (C C^T)^-1 C
    gram_inv = np.linalg.inv(constraints @ constraints.T)
    projector = np.eye(5) - constraints.T @ gram_inv @ constraints

    stacked = df_preds[ordered_cols].values.T          # shape (5, n_rows)
    reconciled = (projector @ stacked).T.clip(min=0)   # back to (n_rows, 5)

    result = df_preds.copy()
    result[ordered_cols] = reconciled
    return result

# Write the ensemble predictions into test_df (columns in TARGET_NAMES order),
# then replace test_df with the reconciled, non-negative version.
test_df[TARGET_NAMES] = pred_test
test_df = post_process_biomass(test_df)

def melt_table(df):
    """Convert wide predictions back to the long submission layout.

    Every (image, target) pair becomes one row whose ``sample_id`` is
    ``<file name without directory and without '.jpg'>__<target_name>``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``image_path`` plus one column per entry of ``TARGET_NAMES``.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``sample_id`` and ``target``.
    """
    long_df = pd.melt(
        df,
        id_vars='image_path',
        value_vars=TARGET_NAMES,
        var_name='target_name',
        value_name='target',
    )
    # Drop everything up to the last '/', then remove the literal '.jpg'.
    image_id = long_df['image_path'].str.replace(r'^.*/', '', regex=True)
    image_id = image_id.str.replace('.jpg', '', regex=False)
    long_df['sample_id'] = image_id + '__' + long_df['target_name']
    return long_df[['sample_id', 'target']]

# Build the long-format submission and write it to the working directory
# (without the DataFrame index).
sub = melt_table(test_df)
sub.to_csv(f'{WORKING_DIR}/submission.csv', index=False)
print(sub.head())
print('saved')

In [None]:
# Remove the deep-learning stack from the interpreter's module cache: every
# entry of sys.modules whose name contains one of the markers below is dropped.
mods_to_del = [k for k in list(sys.modules.keys()) if any(x in k for x in ['torch', 'transformers', 'huggingface', 'tokenizers', 'safetensors', 'google.protobuf', 'protobuf'])]
for m in mods_to_del:
    # pop() with a default tolerates an entry that is already gone, so no
    # blanket try/except (which would also hide unrelated errors) is needed.
    sys.modules.pop(m, None)
gc.collect()
print('done')