<a href="https://colab.research.google.com/github/ma850419/Fast_UNet/blob/main/Csiro_Resnet50_XGBoost_V1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import resnet50, ResNet50_Weights
from PIL import Image
import pandas as pd
import numpy as np
# -----------------------------
# 1. Custom Dataset for biomass regression
# -----------------------------
class BiomassDataset(Dataset):
    """Image-regression dataset driven by a CSV file.

    The CSV is read once at construction time. Each row must provide an
    ``image_path`` column (file opened with PIL) and a ``target`` column
    (the value to regress).

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.
        transform: Optional callable applied to the RGB PIL image.
    """

    def __init__(self, csv_file, transform=None):
        self.data = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        # One sample per CSV row.
        return self.data.shape[0]

    def __getitem__(self, idx):
        """Return ``(image, target)`` for the row at position ``idx``."""
        row = self.data.iloc[idx]
        path, value = row["image_path"], row["target"]

        picture = Image.open(path).convert("RGB")
        if self.transform:
            picture = self.transform(picture)

        label = torch.tensor(value, dtype=torch.float32)
        return picture, label

# -----------------------------
# 2. Transforms
# -----------------------------
# Resize to a fixed 384x384 input, convert to a tensor, then normalise each
# channel with the mean/std values conventionally used with torchvision's
# pretrained models.
transform = transforms.Compose(
    [
        transforms.Resize(size=(384, 384)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

# -----------------------------
# 3. Load dataset
# -----------------------------
train_dataset = BiomassDataset(
    csv_file="/content/drive/MyDrive/csiro/csiro-biomass/train.csv",
    transform=transform,
)
# Batches of 16, reshuffled every epoch.
train_loader = DataLoader(dataset=train_dataset, batch_size=16, shuffle=True)

# -----------------------------
# 4. Define ResNet50 model for regression
# -----------------------------
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Start from the pretrained weights and replace the classifier with a
# single-output linear head so the network regresses one value per image.
resnet = resnet50(weights=ResNet50_Weights.DEFAULT)
resnet.fc = nn.Linear(resnet.fc.in_features, 1)
resnet = resnet.to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(resnet.parameters(), lr=1e-4)

# -----------------------------
# 5. Training loop
# -----------------------------
num_epochs = 15  # adjust as needed

for epoch in range(num_epochs):
    resnet.train()
    running_loss = 0.0
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        # Drop only the output dimension: (B, 1) -> (B,). A bare .squeeze()
        # would also drop the batch dimension when the last batch holds a
        # single sample, yielding a 0-d tensor that no longer matches the
        # (1,)-shaped targets and triggers a size-mismatch warning in MSELoss.
        outputs = resnet(inputs).squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure below is a true per-sample average.
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {epoch_loss:.4f}")

# -----------------------------
# 6. Save trained weights
# -----------------------------
# Only the state_dict is stored; loading requires rebuilding the same
# architecture (ResNet50 with a 1-unit fc head) first.
torch.save(resnet.state_dict(), "/content/drive/MyDrive/resnet50_trained.pth")
print("Trained ResNet50 weights saved as resnet50_trained.pth")



In [None]:
import torch
import numpy as np
import pandas as pd
from PIL import Image
from torchvision import transforms
from torchvision.models import resnet50
import xgboost as xgb
from sklearn.model_selection import KFold
from sklearn.metrics import r2_score

# Reload trained ResNet50 weights.
# The architecture must match the one used at training time (1-unit fc head)
# before the state_dict can be loaded.
resnet = resnet50(weights=None)
# Use the fully-qualified torch.nn here: this cell imports `torch` but not the
# `nn` alias, so relying on `nn` only works if the training cell ran earlier
# in the same kernel.
resnet.fc = torch.nn.Linear(resnet.fc.in_features, 1)  # match training
resnet.load_state_dict(torch.load("/content/drive/MyDrive/resnet50_trained.pth", map_location="cpu"))

# Now strip off the regression head to get embeddings: with an Identity in
# place of fc, the forward pass returns the features that used to feed it.
resnet.fc = torch.nn.Identity()
# eval() freezes batch-norm statistics so embeddings are deterministic.
resnet.eval()

# Same preprocessing as the training cell so the embeddings see the input
# distribution the network was fine-tuned on.
transform = transforms.Compose([
    transforms.Resize((384,384)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485,0.456,0.406],
                         std=[0.229,0.224,0.225])
])

def extract_features(img_path):
    """Return the embedding of one image as a NumPy array.

    The image is opened, converted to RGB, preprocessed with the module-level
    ``transform`` and passed through the module-level ``resnet`` without
    tracking gradients.

    Args:
        img_path: Path of the image file to embed.

    Returns:
        The model output with singleton dimensions removed, as a NumPy array
        living on the CPU.
    """
    img = Image.open(img_path).convert("RGB")
    # Run on whatever device the model currently lives on, so moving `resnet`
    # to a GPU does not require touching this function.
    model_device = next(resnet.parameters()).device
    x = transform(img).unsqueeze(0).to(model_device)  # add the batch dimension
    with torch.no_grad():
        feat = resnet(x)
    # .cpu() is a no-op for CPU tensors and is required before .numpy() otherwise.
    return feat.squeeze().cpu().numpy()

# Load train/test CSVs
train = pd.read_csv("/content/drive/MyDrive/csiro/csiro-biomass/train.csv")
test = pd.read_csv("/content/drive/MyDrive/csiro/csiro-biomass/test.csv")
y = train["target"]
sample_ids = test["sample_id"]

# Tabular categorical features: one-hot encode target_name (with an explicit
# NaN column), then force the test frame onto the train columns so both
# matrices have the same width and column order.
X_cat = pd.get_dummies(train[["target_name"]], dummy_na=True)
test_cat = pd.get_dummies(test[["target_name"]], dummy_na=True)
X_cat, test_cat = X_cat.align(test_cat, join="left", axis=1, fill_value=0)


def _embed_paths(paths):
    """Stack one embedding row per entry of ``paths``, embedding each distinct path once."""
    cache = {}
    rows = []
    for path in paths:
        # The same image may appear on several CSV rows; the forward pass is
        # deterministic in eval mode, so reuse the first result.
        if path not in cache:
            cache[path] = extract_features(path)
        rows.append(cache[path])
    return np.vstack(rows)


# Extract embeddings
train_img_feats = _embed_paths(train["image_path"])
test_img_feats = _embed_paths(test["image_path"])

# Final design matrices: one-hot columns first, image embedding after.
X = np.hstack([X_cat.values, train_img_feats])
X_test = np.hstack([test_cat.values, test_img_feats])

# Cross-validation with XGBoost
# NOTE(review): folds are drawn per row; if several rows share an image, the
# same embedding can land in both train and validation — confirm this is intended.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
test_preds = np.zeros(len(X_test))
val_scores = []

# Hyper-parameters and the test matrix do not depend on the fold, so build
# them once instead of on every iteration.
params = {
    "objective": "reg:squarederror",
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "seed": 42
}
dtest = xgb.DMatrix(X_test)

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f"Fold {fold+1}")
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)

    model = xgb.train(params, dtrain, num_boost_round=2000,
                      evals=[(dval, "valid")],
                      early_stopping_rounds=100,
                      verbose_eval=100)

    # xgb.train returns the booster from the last round, not the best one.
    # Restrict prediction to the trees up to the best validation round so the
    # rounds trained after the optimum do not leak into the score or the
    # test predictions.
    best_range = (0, model.best_iteration + 1)

    val_pred = model.predict(dval, iteration_range=best_range)
    score = r2_score(y_val, val_pred)
    val_scores.append(score)
    print(f"Fold {fold+1} R²: {score:.4f}")

    # Average the fold models' test predictions.
    test_preds += model.predict(dtest, iteration_range=best_range) / kf.n_splits

print(f"Average CV R²: {np.mean(val_scores):.4f}")

# Save the booster from the last fold only (earlier fold models are discarded;
# the submission below is the average over all folds, not this single model).
model.save_model("xgb_biomass.model")

# Save submission
# Clip negative predictions to zero before writing.
test_preds = np.maximum(test_preds, 0)
# NOTE(review): confirm "Prediction" is the column name the scorer expects.
submission = pd.DataFrame({"sample_id": sample_ids, "Prediction": test_preds})
submission.to_csv("submission.csv", index=False)
print("Submission file saved.")
