# Entrenamiento baseline
Modelo sencillo (ResNet18) que predice Dry_Clover_g, Dry_Green_g y Dry_Dead_g a partir de cada foto.

In [None]:
import os
import sys
import random
from pathlib import Path
import numpy as np
import pandas as pd
from PIL import Image

# Detect whether we are running inside Google Colab: the import below is used
# purely as a probe, and its failure means we are in some other environment.
try:
    import google.colab  # noqa: F401
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True

if IN_COLAB:
    # Mount Google Drive and, when the project folder exists there, make it
    # the working directory so relative paths resolve against it.
    from google.colab import drive
    drive.mount('/content/drive', force_remount=False)
    project_path = '/content/drive/MyDrive/image2biomass'
    if os.path.exists(project_path):
        os.chdir(project_path)
        print(f"Directorio de trabajo cambiado a: {os.getcwd()}")
    else:
        print(f"Advertencia: No se encontró el directorio {project_path}")
else:
    # Make modules located one directory up importable. The membership check
    # keeps re-running this cell from appending the same entry repeatedly.
    if '../' not in sys.path:
        sys.path.append('../')

from utils.paths import get_data_path

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from torchvision import models, transforms

# Seed every random number generator used here (Python, NumPy, and PyTorch on
# CPU and all CUDA devices) with the same value.
SEED = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    _seed_fn(SEED)

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
device

In [None]:
# Read the train and test tables from the project data directory.
base_path = Path(get_data_path())
train_df, test_df = (pd.read_csv(base_path / csv_name) for csv_name in ('train.csv', 'test.csv'))

targets = ['Dry_Clover_g', 'Dry_Green_g', 'Dry_Dead_g']

# Reshape the long table (one row per image/target pair) into a wide one with
# one row per image and one column per target name.
wide = train_df.pivot_table(index='image_path', columns='target_name', values='target')
wide = wide.reset_index()

# Keep only the modelled targets and drop images missing any of them.
pivot = wide.loc[:, ['image_path', *targets]].dropna().reset_index(drop=True)
print(f"Imagenes disponibles: {len(pivot)}")
pivot.head()

In [None]:
# Simple 80/20 split over a random permutation of the image rows.
perm = np.random.permutation(len(pivot))
split = int(len(pivot) * 0.8)
train_rows, val_rows = perm[:split], perm[split:]
train_meta = pivot.iloc[train_rows].reset_index(drop=True)
val_meta = pivot.iloc[val_rows].reset_index(drop=True)

img_size = 224


def _make_tfms(augment):
    """Build the resize -> (optional flip) -> tensor -> normalize pipeline."""
    steps = [transforms.Resize((img_size, img_size))]
    if augment:
        steps.append(transforms.RandomHorizontalFlip())
    steps += [
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ]
    return transforms.Compose(steps)


# Only the training pipeline gets the random horizontal flip.
train_tfms = _make_tfms(augment=True)
val_tfms = _make_tfms(augment=False)

class BiomassDataset(Dataset):
    """Dataset yielding ``(image, target)`` pairs for training and validation.

    Args:
        df: Table with an ``image_path`` column plus one column for every
            name in the module-level ``targets`` list.
        images_root: Directory that the ``image_path`` values are joined to.
        transform: Callable applied to the RGB image before it is returned.
    """

    def __init__(self, df: pd.DataFrame, images_root, transform):
        self.df = df
        self.images_root = Path(images_root)
        self.transform = transform

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx):
        """Return the transformed image and its float32 target tensor."""
        # Positional lookup, so the frame's index labels do not matter.
        row = self.df.iloc[idx]
        img_path = self.images_root / row['image_path']
        # ``convert`` returns a separate, fully loaded image, so the file can
        # be closed as soon as the ``with`` block ends.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        image = self.transform(image)
        # The row holds mixed types (path string + numbers), hence the
        # explicit cast to float32.
        target = torch.tensor(row[targets].values.astype(np.float32))
        return image, target

batch_size = 16

# Both datasets read images relative to the data directory; they differ only
# in their metadata table and transform pipeline.
train_ds = BiomassDataset(df=train_meta, images_root=base_path, transform=train_tfms)
val_ds = BiomassDataset(df=val_meta, images_root=base_path, transform=val_tfms)

# Shuffle only the training data; load batches in the main process.
_loader_kwargs = {'batch_size': batch_size, 'num_workers': 0}
train_loader = DataLoader(train_ds, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **_loader_kwargs)
len(train_ds), len(val_ds)

In [None]:
def create_model():
    """Return a ResNet18 with IMAGENET1K_V1 weights and a new linear head.

    The final fully connected layer is replaced by one with a single output
    per entry in the module-level ``targets`` list.
    """
    net = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
    head_inputs = net.fc.in_features
    net.fc = nn.Linear(in_features=head_inputs, out_features=len(targets))
    return net

# Build the network and move it to the selected device.
model = create_model()
model = model.to(device)

# Mean squared error loss, optimized with Adam.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# Training length and the best validation loss seen so far (none yet).
epochs, best_val = 5, float('inf')

def run_epoch(loader, train):
    """Run one full pass over ``loader`` and return the average loss.

    Relies on the module-level ``model``, ``criterion``, ``optimizer`` and
    ``device``. When ``train`` is truthy the model is put in training mode and
    the optimizer steps after each batch; otherwise the model is put in eval
    mode and gradient tracking is disabled.

    Returns:
        The batch losses averaged over ``len(loader.dataset)``, each batch
        weighted by its number of samples.
    """
    if train:
        model.train()
    else:
        model.eval()

    running = 0.0
    with torch.set_grad_enabled(train):
        for batch_x, batch_y in loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            batch_loss = criterion(model(batch_x), batch_y)
            if train:
                optimizer.zero_grad()
                batch_loss.backward()
                optimizer.step()
            # Weight by batch size so a smaller final batch counts proportionally.
            running += batch_loss.item() * len(batch_x)
    return running / len(loader.dataset)

# Train for the configured number of epochs, checkpointing the weights each
# time the validation loss improves on the best value seen so far.
for epoch in range(1, epochs + 1):
    train_loss = run_epoch(train_loader, train=True)
    val_loss = run_epoch(val_loader, train=False)

    improved = val_loss < best_val
    if improved:
        best_val = val_loss
        os.makedirs('models', exist_ok=True)
        torch.save(model.state_dict(), 'models/baseline_resnet18.pt')

    print(f"Epoch {epoch} | train {train_loss:.4f} | val {val_loss:.4f} | best {best_val:.4f}")

In [None]:
class BiomassTestDataset(Dataset):
    """Dataset yielding ``(image, image_path)`` pairs for inference.

    Args:
        df: Table with an ``image_path`` column; a copy with a fresh
            ``0..n-1`` index is stored.
        images_root: Directory that the ``image_path`` values are joined to.
        transform: Callable applied to the RGB image before it is returned.
    """

    def __init__(self, df: pd.DataFrame, images_root, transform):
        self.df = df.reset_index(drop=True)
        self.images_root = Path(images_root)
        self.transform = transform

    def __len__(self) -> int:
        return len(self.df)

    def __getitem__(self, idx):
        """Return the transformed image together with its relative path."""
        row = self.df.iloc[idx]
        img_path = self.images_root / row['image_path']
        # ``convert`` returns a separate, fully loaded image, so the file can
        # be closed as soon as the ``with`` block ends.
        with Image.open(img_path) as raw:
            image = raw.convert('RGB')
        # The path is returned so predictions can be matched back to images.
        return self.transform(image), row['image_path']

# Among the rows whose target is one we predict, keep a single row per image.
_wanted_rows = test_df['target_name'].isin(targets)
test_meta = test_df[_wanted_rows].drop_duplicates('image_path').reset_index(drop=True)

# Test images go through the same deterministic pipeline as validation.
_test_ds = BiomassTestDataset(test_meta, base_path, val_tfms)
test_loader = DataLoader(_test_ds, batch_size=batch_size, shuffle=False, num_workers=0)

# Restore the checkpoint written during training and switch to eval mode.
_best_state = torch.load('models/baseline_resnet18.pt', map_location=device)
model.load_state_dict(_best_state)
model.eval()

# Collect one dict per test image: its path plus one predicted value for each
# target name, in the order given by ``targets``.
pred_rows = []
with torch.no_grad():
    for images, paths in test_loader:
        images = images.to(device)
        preds = model(images).cpu().numpy()
        pred_rows.extend(
            {'image_path': img_path, **dict(zip(targets, values))}
            for img_path, values in zip(paths, preds)
        )

# Wide predictions (one row per image), ordered by image path.
pred_df = pd.DataFrame(pred_rows)
pred_df = pred_df.sort_values('image_path').reset_index(drop=True)

# Back to long format: one row per (image, target) pair.
long_df = pred_df.melt(
    id_vars='image_path',
    value_vars=targets,
    var_name='target_name',
    value_name='target',
)

# sample_id is "<image file stem>__<target name>".
long_df['sample_id'] = [
    f"{Path(img).stem}__{name}"
    for img, name in zip(long_df['image_path'], long_df['target_name'])
]

submission = long_df[['sample_id', 'target']]
os.makedirs('models', exist_ok=True)
submission.to_csv('models/submission_baseline.csv', index=False)
submission.head()