# Prediccion de Precios de Casas usando CNN
Este notebook muestra un ejemplo de como entrenar una red neuronal convolucional (CNN) para predecir el precio de una casa utilizando tanto las caracteristicas numericas como las imagenes asociadas a cada propiedad.

## Carga de librerias y datos
Se utilizan librerias comunes de Python junto con **PyTorch** para definir el modelo y entrenarlo. Los datos se cargan desde los archivos CSV proporcionados y las imagenes se leen desde la carpeta `imgs`.

In [12]:
import pandas as pdimport numpy as npfrom PIL import Imageimport osimport torchfrom torch import nnfrom torch.utils.data import Dataset, DataLoaderfrom torchvision import transforms, modelsfrom sklearn.model_selection import train_test_splitfrom sklearn.preprocessing import StandardScalertrain_df = pd.read_csv('data/train_data.csv')test_df = pd.read_csv('data/test_data.csv')# Limpieza y extracción de año y mesdef preprocess(df, ref_df=None):    df = df.copy()    df['date'] = pd.to_datetime(df['date'], format='%Y%m%dT%H%M%S')    df['year'] = df['date'].dt.year    df['month'] = df['date'].dt.month    df.drop(columns='date', inplace=True)    num_cols = df.select_dtypes(include=np.number).columns    means = ref_df[num_cols].mean() if ref_df is not None else df[num_cols].mean()    df[num_cols] = df[num_cols].fillna(means)    return dftrain_df = preprocess(train_df)test_df = preprocess(test_df, train_df)numeric_cols = [c for c in train_df.select_dtypes(include=np.number).columns if c not in ['ID', 'price']]scaler = StandardScaler()train_df[numeric_cols] = scaler.fit_transform(train_df[numeric_cols])test_df[numeric_cols] = scaler.transform(test_df[numeric_cols])train_split, val_split = train_test_split(train_df, test_size=0.2, random_state=42)

## Preparacion del conjunto de datos
Creamos una clase `Dataset` que combina las columnas numericas con las imagenes de cada casa. Cada propiedad tiene hasta cinco imagenes: bano, dormitorio, comedor, cocina y sala de estar. Las imagenes se redimensionan y se concatenan en un solo tensor.

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset
from torchvision import transforms
from PIL import Image

class HouseDataset(Dataset):
    def __init__(self, df, img_dir, numeric_cols=None, scaler=None, is_train=True):
        self.df = df
        self.img_dir = img_dir
        self.is_train = is_train
        self.numeric_cols = numeric_cols or [c for c in df.select_dtypes(include=[np.number]).columns if c not in ['ID', 'price']]
        self.transform = transforms.Compose([
            transforms.Resize((64, 64)),
            transforms.ToTensor(),
        ])
        if scaler is None:
            self.means = df[self.numeric_cols].mean()
            self.stds = df[self.numeric_cols].std().replace(0, 1)
        else:
            self.means = scaler['mean']
            self.stds = scaler['std']

    def __len__(self):
        return len(self.df)

    def load_images(self, idx):
        property_id = int(self.df.iloc[idx]['ID'])
        parts = ['bath', 'bed', 'din', 'kitchen', 'living']
        images = []
        for p in parts:
            path = os.path.join(self.img_dir, f'{p}_{property_id}.jpg')
            if os.path.exists(path):
                img = Image.open(path).convert('RGB')
            else:
                img = Image.new('RGB', (64, 64), (0, 0, 0))
            images.append(self.transform(img))
        return torch.stack(images)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_tensor = self.load_images(idx)
        numeric = row[self.numeric_cols].fillna(0).astype(float)
        numeric = (numeric - self.means) / self.stds
        numeric = torch.tensor(numeric.to_numpy(), dtype=torch.float32)
        if self.is_train:
            target = torch.tensor(row['price'], dtype=torch.float32)
            return img_tensor, numeric, target
        else:
            return img_tensor, numeric


## Definicion del modelo
El modelo consta de una red convolucional que procesa cada imagen de manera independiente seguida de una capa totalmente conectada que incorpora las caracteristicas numericas.

In [14]:
class CNNModel(nn.Module):    def __init__(self, num_features):        super().__init__()        backbone = models.resnet18(weights=models.ResNet18_Weights.DEFAULT)        num_img_feat = backbone.fc.in_features        backbone.fc = nn.Identity()        self.backbone = backbone        self.fc = nn.Sequential(            nn.Linear(5 * num_img_feat + num_features, 128),            nn.ReLU(),            nn.Linear(128, 1)        )    def forward(self, images, numeric):        b, n, c, h, w = images.shape        images = images.view(b * n, c, h, w)        feats = self.backbone(images)        feats = feats.view(b, n * feats.shape[1])        x = torch.cat([feats, numeric], dim=1)        return self.fc(x).squeeze()

## Entrenamiento
Se preparan los *DataLoader* para entrenamiento y validacion. Luego se entrena la red durante varias epocas utilizando el optimizador Adam y la funcion de perdida MSE.

In [15]:
train_transforms = transforms.Compose([    transforms.Resize((224, 224)),    transforms.RandomHorizontalFlip(),    transforms.ToTensor(),])val_transforms = transforms.Compose([    transforms.Resize((224, 224)),    transforms.ToTensor(),])train_dataset = HouseDataset(train_split, 'imgs', numeric_cols, transform=train_transforms, is_train=True)val_dataset = HouseDataset(val_split, 'imgs', numeric_cols, transform=val_transforms, is_train=True)train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)val_loader = DataLoader(val_dataset, batch_size=16)model = CNNModel(num_features=len(numeric_cols))criterion = nn.MSELoss()optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)for epoch in range(10):    model.train()    for images, numeric, target in train_loader:        pred = model(images, numeric)        loss = criterion(pred, target)        optimizer.zero_grad()        loss.backward()        optimizer.step()    model.eval()    val_loss = 0.0    with torch.no_grad():        for images, numeric, target in val_loader:            pred = model(images, numeric)            val_loss += criterion(pred, target).item()    val_loss /= len(val_loader)    print(f'Epoch {epoch+1}, Val Loss: {val_loss:.4f}')

ValueError: could not convert string to float: '20141107T000000'

## Generacion de predicciones
Se cargan los datos de prueba y se guarda el archivo `submission.csv` con el formato solicitado.

In [None]:
test_dataset = HouseDataset(test_df, 'imgs', numeric_cols, transform=val_transforms, is_train=False)test_loader = DataLoader(test_dataset, batch_size=16)model.eval()preds = []with torch.no_grad():    for images, numeric in test_loader:        pred = model(images, numeric)        preds.extend(pred.cpu().numpy())submission = pd.DataFrame({'ID': test_df['ID'], 'price': preds})submission.to_csv('submission.csv', index=False)