In [None]:
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models

# Seed every random number generator used in this script (Python, NumPy,
# PyTorch CPU and, when a GPU is present, all CUDA devices) with one value.
SEED = 42
for _seed_rng in (random.seed, np.random.seed, torch.manual_seed):
    _seed_rng(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

############################
# 1. Prepare Metadata Model
############################

# Load the metadata
train_meta = pd.read_csv('train_metadata.csv')
test_meta = pd.read_csv('test_metadata.csv')

# String-valued columns that are turned into integer codes below.
categorical_cols = ['sex', 'anatom_site_general', 'tbp_tile_type', 'tbp_lv_location', 'tbp_lv_location_simple']

# Encode categorical features.
# Each encoder is fitted on the union of the train and test values:
# LabelEncoder.transform raises ValueError on a label it was not fitted on,
# so fitting on train alone would crash as soon as the test file contains a
# category (or a missing value, which astype(str) renders as 'nan') that
# never occurs in train. Fitting on the union keeps the integer codes
# consistent across both frames.
for col in categorical_cols:
    le = LabelEncoder()
    le.fit(pd.concat([train_meta[col], test_meta[col]], ignore_index=True).astype(str))
    train_meta[col] = le.transform(train_meta[col].astype(str))
    test_meta[col] = le.transform(test_meta[col].astype(str))

# Separate features and labels
X = train_meta.drop(['id', 'target'], axis=1)

# XGBoost rejects DataFrame columns whose dtype is not int, float, bool or
# category (it raises "DataFrame.dtypes for data must be int, float, bool or
# category"). Any column still holding raw strings at this point would abort
# training, so such columns are removed from the feature matrix and reported.
_non_numeric_cols = X.columns.difference(
    X.select_dtypes(include=['number', 'bool', 'category']).columns
)
if len(_non_numeric_cols) > 0:
    print(f"Dropping non-numeric metadata columns: {list(_non_numeric_cols)}")
    X = X.drop(columns=_non_numeric_cols)

y = train_meta['target'].values

# Train-validation split (80/20), stratified on the label so both parts keep
# the same class balance; X keeps its original index, which later code uses
# to select the matching image rows.
X_train_meta, X_val_meta, y_train_meta, y_val_meta = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Hyper-parameters of the gradient-boosted classifier trained on metadata.
_xgb_params = dict(
    n_estimators=100,
    max_depth=5,
    learning_rate=0.05,
    random_state=SEED,
    use_label_encoder=False,
    eval_metric='logloss',
    enable_categorical=True,
)
metadata_model = xgb.XGBClassifier(**_xgb_params)

# Fit on the training split; the validation split is passed as eval_set so
# the configured eval_metric is computed on it during training.
eval_set = [(X_val_meta, y_val_meta)]
metadata_model.fit(X_train_meta, y_train_meta, eval_set=eval_set, verbose=False)

# Keep the second probability column for the validation rows
# (class 1 when the labels are 0/1 -- TODO confirm the label values).
_val_proba = metadata_model.predict_proba(X_val_meta)
p_metadata_val = _val_proba[:, 1]

############################
# 2. Image Dataset and Model
############################

# Image folders for the two splits. Files are looked up as "<id>.jpg", so
# the image file names are assumed to match the ids in the metadata.
train_image_dir, test_image_dir = 'train_images', 'test_images'

# Create a dataset class for image data
class SkinDataset(Dataset):
    """Image dataset driven by the ``id`` column of a metadata frame.

    Every item is read from ``<image_dir>/<id>.jpg`` and converted to RGB.
    When the frame has a ``target`` column the items are ``(image, label)``
    pairs; otherwise they are ``(image, image_id)`` pairs.

    Args:
        df: Frame with an ``id`` column and, optionally, a ``target`` column.
            Its index is reset so positions 0..len-1 can be used as keys.
        image_dir: Directory that contains the ``.jpg`` files.
        transform: Optional callable applied to the loaded PIL image.
    """

    def __init__(self, df, image_dir, transform=None):
        self.df = df.reset_index(drop=True)
        self.image_dir = image_dir
        self.transform = transform
        # None marks an unlabeled frame (no 'target' column).
        self.labels = self.df['target'].values if 'target' in self.df.columns else None

    def __len__(self):
        """Return the number of rows in the metadata frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load item ``idx``; see the class docstring for the tuple layout."""
        image_id = self.df.loc[idx, 'id']
        img_path = os.path.join(self.image_dir, str(image_id) + '.jpg')  # Assuming .jpg format

        # Open inside a context manager so the file handle is released
        # deterministically; convert() returns an independent copy that
        # stays valid after the source image is closed.
        with Image.open(img_path) as source_image:
            image = source_image.convert('RGB')

        # Explicit None check: a transform object must not be skipped just
        # because it happens to evaluate as falsy.
        if self.transform is not None:
            image = self.transform(image)

        if self.labels is None:
            return image, image_id
        return image, self.labels[idx]

# Image preprocessing. Both pipelines resize to 224x224, convert to a tensor
# and normalise with the same per-channel mean/std; only the training
# pipeline additionally applies random horizontal and vertical flips.
_image_size = (224, 224)
_channel_mean = [0.485, 0.456, 0.406]
_channel_std = [0.229, 0.224, 0.225]

train_transform = transforms.Compose(
    [
        transforms.Resize(_image_size),
        transforms.RandomHorizontalFlip(),
        transforms.RandomVerticalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(_channel_mean, _channel_std),
    ]
)

val_transform = transforms.Compose(
    [
        transforms.Resize(_image_size),
        transforms.ToTensor(),
        transforms.Normalize(_channel_mean, _channel_std),
    ]
)

# Reuse the row indices of the metadata split so the image datasets cover
# exactly the same train/validation rows as the metadata model.
train_ids_meta, val_ids_meta = X_train_meta.index, X_val_meta.index

train_df_image = train_meta.loc[train_ids_meta].reset_index(drop=True)
val_df_image = train_meta.loc[val_ids_meta].reset_index(drop=True)

# Both splits read from the training image folder; they differ only in the
# rows they contain and in the transform that is applied.
train_dataset = SkinDataset(train_df_image, train_image_dir, transform=train_transform)
val_dataset = SkinDataset(val_df_image, train_image_dir, transform=val_transform)

# Only the training loader shuffles its samples.
_loader_kwargs = dict(batch_size=32, num_workers=4)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)

############################
# 3. Train ResNet-50 Model
############################

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Pretrained ResNet-50 whose existing parameters are all frozen.
image_model = models.resnet50(pretrained=True)
for param in image_model.parameters():
    param.requires_grad_(False)

# Swap the final layer for a single-logit head. It is created after the
# freeze above, so its parameters remain trainable.
num_features = image_model.fc.in_features
image_model.fc = nn.Linear(num_features, 1)
image_model = image_model.to(device)

# Binary cross-entropy on raw logits; the optimizer is handed only the new
# head's parameters.
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(image_model.fc.parameters(), lr=1e-3)

def train_one_epoch(model, loader, optimizer, criterion, device):
    """Run one optimisation pass over ``loader`` and return the mean loss.

    Args:
        model: Module put into training mode and called on each image batch.
        loader: Iterable of ``(images, labels)`` batches exposing ``.dataset``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss called as ``criterion(flat_logits, float_labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        Sum of ``batch_loss * batch_size`` divided by ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0
    for batch_images, batch_labels in loader:
        batch_images = batch_images.to(device)
        batch_targets = batch_labels.float().to(device)

        optimizer.zero_grad()
        # Flatten the model output so it lines up with the 1-D label vector.
        flat_logits = model(batch_images).view(-1)
        batch_loss = criterion(flat_logits, batch_targets)
        batch_loss.backward()
        optimizer.step()

        # Scale by the batch size; together with the division below this
        # yields a per-sample mean when criterion returns a batch mean.
        running_loss += batch_loss.item() * batch_images.size(0)
    return running_loss / len(loader.dataset)

def evaluate(model, loader, criterion, device):
    """Score ``model`` on ``loader`` without computing gradients.

    Args:
        model: Module put into eval mode and called on each image batch.
        loader: Iterable of ``(images, labels)`` batches exposing ``.dataset``.
        criterion: Loss called as ``criterion(flat_logits, float_labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(avg_loss, preds, truths)``: the sum of ``batch_loss * batch_size``
        divided by ``len(loader.dataset)``, the sigmoid of the logits as a
        flat NumPy array, and the float labels as a NumPy array.
    """
    model.eval()
    probabilities, observed_labels = [], []
    running_loss = 0
    with torch.no_grad():
        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_targets = batch_labels.float().to(device)

            raw_logits = model(batch_images)
            batch_loss = criterion(raw_logits.view(-1), batch_targets)
            running_loss += batch_loss.item() * batch_images.size(0)

            # Sigmoid maps the logits to values in (0, 1); flatten so each
            # sample contributes one scalar to the prediction list.
            batch_probs = torch.sigmoid(raw_logits).cpu().numpy().ravel()
            probabilities.extend(batch_probs)
            observed_labels.extend(batch_targets.cpu().numpy())
    mean_loss = running_loss / len(loader.dataset)
    return mean_loss, np.array(probabilities), np.array(observed_labels)

# Train the model
epochs = 5
best_auc = 0
for epoch in range(1, epochs + 1):
    train_loss = train_one_epoch(image_model, train_loader, optimizer, criterion, device)
    val_loss, val_preds, val_truth


ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:tbp_tile_type: object, tbp_lv_location: object, tbp_lv_location_simple: object