<a href="https://colab.research.google.com/github/manoprasad2006/kaagle-comp1/blob/main/Training_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: SOME KAGGLE DATA SOURCES ARE PRIVATE
# RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES.
import kagglehub
kagglehub.login()


In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Fetch the competition data through kagglehub. The return value is kept in a
# variable (presumably the local download location — confirm against the
# kagglehub docs); the functions visible below default to hard-coded
# /kaggle/input/... paths rather than reading this variable.
rice_pistachio_and_grapevine_leaf_classification_path = kagglehub.competition_download('rice-pistachio-and-grapevine-leaf-classification')

print('Data source import complete.')


In [None]:
import os
import gc
import time
import psutil
import logging
import json
import pickle
from datetime import datetime
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight




import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
from torchvision import models

import cv2
from PIL import Image
import albumentations as A
from albumentations.pytorch import ToTensorV2

import xgboost as xgb

In [None]:
# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
# `device` is read as a module-level global by the training, validation and
# prediction code further down.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

Using device: cuda


In [None]:
import torch.nn.functional as F

class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Per sample the loss is ``alpha * (1 - p_t) ** gamma * w[y] * CE``, where
    ``p_t`` is the softmax probability assigned to the true class and ``w`` is
    the optional class-weight vector. The per-sample values are averaged over
    the batch.

    Args:
        alpha: Scalar multiplier applied to every sample's loss.
        gamma: Focusing exponent; ``gamma=0`` reduces to (weighted) cross entropy.
        weight: Optional 1-D tensor of per-class weights, passed to
            ``F.cross_entropy``. It is stored as a plain attribute, so the
            caller must place it on the same device as the logits.
    """

    def __init__(self, alpha=1, gamma=2, weight=None):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.weight = weight

    def forward(self, inputs, targets):
        """Return the mean focal loss for ``inputs`` (logits) and ``targets`` (class indices)."""
        # p_t must come from the UNWEIGHTED cross entropy: exp(-w * CE) is not a
        # probability, so deriving it from the weighted loss distorts the
        # focusing factor for every class whose weight differs from 1.
        pt = torch.exp(-F.cross_entropy(inputs, targets, reduction='none'))
        ce_loss = F.cross_entropy(inputs, targets, weight=self.weight, reduction='none')
        focal_loss = self.alpha * (1 - pt) ** self.gamma * ce_loss
        return focal_loss.mean()

In [None]:
def mixup_data(x, y, alpha=1.0):
    """Blend each sample of a batch with a randomly chosen partner from the same batch.

    Args:
        x: Batch tensor whose first dimension indexes samples (needs at least
            two dimensions because of the ``[perm, :]`` indexing).
        y: Targets aligned with ``x`` along the first dimension.
        alpha: Beta-distribution parameter; when it is not positive the
            mixing coefficient is fixed at 1 and ``x`` is returned unmixed.

    Returns:
        ``(mixed_x, y_a, y_b, lam)`` where ``y_a`` is ``y``, ``y_b`` is ``y``
        in the partner order, and ``lam`` is the weight given to the
        original sample.
    """
    lam = np.random.beta(alpha, alpha) if alpha > 0 else 1

    # One random partner per sample, drawn on the CPU and moved to x's device.
    perm = torch.randperm(x.size(0)).to(x.device)
    partner_x = x[perm, :]

    mixed_x = lam * x + (1 - lam) * partner_x
    return mixed_x, y, y[perm], lam

def mixup_criterion(criterion, pred, y_a, y_b, lam):
    """Combine the loss against both mixup targets, weighted ``lam`` and ``1 - lam``."""
    loss_a = criterion(pred, y_a)
    loss_b = criterion(pred, y_b)
    return lam * loss_a + (1 - lam) * loss_b

In [None]:
def setup_logging():
    """Configure root logging to a timestamped file plus the console.

    A new ``training_log_<YYYYmmdd_HHMMSS>.log`` file is created in the
    current working directory on every call.

    Returns:
        The logger named after this module.
    """
    log_filename = f"training_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(levelname)s - %(message)s',
        handlers=[
            logging.FileHandler(log_filename),
            logging.StreamHandler()
        ],
        # basicConfig silently does nothing when the root logger already has
        # handlers, which is common inside notebook kernels and always the
        # case when this cell is re-run. force=True removes and closes the
        # existing handlers so the file handler is really attached.
        force=True
    )
    return logging.getLogger(__name__)

# Configure logging once at cell execution; as a side effect this creates a
# new timestamped log file in the current working directory.
logger = setup_logging()

In [None]:
class Config:
    """Notebook-wide settings, read as class attributes (never instantiated here)."""

    # --- Reproducibility ---
    SEED = 42

    # --- Data / model shape ---
    IMG_SIZE = 256        # side length images are resized/cropped to
    NUM_CLASSES = 20      # default output width of ResNetClassifier

    # --- Optimisation ---
    BATCH_SIZE = 28
    EPOCHS = 18
    LEARNING_RATE = 2e-4  # also used as the OneCycleLR peak learning rate
    PATIENCE = 6          # epochs without validation improvement before early stop

    # --- Cross-validation ---
    NUM_FOLDS = 4
    N_SPLITS = NUM_FOLDS  # alias read by train_cnn's internal split

    # --- Checkpointing / resume ---
    CHECKPOINT_DIR = "checkpoints"
    RESUME_FROM_CHECKPOINT = False
    SAVE_EVERY_EPOCH = False
    SAVE_EVERY_N_BATCHES = 300

    # --- Resource guards ---
    # NOTE(review): the three settings below, and the two SAVE_EVERY_* flags
    # above, are not read by any code in the visible part of the notebook.
    MAX_MEMORY_PERCENT = 90
    AUTO_REDUCE_BATCH_ON_OOM = True
    MIN_BATCH_SIZE = 8

In [None]:
import random  # needed only for seeding; imported here so this cell stands alone

# Create the checkpoint directory up front so later saves cannot fail on a
# missing folder.
os.makedirs(Config.CHECKPOINT_DIR, exist_ok=True)

# Seed every random number generator the notebook may draw from. Python's own
# `random` was previously left unseeded; some augmentation libraries draw from
# it (NOTE(review): confirm for the installed albumentations version), which
# would make augmentations differ between otherwise identical runs.
random.seed(Config.SEED)
torch.manual_seed(Config.SEED)
np.random.seed(Config.SEED)

In [None]:
def cleanup_memory():
    """Run the Python garbage collector and, when CUDA is present, empty its cache."""
    gc.collect()
    if not torch.cuda.is_available():
        return
    torch.cuda.empty_cache()

In [None]:
def save_checkpoint(model, optimizer, scheduler, epoch, fold, batch_idx, loss, filepath):
    """Write a resumable training checkpoint to ``filepath`` (best effort).

    The checkpoint holds the model/optimizer/scheduler state dicts, the
    position in training (``epoch``, ``fold``, ``batch_idx``), the loss, a
    timestamp, current RAM usage and a snapshot of the main Config values.

    The file is first written to ``<filepath>.tmp`` and then moved into place
    with ``os.replace``, so an interrupted save (session killed, disk full)
    leaves the previous checkpoint intact instead of a truncated file.

    Args:
        model: Module whose ``state_dict()`` is saved.
        optimizer: Optimizer whose ``state_dict()`` is saved.
        scheduler: LR scheduler, or a falsy value to store ``None``.
        epoch: Epoch number to record.
        fold: Fold number to record.
        batch_idx: Batch index to record.
        loss: Loss value to record.
        filepath: Destination path of the checkpoint file.

    Save failures are printed and swallowed so that a failed checkpoint never
    aborts training.
    """
    checkpoint = {
        'epoch': epoch,
        'fold': fold,
        'batch_idx': batch_idx,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler else None,
        'loss': loss,
        'timestamp': datetime.now().isoformat(),
        'memory_usage': psutil.virtual_memory().percent,
        'config': {
            'batch_size': Config.BATCH_SIZE,
            'learning_rate': Config.LEARNING_RATE,
            'img_size': Config.IMG_SIZE
        }
    }

    tmp_path = f"{filepath}.tmp"
    try:
        torch.save(checkpoint, tmp_path)
        os.replace(tmp_path, filepath)  # atomic swap: readers never see a partial file
        print(f"Checkpoint saved: {filepath}")
    except Exception as e:
        print(f"Failed to save checkpoint: {e}")
        # Do not leave a half-written temp file behind.
        if os.path.exists(tmp_path):
            try:
                os.remove(tmp_path)
            except OSError as cleanup_error:
                print(f"Could not remove temporary checkpoint {tmp_path}: {cleanup_error}")

In [None]:
class AgriculturalDataset(Dataset):
    """Image dataset that reads files lazily and never fails on a bad image.

    Args:
        image_paths: Sequence of image file paths.
        labels: Optional sequence of integer class labels aligned with
            ``image_paths``; when omitted, items are images only.
        transforms: Optional callable invoked as ``transforms(image=img)`` and
            expected to return a mapping with an ``'image'`` entry (the
            albumentations calling convention).
    """

    def __init__(self, image_paths, labels=None, transforms=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transforms = transforms

    def __len__(self):
        return len(self.image_paths)

    @staticmethod
    def _blank_image():
        """Return the black HxWx3 uint8 placeholder used for unreadable files."""
        return np.zeros((Config.IMG_SIZE, Config.IMG_SIZE, 3), dtype=np.uint8)

    def _read_image(self, image_path):
        """Load ``image_path`` as an RGB array, or a black placeholder if it cannot be read."""
        if not os.path.exists(image_path):
            print(f"Image not found: {image_path}")
            return self._blank_image()

        image = cv2.imread(image_path)
        if image is None:
            print(f"Failed to load image: {image_path}")
            return self._blank_image()

        # OpenCV decodes to BGR; the rest of the pipeline works in RGB.
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def __getitem__(self, idx):
        """Return ``(image, label)`` when labels exist, otherwise just ``image``.

        If loading or transforming fails, the image is replaced by an
        all-zero ``(3, IMG_SIZE, IMG_SIZE)`` tensor. The sample keeps its real
        label: substituting class 0 would silently feed wrongly labelled
        samples into training. A failing label lookup (for example labels
        shorter than image_paths) is a data bug and is allowed to raise.
        """
        try:
            image = self._read_image(self.image_paths[idx])
            if self.transforms:
                image = self.transforms(image=image)['image']
        except Exception as e:
            print(f"Error loading image {idx}: {e}")
            image = torch.zeros((3, Config.IMG_SIZE, Config.IMG_SIZE))

        if self.labels is None:
            return image
        return image, torch.tensor(self.labels[idx], dtype=torch.long)

In [None]:
def get_transforms(phase='train'):
    """Build the albumentations pipeline for the given phase.

    Args:
        phase: ``'train'`` selects the augmenting pipeline; any other value
            selects the deterministic resize + normalize pipeline.

    Returns:
        An ``A.Compose`` whose output ``'image'`` entry comes from ``ToTensorV2``.
    """
    size = Config.IMG_SIZE
    # Channel statistics shared by both pipelines (the values commonly used
    # for ImageNet-pretrained backbones).
    norm_mean = [0.485, 0.456, 0.406]
    norm_std = [0.229, 0.224, 0.225]

    if phase != 'train':
        return A.Compose([
            A.Resize(size, size),
            A.Normalize(mean=norm_mean, std=norm_std),
            ToTensorV2()
        ])

    # Resize either to the target size or 32px larger; the crop that follows
    # brings both back to size x size.
    random_resize = A.OneOf([
        A.Resize(size, size),
        A.Resize(size + 32, size + 32),
    ], p=1.0)

    noise_or_blur = A.OneOf([
        A.GaussNoise(var_limit=(10.0, 50.0)),
        A.GaussianBlur(blur_limit=3),
        A.MotionBlur(blur_limit=3),
    ], p=0.4)

    train_steps = [
        random_resize,
        A.RandomCrop(size, size),
        A.HorizontalFlip(p=0.5),
        A.VerticalFlip(p=0.3),
        A.RandomRotate90(p=0.5),
        A.ShiftScaleRotate(shift_limit=0.15, scale_limit=0.15, rotate_limit=25, p=0.7),
        A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.7),
        A.HueSaturationValue(hue_shift_limit=25, sat_shift_limit=35, val_shift_limit=25, p=0.6),
        noise_or_blur,
        A.CoarseDropout(max_holes=6, max_height=24, max_width=24, p=0.4),
        A.Normalize(mean=norm_mean, std=norm_std),
        ToTensorV2(),
    ]
    return A.Compose(train_steps)

In [None]:
class ResNetClassifier(nn.Module):
    """ResNet-101 backbone followed by a feature gate and an MLP classifier.

    ``forward`` returns both the class logits and the gated 2048-d feature
    vector, so the same model can act as a feature extractor.

    Args:
        num_classes: Width of the final linear layer.
        pretrained: Forwarded to ``models.resnet101``.
    """

    def __init__(self, num_classes=Config.NUM_CLASSES, pretrained=True):
        super().__init__()

        # Replace the backbone's own classification layer with a pass-through
        # so it outputs the 2048-d pooled features.
        resnet = models.resnet101(pretrained=pretrained)
        resnet.fc = nn.Identity()
        self.backbone = resnet

        # Bottleneck MLP ending in a sigmoid: one multiplicative gate in
        # (0, 1) per feature channel.
        gate_layers = [
            nn.Linear(2048, 512),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(512, 2048),
            nn.Sigmoid(),
        ]
        self.attention = nn.Sequential(*gate_layers)

        # Classification head: 2048 -> 1024 -> 512 -> num_classes.
        head_layers = [
            nn.Dropout(0.4),
            nn.Linear(2048, 1024),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(1024),
            nn.Dropout(0.3),
            nn.Linear(1024, 512),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(512),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

        self._init_weights()

    def _init_weights(self):
        """Xavier-initialise every ``nn.Linear`` weight in the module and zero its bias."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for layer in linear_layers:
            nn.init.xavier_uniform_(layer.weight)
            if layer.bias is None:
                continue
            nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return ``(logits, gated_features)`` for the input batch ``x``."""
        pooled = self.backbone(x)
        gate = self.attention(pooled)
        gated = pooled * gate
        return self.classifier(gated), gated

In [None]:
def load_data(train_dir, test_dir, train_labels_file='/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train.csv'):
    """Collect training image paths with labels, and test image paths with ids.

    Directory listings are sorted by file name. ``os.listdir`` returns entries
    in arbitrary order, which would make the sample order (and therefore any
    seeded fold split built from it) differ between machines or runs.

    Args:
        train_dir: Directory containing the training images.
        test_dir: Directory containing the test images; if it does not exist
            the returned test lists are empty.
        train_labels_file: CSV with an ``ID`` column (image file name) and a
            ``TARGET`` column (label).

    Returns:
        ``(train_images, train_labels, test_images, test_ids)``. Training
        images without an entry in the labels file are skipped.

    Raises:
        FileNotFoundError: If ``train_labels_file`` is empty or missing.
        Exception: Any error raised while reading the labels or listing
            ``train_dir`` is printed and re-raised.
    """
    image_suffixes = ('.jpg', '.jpeg', '.png')
    train_images = []
    train_labels = []

    print("Loading training data...")

    # Load training data with labels file
    if train_labels_file and os.path.exists(train_labels_file):
        try:
            labels_df = pd.read_csv(train_labels_file)
            label_dict = dict(zip(labels_df['ID'], labels_df['TARGET']))

            for img_name in sorted(os.listdir(train_dir)):
                if not img_name.lower().endswith(image_suffixes):
                    continue
                if img_name in label_dict:
                    train_images.append(os.path.join(train_dir, img_name))
                    train_labels.append(label_dict[img_name])

            print(f"Loaded {len(train_images)} training images with labels")

        except Exception as e:
            print(f"Error loading labels file: {e}")
            raise
    else:
        print("No labels file provided! Please create train_labels.csv")
        raise FileNotFoundError("train_labels.csv is required")

    # Load test data
    test_images = []
    test_ids = []

    print("Loading test data...")
    if os.path.exists(test_dir):
        for img_name in sorted(os.listdir(test_dir)):
            if img_name.lower().endswith(image_suffixes):
                test_images.append(os.path.join(test_dir, img_name))
                test_ids.append(img_name)

        print(f"Loaded {len(test_images)} test images")

    return train_images, train_labels, test_images, test_ids


In [None]:
def extract_features(model, dataloader, device):
    """Run ``model`` over ``dataloader`` and collect its feature vectors.

    Args:
        model: Module whose forward pass returns ``(logits, features)``.
        dataloader: Iterable yielding either an image batch tensor or an
            ``(images, labels)`` list/tuple.
        device: Device the image batches are moved to before the forward pass.

    Returns:
        ``(features, labels)`` as numpy arrays; ``labels`` is ``None`` when no
        batch carried labels.

    Raises:
        Exception: Any failure is printed and re-raised.
    """
    model.eval()
    features = []
    labels = []

    try:
        with torch.no_grad():
            for batch_idx, batch in enumerate(dataloader):
                # Decide by container type, not by len(): an unlabeled image
                # tensor holding exactly two samples also has len() == 2 and
                # would otherwise be split into "images" and "labels".
                if isinstance(batch, (list, tuple)):
                    images = batch[0]
                    if len(batch) > 1:
                        labels.extend(batch[1].cpu().numpy())
                else:
                    images = batch

                images = images.to(device)
                _, batch_features = model(images)
                features.extend(batch_features.cpu().numpy())

                # Periodic cleanup
                if batch_idx % 10 == 0:
                    cleanup_memory()

        return np.array(features), np.array(labels) if labels else None

    except Exception as e:
        print(f"Feature extraction failed: {e}")
        raise


In [None]:
class AgriculturalClassifier:
    """Main classifier class with robustness features"""

    def __init__(self):
        self.cnn_model = None
        self.xgb_model = None
        self.label_encoder = LabelEncoder()
        self.class_names = None

    def train_cnn(self, train_images, train_labels, val_images=None, val_labels=None, fold=0):
        """Train the CNN for one fold and return its per-epoch history.

        ``self.label_encoder`` is fitted on ``train_labels``. When no
        validation set is supplied, split number ``fold`` of a seeded
        ``StratifiedKFold`` over the training data is held out instead.
        Training uses class-weighted focal loss, AdamW, a OneCycleLR schedule,
        gradient-norm clipping and mixup on roughly 40% of the batches. The
        best-scoring weights are saved to ``best_model_fold_{fold}.pth`` and a
        resumable checkpoint is written after every epoch.

        Args:
            train_images: Sequence of training image paths.
            train_labels: Raw labels aligned with ``train_images``.
            val_images: Optional validation image paths.
            val_labels: Optional raw validation labels; must only contain
                labels that also occur in ``train_labels``.
            fold: Fold index, used for file names and to choose the held-out
                split when no validation data is given.

        Returns:
            ``(train_losses, val_accuracies)``: mean training loss and
            validation accuracy (in percent) for each completed epoch.

        Raises:
            ValueError: If ``fold`` is not a valid split index when the
                internal split is used.
            Exception: Any training failure is re-raised after an emergency
                checkpoint has been attempted.

        Note:
            ``load_checkpoint`` and ``monitor_memory`` are called here but are
            not defined in the visible part of the notebook.
        """
        print(f"Starting robust CNN training for fold {fold}")

        # Encode labels
        encoded_labels = self.label_encoder.fit_transform(train_labels)
        self.class_names = self.label_encoder.classes_
        print(f"Found {len(self.class_names)} classes: {self.class_names}")

        # Pick training / validation data
        if val_images is None or val_labels is None:
            print("Using StratifiedKFold for splitting data")
            skf = StratifiedKFold(n_splits=Config.N_SPLITS, shuffle=True, random_state=Config.SEED)
            splits = list(skf.split(train_images, encoded_labels))
            if not 0 <= fold < len(splits):
                raise ValueError(f"fold must be between 0 and {len(splits) - 1}, got {fold}")
            train_idx, val_idx = splits[fold]
            train_imgs_for_training = [train_images[i] for i in train_idx]
            train_labels_for_training = encoded_labels[train_idx]
            val_imgs_for_training = [train_images[i] for i in val_idx]
            val_labels_for_training = encoded_labels[val_idx]
        else:
            print("Using provided validation data")
            train_imgs_for_training = train_images
            train_labels_for_training = encoded_labels
            val_imgs_for_training = val_images
            val_labels_for_training = self.label_encoder.transform(val_labels)

        # Create datasets
        try:
            train_dataset = AgriculturalDataset(
                train_imgs_for_training, train_labels_for_training, get_transforms('train')
            )
            # The classifier head contains BatchNorm1d, which cannot process a
            # single-sample batch in training mode. Drop the trailing batch
            # only in the one case where it would hold exactly one sample.
            trailing_singleton = (
                len(train_dataset) > 1 and len(train_dataset) % Config.BATCH_SIZE == 1
            )
            train_loader = DataLoader(
                train_dataset, batch_size=Config.BATCH_SIZE,
                shuffle=True, num_workers=0, pin_memory=False,
                drop_last=trailing_singleton
            )

            val_dataset, val_loader = None, None
            if val_imgs_for_training is not None:
                val_dataset = AgriculturalDataset(
                    val_imgs_for_training, val_labels_for_training, get_transforms('val')
                )
                val_loader = DataLoader(
                    val_dataset, batch_size=Config.BATCH_SIZE,
                    shuffle=False, num_workers=0, pin_memory=False
                )
        except Exception as e:
            print(f"Failed to create data loaders: {e}")
            raise

        # Initialize model
        self.cnn_model = ResNetClassifier(len(self.class_names)).to(device)

        # Balanced class weights from the labels actually trained on. The
        # vector always has one entry per encoded class; a class absent from
        # the training subset keeps a neutral weight of 1 so the tensor length
        # still matches the number of model outputs.
        present_classes = np.unique(train_labels_for_training)
        balanced_weights = compute_class_weight(
            'balanced',
            classes=present_classes,
            y=train_labels_for_training
        )
        weight_vector = np.ones(len(self.class_names), dtype=np.float32)
        weight_vector[present_classes] = balanced_weights
        class_weights = torch.tensor(weight_vector, dtype=torch.float32).to(device)

        # Focal loss with class weights
        criterion = FocalLoss(alpha=1, gamma=2, weight=class_weights)
        optimizer = optim.AdamW(
            self.cnn_model.parameters(),
            lr=Config.LEARNING_RATE,
            weight_decay=1e-4
        )

        scheduler = optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=Config.LEARNING_RATE,
            steps_per_epoch=len(train_loader),
            epochs=Config.EPOCHS,
            pct_start=0.3,
            div_factor=25,
            final_div_factor=10000
        )

        # Check for existing checkpoint
        checkpoint_path = os.path.join(Config.CHECKPOINT_DIR, f'fold_{fold}_latest.pth')
        start_epoch, start_batch, best_val_acc = 0, 0, 0.0

        if Config.RESUME_FROM_CHECKPOINT:
            checkpoint_info = load_checkpoint(checkpoint_path, self.cnn_model, optimizer, scheduler)
            if checkpoint_info:
                start_epoch = checkpoint_info['epoch']
                start_batch = checkpoint_info.get('batch_idx', 0)
                print(f"Resuming from epoch {start_epoch}, batch {start_batch}")

        train_losses, val_accuracies = [], []
        no_improvement_count = 0

        # Defined before the try block so the emergency save below can always
        # read them, even if the very first epoch fails.
        epoch = start_epoch
        epoch_loss = 0.0

        try:
            for epoch in range(start_epoch, Config.EPOCHS):
                print(f"Starting epoch {epoch+1}/{Config.EPOCHS}")
                monitor_memory()

                # Training phase
                self.cnn_model.train()
                running_loss, correct, total = 0.0, 0.0, 0
                batches_run = 0
                epoch_start_time = time.time()

                # Only the first epoch after a resume skips finished batches.
                batch_start = start_batch if epoch == start_epoch else 0

                for batch_idx, (images, labels) in enumerate(train_loader):
                    if batch_idx < batch_start:
                        continue

                    images, labels = images.to(device), labels.to(device)
                    optimizer.zero_grad()

                    # Apply mixup 40% of the time
                    if np.random.rand() < 0.4:
                        images, labels_a, labels_b, lam = mixup_data(images, labels, alpha=0.3)
                        outputs, _ = self.cnn_model(images)
                        loss = mixup_criterion(criterion, outputs, labels_a, labels_b, lam)
                        _, predicted = outputs.max(1)
                        # Mixed samples have two targets; credit each by its
                        # mixing weight.
                        batch_correct = (lam * predicted.eq(labels_a).sum().item() +
                                         (1 - lam) * predicted.eq(labels_b).sum().item())
                    else:
                        outputs, _ = self.cnn_model(images)
                        loss = criterion(outputs, labels)
                        _, predicted = outputs.max(1)
                        batch_correct = predicted.eq(labels).sum().item()

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.cnn_model.parameters(), max_norm=1.0)
                    optimizer.step()
                    # OneCycleLR is sized as epochs * steps_per_epoch, so it
                    # has to advance once per batch, not once per epoch.
                    scheduler.step()

                    # Each batch is counted exactly once.
                    running_loss += loss.item()
                    total += labels.size(0)
                    correct += batch_correct
                    batches_run += 1

                epoch_loss = running_loss / max(batches_run, 1)
                train_acc = 100. * correct / total if total > 0 else 0.0
                train_losses.append(epoch_loss)

                # Validation phase
                val_acc = 0.0
                stop_early = False
                if val_loader is not None:
                    val_acc = self._validate(val_loader) * 100
                    val_accuracies.append(val_acc)

                    if val_acc > best_val_acc:
                        best_val_acc, no_improvement_count = val_acc, 0
                        best_model_path = os.path.join(Config.CHECKPOINT_DIR, f'best_model_fold_{fold}.pth')
                        torch.save(self.cnn_model.state_dict(), best_model_path)
                        print(f"New best validation accuracy: {val_acc:.2f}%")
                    else:
                        no_improvement_count += 1
                        stop_early = no_improvement_count >= Config.PATIENCE

                # Log epoch summary
                epoch_time = time.time() - epoch_start_time
                print(
                    f'Fold {fold}, Epoch [{epoch+1}/{Config.EPOCHS}] '
                    f'Loss: {epoch_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%, '
                    f'Time: {epoch_time:.1f}s'
                )

                # Save checkpoint
                save_checkpoint(
                    self.cnn_model, optimizer, scheduler,
                    epoch+1, fold, 0, epoch_loss, checkpoint_path
                )
                cleanup_memory()

                if stop_early:
                    print(f"Early stopping at epoch {epoch+1}")
                    break

        except Exception as e:
            print(f"Training failed: {e}")
            emergency_path = os.path.join(Config.CHECKPOINT_DIR, f'emergency_fold_{fold}.pth')
            try:
                save_checkpoint(
                    self.cnn_model, optimizer, scheduler,
                    epoch, fold, 0, epoch_loss,
                    emergency_path
                )
            except Exception as save_error:
                # Best effort only: report it, then surface the original error.
                print(f"Emergency checkpoint failed: {save_error}")
            raise

        return train_losses, val_accuracies


    def _validate(self, val_loader):
        """Validate model with error handling"""
        self.cnn_model.eval()
        correct = 0
        total = 0

        try:
            with torch.no_grad():
                for batch_idx, (images, labels) in enumerate(val_loader):
                    try:
                        images, labels = images.to(device), labels.to(device)
                        outputs, _ = self.cnn_model(images)
                        _, predicted = torch.max(outputs.data, 1)
                        total += labels.size(0)
                        correct += (predicted == labels).sum().item()

                        # Memory cleanup every few batches
                        if batch_idx % 10 == 0:
                            cleanup_memory()

                    except Exception as e:
                        print(f"Validation batch {batch_idx} failed: {e}")
                        continue

            accuracy = correct / total if total > 0 else 0.0
            return accuracy

        except Exception as e:
            print(f"Validation failed: {e}")
            return 0.0

    def train_xgboost(self, features, labels):
        """Fit an XGBoost classifier on ``features``/``labels`` and keep it in ``self.xgb_model``.

        Any failure is printed and re-raised.
        """
        print("Training XGBoost model...")

        try:
            booster_params = dict(
                n_estimators=200,
                max_depth=6,
                learning_rate=0.1,
                subsample=0.8,
                colsample_bytree=0.8,
                random_state=Config.SEED,
                n_jobs=-1,
            )
            # Assigned before fitting, so the attribute is set even when
            # fit() raises.
            self.xgb_model = xgb.XGBClassifier(**booster_params)
            self.xgb_model.fit(features, labels)
        except Exception as e:
            print(f"XGBoost training failed: {e}")
            raise
        else:
            print("XGBoost training completed")

    def predict(self, test_images, tta_transforms=None):
        """Predict a class for every test image using test-time augmentation.

        Each image is passed through every TTA transform and the softmax
        probabilities are averaged. When an XGBoost model is present, its
        probabilities (computed from CNN features of the plain, un-augmented
        view) are blended in as 70% CNN / 30% XGBoost.

        The output always has exactly one row per entry of ``test_images``, in
        the same order, so it can be zipped with the test ids. An image that
        cannot be read is replaced by a black image rather than skipped;
        skipping would shift every later prediction onto the wrong id.

        Args:
            test_images: Sequence of image file paths.
            tta_transforms: Optional list of albumentations pipelines; the
                default is six views (plain, h-flip, v-flip, transpose,
                brightness/contrast jitter, h+v flip).

        Returns:
            ``(predicted_classes, probabilities)``: labels decoded with
            ``self.label_encoder`` and an array of per-class probabilities.

        Raises:
            ValueError: If ``tta_transforms`` is an empty list.
            Exception: Any failure is printed and re-raised.
        """
        print("Making predictions with TTA...")

        def tta_view(*extra_ops):
            """Build resize -> extra_ops -> normalize -> tensor."""
            return A.Compose([
                A.Resize(Config.IMG_SIZE, Config.IMG_SIZE),
                *extra_ops,
                A.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
                ToTensorV2()
            ])

        def load_rgb_or_blank(img_path):
            """Read an image as RGB; return a black image when it cannot be read."""
            blank = np.zeros((Config.IMG_SIZE, Config.IMG_SIZE, 3), dtype=np.uint8)
            try:
                if not os.path.exists(img_path):
                    print(f"Image not found: {img_path}")
                    return blank

                image = cv2.imread(img_path)
                if image is None:
                    print(f"Failed to load image: {img_path}")
                    return blank
                return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            except Exception as e:
                print(f"Error loading image {img_path}: {e}")
                return blank

        if tta_transforms is None:
            # Enhanced TTA with 6 augmentations
            tta_transforms = [
                get_transforms('val'),  # original
                tta_view(A.HorizontalFlip(p=1.0)),
                tta_view(A.VerticalFlip(p=1.0)),
                tta_view(A.Transpose(p=1.0)),
                tta_view(A.RandomBrightnessContrast(brightness_limit=0.1, contrast_limit=0.1, p=1.0)),
                tta_view(A.HorizontalFlip(p=1.0), A.VerticalFlip(p=1.0)),
            ]

        try:
            if len(tta_transforms) == 0:
                raise ValueError("tta_transforms must contain at least one transform")

            self.cnn_model.eval()
            final_predictions = []
            all_probs = []

            # XGBoost features are taken from the plain view, not from
            # whichever TTA view happened to run last.
            # NOTE(review): presumably this matches how the XGBoost training
            # features were extracted — confirm against the train_xgboost caller.
            feature_transform = get_transforms('val') if self.xgb_model is not None else None

            for idx, img_path in enumerate(test_images):
                image = load_rgb_or_blank(img_path)

                # Apply each TTA transform
                tta_probs = []
                for tform in tta_transforms:
                    img_tensor = tform(image=image)['image'].unsqueeze(0).to(device)  # add batch dim
                    with torch.no_grad():
                        outputs, _ = self.cnn_model(img_tensor)
                    tta_probs.append(torch.softmax(outputs, dim=1).cpu().numpy()[0])

                # Average probabilities across TTA
                avg_probs = np.mean(tta_probs, axis=0)

                # XGBoost predictions if model exists
                if self.xgb_model is not None:
                    plain_tensor = feature_transform(image=image)['image'].unsqueeze(0).to(device)
                    with torch.no_grad():
                        _, features = self.cnn_model(plain_tensor)
                    xgb_prob = self.xgb_model.predict_proba(features.cpu().numpy())
                    # Weighted combination: CNN 70%, XGBoost 30%
                    combined_probs = 0.7 * avg_probs + 0.3 * xgb_prob[0]
                else:
                    combined_probs = avg_probs

                all_probs.append(combined_probs)
                final_predictions.append(np.argmax(combined_probs))

                if idx % 50 == 0:
                    print(f"Processed {idx+1}/{len(test_images)} images")

            # Convert back to class names
            predicted_classes = self.label_encoder.inverse_transform(final_predictions)
            return predicted_classes, np.array(all_probs)

        except Exception as e:
            print(f"Prediction failed: {e}")
            raise


In [None]:
def perform_cross_validation(train_images, train_labels):
    """Run stratified K-fold training and return the per-fold validation accuracies.

    Progress is written to ``cv_state.json`` in the checkpoint directory after
    every fold. With ``Config.RESUME_FROM_CHECKPOINT`` enabled, folds recorded
    there are skipped. A failing fold stops the loop; the scores gathered so
    far are still returned.

    Args:
        train_images: Sequence of training image paths.
        train_labels: Raw labels aligned with ``train_images``.

    Returns:
        List of validation accuracies (fractions), one per completed fold.
    """
    print("Starting robust cross-validation...")

    # File that records which folds are done and their scores.
    cv_state_file = os.path.join(Config.CHECKPOINT_DIR, 'cv_state.json')

    cv_scores, completed_folds = [], []

    # Restore earlier progress when resuming is enabled.
    if os.path.exists(cv_state_file) and Config.RESUME_FROM_CHECKPOINT:
        try:
            with open(cv_state_file, 'r') as state_fh:
                saved_state = json.load(state_fh)
                cv_scores = saved_state.get('scores', [])
                completed_folds = saved_state.get('completed_folds', [])
            print(f"Resuming CV from fold {len(completed_folds) + 1}")
        except Exception as e:
            print(f"Could not load CV state: {e}")

    # A throwaway classifier supplies the label encoder used for stratification.
    classifier = AgriculturalClassifier()
    encoded_labels = classifier.label_encoder.fit_transform(train_labels)

    splitter = StratifiedKFold(n_splits=Config.NUM_FOLDS, shuffle=True, random_state=Config.SEED)
    fold_splits = splitter.split(train_images, encoded_labels)

    for fold, (train_idx, val_idx) in enumerate(fold_splits):
        fold_number = fold + 1

        if fold in completed_folds:
            print(f"Fold {fold_number} already completed, skipping")
            continue

        print(f"Starting Fold {fold_number}/{Config.NUM_FOLDS}")

        try:
            # Slice out this fold's data.
            fold_train_images = [train_images[i] for i in train_idx]
            fold_train_labels = [train_labels[i] for i in train_idx]
            fold_val_images = [train_images[i] for i in val_idx]
            fold_val_labels = [train_labels[i] for i in val_idx]

            # Each fold trains its own model from scratch.
            fold_classifier = AgriculturalClassifier()
            fold_classifier.train_cnn(
                fold_train_images, fold_train_labels,
                fold_val_images, fold_val_labels, fold=fold
            )

            # Score the trained model on the held-out part.
            encoded_val_labels = fold_classifier.label_encoder.transform(fold_val_labels)
            val_dataset = AgriculturalDataset(
                fold_val_images, encoded_val_labels, get_transforms('val')
            )
            val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE,
                                    shuffle=False, num_workers=0)

            val_acc = fold_classifier._validate(val_loader)
            cv_scores.append(val_acc)
            completed_folds.append(fold)

            print(f"Fold {fold_number} completed - Validation Accuracy: {val_acc:.4f}")

            # Persist progress so a restart can continue after this fold.
            with open(cv_state_file, 'w') as state_fh:
                json.dump({
                    'scores': cv_scores,
                    'completed_folds': completed_folds,
                    'timestamp': datetime.now().isoformat()
                }, state_fh)

            # Release the fold's model before the next one is built.
            del fold_classifier
            cleanup_memory()

        except Exception as e:
            print(f"Fold {fold_number} failed: {e}")
            print("You can restart the script to resume from the next fold")
            break

    if not cv_scores:
        print("No CV scores available")
    else:
        print(f"Cross-validation scores: {cv_scores}")
        print(f"Mean CV Score: {np.mean(cv_scores):.4f} (+/- {np.std(cv_scores) * 2:.4f})")

    return cv_scores

In [None]:
def analyze_data_distribution(train_labels):
    """Print and plot how many samples each class has.

    The bar chart is saved as ``class_distribution.png`` in the checkpoint
    directory. Plotting problems are reported but never raised.

    Args:
        train_labels: Iterable of class labels.

    Returns:
        A ``Counter`` mapping each label to its count.
    """
    print("Analyzing data distribution...")

    class_counts = Counter(train_labels)

    print("\nClass distribution:")
    for entry in sorted(class_counts.items()):
        print(f"{entry[0]}: {entry[1]}")

    # Plotting is best effort: the counts are returned either way.
    try:
        plt.figure(figsize=(15, 8))
        bar_labels = list(class_counts.keys())
        bar_heights = list(class_counts.values())
        plt.bar(bar_labels, bar_heights)

        plt.title('Class Distribution in Training Data')
        plt.xlabel('Classes')
        plt.ylabel('Number of Images')
        plt.xticks(rotation=45)
        plt.tight_layout()

        plot_path = os.path.join(Config.CHECKPOINT_DIR, 'class_distribution.png')
        plt.savefig(plot_path)
        plt.show()
        print(f"Class distribution plot saved to {plot_path}")
    except Exception as e:
        print(f"Could not create distribution plot: {e}")

    return class_counts

In [None]:
def create_submission(test_ids, predictions, filename='submission.csv'):
    """Write an ``ID``/``TARGET`` CSV and return it as a DataFrame.

    Args:
        test_ids: Identifier for each test sample.
        predictions: Predicted label for each test sample, aligned with ``test_ids``.
        filename: Destination CSV path.

    Returns:
        The DataFrame that was written.

    Raises:
        Exception: Any failure is printed and re-raised.
    """
    try:
        columns = {'ID': test_ids, 'TARGET': predictions}
        submission_df = pd.DataFrame(columns)
        submission_df.to_csv(filename, index=False)
    except Exception as e:
        print(f"Failed to create submission: {e}")
        raise
    else:
        print(f"Submission file saved as {filename}")
        return submission_df

In [None]:
def _extract_filename_prefix(base_name):
    """Return the candidate class prefix of an image base name, or None if there is none."""
    import re  # kept local so this helper does not depend on a top-level import

    if '_' in base_name:
        return base_name.split('_')[0]
    if '-' in base_name:
        return base_name.split('-')[0]
    # No separator: fall back to the leading alphabetic run
    match = re.match(r'^([A-Za-z]+)', base_name)
    return match.group(1) if match else None


def create_sample_labels_file(
    train_dir="/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train/train",
    train_csv_path="/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train.csv",
    output_path="train.csv",
):
    """Check for the competition's train.csv, or synthesize a sample labels file.

    This function is not needed for Kaggle competitions - they provide train.csv.
    When ``train_csv_path`` exists it is only read and summarized. Otherwise a
    SAMPLE labels file is generated from the image file names in ``train_dir``:
    classes are inferred from file-name prefixes when possible, else generic
    ``CLASS_<n>`` labels are assigned round-robin.

    Args:
        train_dir: Directory containing the training images.
        train_csv_path: Location of the real labels file to look for first.
        output_path: Where the generated sample file is written. Defaults to
            ``train.csv`` in the current working directory, because the input
            dataset directory is read-only.

    Returns:
        True if the real labels file was read successfully or a sample file was
        written; False on any failure (errors are printed, not raised).
    """
    # First check if the actual train.csv already exists
    if os.path.exists(train_csv_path):
        print("train.csv already exists! Using the actual labels from the dataset.")
        try:
            train_df = pd.read_csv(train_csv_path)
            print(f"Found {len(train_df)} labeled images")
            print(f"Classes in dataset: {sorted(train_df['TARGET'].unique())}")
            print(f"Number of classes: {train_df['TARGET'].nunique()}")
            return True
        except Exception as e:
            print(f"Error reading existing train.csv: {e}")
            return False

    # If for some reason train.csv doesn't exist, create a dynamic version
    print("train.csv not found. This is unusual for a Kaggle competition.")
    print("Creating a sample version, but you should use the actual train.csv from the dataset.")

    if not os.path.exists(train_dir):
        print("Train directory not found!")
        return False

    try:
        # Get all image files
        image_files = [
            img_name for img_name in os.listdir(train_dir)
            if img_name.lower().endswith(('.jpg', '.jpeg', '.png'))
        ]

        if not image_files:
            print("No image files found in train directory!")
            return False

        # Try to infer classes from filename patterns instead of hardcoding
        print("Attempting to infer classes from filename patterns...")

        prefixes = set()
        for img_name in image_files[:100]:  # Sample first 100 files
            prefix = _extract_filename_prefix(os.path.splitext(img_name)[0])
            # Skip empty prefixes (e.g. names starting with '_'): an empty
            # string would match every file name below.
            if prefix:
                prefixes.add(prefix)

        # Single source of truth for "inference succeeded", reused when
        # assigning labels so both decisions always agree.
        inferred_from_names = 1 < len(prefixes) < 50

        if inferred_from_names:
            sample_classes = sorted(prefixes)
            print(f"Inferred {len(sample_classes)} classes from filenames: {sample_classes}")
        else:
            # Fallback: Generate generic class names based on estimated number
            estimated_classes = max(3, min(20, len(image_files) // 100))  # Estimate based on dataset size
            sample_classes = [f"CLASS_{i+1}" for i in range(estimated_classes)]
            print(f"Could not infer classes from filenames. Using {estimated_classes} generic classes.")

        # Assign labels: filename match when classes were inferred, otherwise
        # (or when nothing matches) round-robin over the class list.
        labels_data = []
        for index, img_name in enumerate(sorted(image_files)):
            assigned_class = None
            if inferred_from_names:
                base_name = os.path.splitext(img_name)[0]
                assigned_class = next(
                    (prefix for prefix in sample_classes if prefix in base_name),
                    None,
                )

            if not assigned_class:
                assigned_class = sample_classes[index % len(sample_classes)]

            labels_data.append({'ID': img_name, 'TARGET': assigned_class})

        # Save to the writable output location (not the read-only input directory)
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        labels_df = pd.DataFrame(labels_data)
        labels_df.to_csv(output_path, index=False)

        # Show class distribution
        class_counts = labels_df['TARGET'].value_counts()
        print(f"Created train.csv with {len(labels_data)} entries")
        print(f"Saved sample labels to {output_path}")
        print(f"Number of classes: {len(sample_classes)}")
        print("Class distribution:")
        for class_name, count in class_counts.items():
            print(f"  {class_name}: {count}")

        print("WARNING: This is a SAMPLE file created because train.csv was missing!")
        print("For the actual competition, use the provided train.csv from the dataset!")

        return True

    except Exception as e:
        print(f"Failed to create sample labels file: {e}")
        return False

In [None]:
def resume_training_from_checkpoint():
    """Interactively pick a saved ``.pth`` checkpoint and switch on resume mode.

    Lists the ``.pth`` files in ``Config.CHECKPOINT_DIR``, asks the user for a
    list index or the word ``latest`` (most recently modified file), and sets
    ``Config.RESUME_FROM_CHECKPOINT = True`` on a valid choice.

    Returns:
        True when a valid checkpoint was chosen; False when the directory or
        checkpoints are missing, or the input is not a valid index.
    """
    checkpoint_dir = Config.CHECKPOINT_DIR

    if not os.path.exists(checkpoint_dir):
        print("No checkpoint directory found")
        return False

    # List available checkpoints. Sorted because os.listdir order is arbitrary
    # and the numbers shown to the user should be stable between runs.
    checkpoints = sorted(f for f in os.listdir(checkpoint_dir) if f.endswith('.pth'))

    if not checkpoints:
        print("No checkpoints found")
        return False

    print("\nAvailable checkpoints:")
    for i, ckpt in enumerate(checkpoints):
        print(f"{i}: {ckpt}")

    try:
        choice = input("Enter checkpoint number to resume from (or 'latest'): ").strip()

        if choice.lower() == 'latest':
            # Find latest checkpoint by modification time
            chosen_checkpoint = max(
                checkpoints,
                key=lambda name: os.path.getmtime(os.path.join(checkpoint_dir, name)),
            )
        else:
            index = int(choice)
            # Python would accept -1 as "last item"; only the listed numbers are valid.
            if index < 0:
                raise IndexError(f"checkpoint index out of range: {index}")
            chosen_checkpoint = checkpoints[index]

        print(f"Will resume from: {chosen_checkpoint}")
        # NOTE(review): only the resume flag is set here; the chosen file name is
        # not stored anywhere in this function - confirm how the training code
        # decides which checkpoint to load.
        Config.RESUME_FROM_CHECKPOINT = True

        return True

    except (ValueError, IndexError):
        print("Invalid choice")
        return False

In [None]:
def validate_setup(required_dirs=None, required_files=None):
    """Check that the dataset directories and labels file are present.

    Args:
        required_dirs: Directories that must exist. Defaults to the competition's
            ``train/train`` and ``test/test`` image folders.
        required_files: Files that must exist. Defaults to the competition's
            ``train.csv``.

    Returns:
        True when every required directory and file exists; otherwise prints
        what is missing and returns False.
    """
    if required_dirs is None:
        required_dirs = ["/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train/train", "/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/test/test"]
    if required_files is None:
        required_files = ["/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train.csv"]

    # isdir/isfile rather than exists: a file sitting where a directory is
    # expected (or vice versa) must count as missing.
    missing_dirs = [d for d in required_dirs if not os.path.isdir(d)]
    missing_files = [f for f in required_files if not os.path.isfile(f)]

    if not (missing_dirs or missing_files):
        return True

    print("Setup validation failed!")

    if missing_dirs:
        print(f"Missing directories: {missing_dirs}")

    if missing_files:
        print(f"Missing files: {missing_files}")
        # Entries are full paths, so compare on the file name only.
        if any(os.path.basename(f) == "train.csv" for f in missing_files):
            print("You can create a sample train.csv file using create_sample_labels_file()")

    return False

In [None]:
def monitor_memory():
    """Return current system memory usage as a percentage.

    Returns:
        The ``percent`` field of ``psutil.virtual_memory()``, or 50.0 as a
        neutral fallback when the query fails for any reason.
    """
    try:
        return psutil.virtual_memory().percent
    except Exception:
        # Not a bare ``except``: KeyboardInterrupt / SystemExit must still
        # propagate so a long training run can be interrupted.
        return 50.0  # Default fallback

def load_checkpoint(checkpoint_path, model, optimizer, scheduler):
    """Restore model, optimizer and (optionally) scheduler state from a checkpoint file.

    Args:
        checkpoint_path: Path of the checkpoint to read.
        model: Object whose ``load_state_dict`` receives ``model_state_dict``.
        optimizer: Object whose ``load_state_dict`` receives ``optimizer_state_dict``.
        scheduler: Optional scheduler; restored only when it is truthy and the
            checkpoint holds a non-empty ``scheduler_state_dict``.

    Returns:
        The loaded checkpoint object, or None when the file does not exist or
        loading fails (the error is printed).
    """
    if not os.path.exists(checkpoint_path):
        return None

    try:
        # NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
        state = torch.load(checkpoint_path, map_location=device)
        model.load_state_dict(state['model_state_dict'])
        optimizer.load_state_dict(state['optimizer_state_dict'])
        if scheduler:
            if state.get('scheduler_state_dict'):
                scheduler.load_state_dict(state['scheduler_state_dict'])
    except Exception as err:
        print(f"Failed to load checkpoint: {err}")
        return None

    return state

In [None]:
def quick_train_and_predict():
    """Train one CNN per fold, fit a shared XGBoost model, and write a submission.

    Steps performed, in order:
      1. Load images/labels through ``load_data`` from the hard-coded Kaggle paths.
      2. For each of ``Config.N_SPLITS`` folds, train an ``AgriculturalClassifier``
         with ``train_cnn`` and reload ``best_model_fold_{fold}.pth`` if present.
      3. Extract training-set features from every fold model with
         ``extract_features`` and average them across folds.
      4. Fit a single ``xgb.XGBClassifier`` on the averaged features and attach
         it to every fold classifier.
      5. Predict the test set with each fold classifier and combine the
         probabilities with per-fold weights read from ``best_acc_fold_{i}.txt``.
      6. Save the result via ``create_submission``.

    Returns:
        ``(fold_classifiers[0], submission_df)`` on success, or ``(None, None)``
        when required inputs are missing, no training images were loaded, or
        any exception occurs (the traceback is printed).
    """
    print("Agricultural Multi-Class Classifier - Maximum Performance Training")
    print("="*70)

    try:
        TRAIN_DIR = "/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train/train"
        TEST_DIR = "/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/test/test"
        LABELS_FILE = "/kaggle/input/rice-pistachio-and-grapevine-leaf-classification/train.csv"

        # Only the training inputs are checked here; TEST_DIR is not verified.
        if not os.path.exists(TRAIN_DIR) or not os.path.exists(LABELS_FILE):
            print(f"Required files not found!")
            return None, None

        # Load data
        train_images, train_labels, test_images, test_ids = load_data(
            TRAIN_DIR, TEST_DIR, LABELS_FILE
        )

        print(f"Training images: {len(train_images)}")
        print(f"Test images: {len(test_images)}")
        print(f"Number of classes: {len(set(train_labels))}")

        if len(train_images) == 0:
            return None, None

        # Train all folds
        fold_classifiers = []

        for fold in range(Config.N_SPLITS):
            print(f"=== Training fold {fold + 1}/{Config.N_SPLITS} ===")
            classifier = AgriculturalClassifier()
            # The returned loss/accuracy histories are not used further in this function.
            train_losses, val_accuracies = classifier.train_cnn(train_images, train_labels, fold=fold)

            # Load best model
            best_model_path = os.path.join(Config.CHECKPOINT_DIR, f'best_model_fold_{fold}.pth')
            if os.path.exists(best_model_path):
                # NOTE(review): assumes the file holds a bare state_dict - confirm against the saver.
                classifier.cnn_model.load_state_dict(torch.load(best_model_path, map_location=device))

            fold_classifiers.append(classifier)
            cleanup_memory()

        # Extract features for XGBoost
        print("Extracting features for XGBoost training...")
        all_features = []

        for fold_idx, classifier in enumerate(fold_classifiers):
            print(f"Extracting features from fold {fold_idx + 1}")

            # Labels are passed as None; the 'val' transforms are used for extraction.
            feature_dataset = AgriculturalDataset(train_images, None, get_transforms('val'))
            feature_loader = DataLoader(feature_dataset, batch_size=32, shuffle=False, num_workers=0)

            features, _ = extract_features(classifier.cnn_model, feature_loader, device)
            all_features.append(features)
            cleanup_memory()

        # Train XGBoost on ensemble features
        print("Training XGBoost ensemble...")
        # Element-wise mean over the per-fold feature arrays.
        ensemble_features = np.mean(all_features, axis=0)
        # The first fold's label encoder is used for the whole ensemble.
        encoded_labels = fold_classifiers[0].label_encoder.transform(train_labels)

        # Enhanced XGBoost
        xgb_model = xgb.XGBClassifier(
            n_estimators=300,
            max_depth=8,
            learning_rate=0.08,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=Config.SEED,
            n_jobs=-1,
            eval_metric='mlogloss'
        )
        xgb_model.fit(ensemble_features, encoded_labels)

        # Assign XGBoost to all classifiers
        for classifier in fold_classifiers:
            classifier.xgb_model = xgb_model

        print("Making ensemble predictions with CNN + XGBoost...")
        all_fold_predictions = []
        all_fold_probs = []

        # Get fold weights based on validation performance
        fold_weights = []
        for fold_idx in range(Config.N_SPLITS):
            acc_file = os.path.join(Config.CHECKPOINT_DIR, f'best_acc_fold_{fold_idx}.txt')
            if os.path.exists(acc_file):
                with open(acc_file, 'r') as f:
                    weight = float(f.read().strip())
            else:
                # No recorded accuracy for this fold: fall back to a fixed weight.
                weight = 0.9
            fold_weights.append(weight)

        # Normalize so the weights sum to 1.
        fold_weights = np.array(fold_weights)
        fold_weights = fold_weights / fold_weights.sum()

        for fold_idx, classifier in enumerate(fold_classifiers):
            print(f"Getting predictions from fold {fold_idx + 1} (weight: {fold_weights[fold_idx]:.3f})")
            fold_predictions, fold_probs = classifier.predict(test_images)
            # Per-fold hard predictions are collected but only the probabilities are combined below.
            all_fold_predictions.append(fold_predictions)
            all_fold_probs.append(fold_probs)

        # Weighted ensemble
        weighted_probs = np.average(all_fold_probs, axis=0, weights=fold_weights)
        # Highest weighted probability per row, mapped back to the original label values.
        ensemble_predictions = fold_classifiers[0].label_encoder.inverse_transform(
            np.argmax(weighted_probs, axis=1)
        )

        submission_df = create_submission(test_ids, ensemble_predictions)

        return fold_classifiers[0], submission_df

    except Exception as e:
        print(f"Training failed: {e}")
        import traceback
        traceback.print_exc()
        return None, None

In [None]:
def main():
    """Main function with setup and options.

    Prints the feature banner, verifies that the competition's ``train/train``
    and ``test/test`` directories exist, then runs ``quick_train_and_predict``
    and prints a short summary of the resulting submission.

    Returns:
        None. Returns early (after printing a diagnostic) when a required
        directory is missing.
    """

    print("Enhanced Agricultural Classifier with Cross-Validation Training")
    print("="*70)
    print("Features:")
    print("- Automatic checkpointing every epoch and every 50 batches")
    print("- Memory monitoring and automatic cleanup")
    print("- Automatic batch size reduction on OOM errors")
    print("- Resume training from any checkpoint")
    print("- Early stopping to prevent overfitting")
    print("- Individual fold recovery in cross-validation")
    print("- Comprehensive logging to file and console")
    print("- Data validation and error handling")
    print("="*70)

    data_root = "/kaggle/input/rice-pistachio-and-grapevine-leaf-classification"
    train_parent_dir = data_root + "/train"
    train_dir = train_parent_dir + "/train"
    test_dir = data_root + "/test/test"

    # Check directory structure
    if not os.path.exists(train_dir):
        print("Train directory 'train/train' not found!")
        # Look at the parent folder: if it exists, the images are probably
        # just not nested one level deeper as expected.
        if os.path.exists(train_parent_dir):
            print("Found 'train' directory. Please check the structure:")
            print("Expected: train/train/ (with image files)")
        return

    if not os.path.exists(test_dir):
        print("Test directory 'test' not found!")
        print("Please create test directory with test images")
        return


    print("DEBUG: Starting quick_train_and_predict()...")

    classifier, submission = quick_train_and_predict()

    if classifier is not None and submission is not None:
        print("Training pipeline completed successfully!")
        print(f"Submission file created: submission.csv")
        print(f"All checkpoints saved in: {Config.CHECKPOINT_DIR}/")
        print(f"Training logs available in the log file")

        # Display submission summary
        print(f"Submission Summary:")
        print(f"Total predictions: {len(submission)}")
        print(f"Unique classes predicted: {submission['TARGET'].nunique()}")
        print(f"Most predicted class: {submission['TARGET'].mode().iloc[0]}")

    else:
        print("Training pipeline failed or was interrupted")

In [None]:
# Entry point: run the full pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Enhanced Agricultural Classifier with Cross-Validation Training
Features:
- Automatic checkpointing every epoch and every 50 batches
- Memory monitoring and automatic cleanup
- Automatic batch size reduction on OOM errors
- Resume training from any checkpoint
- Early stopping to prevent overfitting
- Individual fold recovery in cross-validation
- Comprehensive logging to file and console
- Data validation and error handling
DEBUG: Starting quick_train_and_predict()...
Agricultural Multi-Class Classifier - Maximum Performance Training
Loading training data...
Loaded 6400 training images with labels
Loading test data...
Loaded 1600 test images
Training images: 6400
Test images: 1600
Number of classes: 20
=== Training fold 1/4 ===
Starting robust CNN training for fold 0
Found 20 classes: ['AK' 'ALA_IDRIS' 'ARBORIO' 'BASMATI' 'BD30' 'BD72' 'BD95' 'BINADHAN16'
 'BINADHAN25' 'BINADHAN7' 'BR22' 'BRRI67' 'BUZGULU' 'DIMNIT' 'IPSALA'
 'JASMINE' 'KARACADAG' 'KIRMIZI' 'NAZLI' 'SIIRT']
Using Strati