In [12]:
import os
import numpy as np
import joblib
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.transform import resize
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import MinMaxScaler
from sklearn.calibration import CalibratedClassifierCV

# Define paths
input_path = 'C:/Users/Victor/OneDrive/Documents/hackthorn/dataset'
categories = ['organic', 'recyclable']
batch_size = 50

# Initialize model and scaler
base_model = SGDClassifier(loss="hinge")  # Base SVM model (no probability)
model = CalibratedClassifierCV(base_model, cv=3)  # Enables probability estimation
scaler = MinMaxScaler()  # Normalize pixel values (better for images)
is_first_batch = True

def load_images_in_batches():
    """Generator to yield images in small batches."""
    data, labels = [], []
    
    for category_index, category in enumerate(categories):
        category_path = os.path.join(input_path, category)
        if not os.path.exists(category_path):
            print(f"⚠️ Warning: {category_path} does not exist!")
            continue

        files = os.listdir(category_path)[:400]  # Limit per category
        for file in files:
            img_path = os.path.join(category_path, file)
            try:
                # Load and preprocess image
                image = imread(img_path).astype(np.float32)
                if len(image.shape) == 3:  # Convert to grayscale if RGB
                    image = resize(image, (256, 256))
                    image = rgb2gray(image)
                
                data.append(image.flatten())  # Flatten image
                labels.append(category_index)

                # Yield batch
                if len(data) >= batch_size:
                    X_batch, y_batch = np.array(data), np.array(labels)
                    unique_classes = np.unique(y_batch)

                    if len(unique_classes) > 1:  # Ensure batch has both classes
                        yield X_batch, y_batch
                    else:
                        print("⚠️ Skipping batch with only one class.")

                    data, labels = [], []  # Reset batch

            except Exception as e:
                print(f"❌ Error processing {img_path}: {e}")

    # Yield last batch if not empty
    if data:
        X_batch, y_batch = np.array(data), np.array(labels)
        if len(np.unique(y_batch)) > 1:
            yield X_batch, y_batch
        else:
            print("⚠️ Skipping last batch with only one class.")

# Train model incrementally
batch_count = 0
for X_batch, y_batch in load_images_in_batches():
    X_batch = scaler.fit_transform(X_batch)  # Normalize data

    if is_first_batch:
        model.fit(X_batch, y_batch)  # First batch: Train from scratch
        is_first_batch = False
    else:
        model.base_estimator.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))  # Incremental training

    batch_count += 1
    print(f"✅ Processed batch {batch_count}")

# Save model and scaler
joblib.dump(model, "svm_incremental.pkl")
joblib.dump(scaler, "scaler.pkl")
print("\n🎉 Model trained in batches and saved as 'svm_incremental.pkl'.")



🎉 Model trained in batches and saved as 'svm_incremental.pkl'.
