## CNN solution

In [1]:
import os
import cv2
import numpy as np
import random
from tqdm.notebook import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, random_split

from sklearn.model_selection import train_test_split
from skimage.feature import local_binary_pattern
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Set random seeds for reproducibility
random.seed(42)
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x11b34d450>

In [3]:
# Assuming data/kaggle_data contains subfolders for each class with images inside
data_dir = '../data/kaggle_data'
classes = os.listdir(data_dir)
filepaths = []
labels = []

for cls in classes:
    cls_folder = os.path.join(data_dir, cls)
    if os.path.isdir(cls_folder):
        for fname in os.listdir(cls_folder):
            if fname.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif')):
                filepaths.append(os.path.join(cls_folder, fname))
                labels.append(cls)

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    filepaths, labels, test_size=0.2, stratify=labels, random_state=42
)

In [4]:
transform = transforms.Compose([
    transforms.Resize((128, 128)),
    transforms.ToTensor(),
    transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
])

# Load all data
full_dataset = datasets.ImageFolder(root='../data/kaggle_data', transform=transform)

# Calculate sizes
total_size = len(full_dataset)
train_size = int(0.8 * total_size)
test_size = total_size - train_size  # Ensure all samples are used

# Split the dataset
train_dataset, test_dataset = random_split(
    full_dataset, [train_size, test_size],
    generator=torch.Generator() 
)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, num_workers=2)
test_loader  = DataLoader(test_dataset, batch_size=32, shuffle=False, num_workers=2)


In [5]:
class SimpleCNN(nn.Module):
    def __init__(self, num_classes):
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)   # Input: 3 channels, Output: 16
        self.pool = nn.MaxPool2d(2, 2)                            # Downsample by 2x
        self.conv2 = nn.Conv2d(16, 32, kernel_size=3, padding=1)  # 16 -> 32 channels
        self.fc1 = nn.Linear(32 * 32 * 32, 128)                   # Assuming input image is 128x128
        self.fc2 = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))  # (batch, 16, 64, 64)
        x = self.pool(F.relu(self.conv2(x)))  # (batch, 32, 32, 32)
        x = x.view(x.size(0), -1)             # Flatten
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [6]:
num_classes = len(full_dataset.classes)   # This will get the number of categories
model = SimpleCNN(num_classes=num_classes)

In [7]:
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")
model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

Using device: mps


In [8]:
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    # Wrap train_loader with tqdm
    for images, labels in tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    avg_loss = running_loss / len(train_loader)
    print(f"Epoch [{epoch+1}/{num_epochs}] Loss: {avg_loss:.4f}")

Epoch 1/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [1/10] Loss: 1.6706


Epoch 2/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [2/10] Loss: 1.1087


Epoch 3/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [3/10] Loss: 0.8534


Epoch 4/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [4/10] Loss: 0.5980


Epoch 5/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [5/10] Loss: 0.3955


Epoch 6/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [6/10] Loss: 0.2330


Epoch 7/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [7/10] Loss: 0.1514


Epoch 8/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [8/10] Loss: 0.1007


Epoch 9/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [9/10] Loss: 0.0918


Epoch 10/10:   0%|          | 0/555 [00:00<?, ?it/s]

Epoch [10/10] Loss: 0.0771


In [9]:
model.eval()
correct = 0
total = 0
with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
print(f'Validation Accuracy: {100 * correct / total:.2f}%')

Validation Accuracy: 64.66%


## Feature Engineering

In [None]:
data_dir = '../data/kaggle_data'
X = []
y = []

for biome_name in os.listdir(data_dir):
    biome_path = os.path.join(data_dir, biome_name)
    if not os.path.isdir(biome_path):
        continue
    for img_name in os.listdir(biome_path):
        img_path = os.path.join(biome_path, img_name)
        img = cv2.imread(img_path)
        if img is None:
            continue
        # You’ll add feature extraction here
        X.append(img)
        y.append(biome_name)

Features included:
1. Color Histogram
- A histogram counts how often each color appears in the image. Here, the RGB (or BGR) space is split into bins (8 per channel, so 8x8x8 = 512 bins).
- Why useful: Different biomes have distinct color palettes. For example:
  - Forest: lots of green
  - Desert: lots of yellow/brown
  - Ocean: mostly blue
- What it captures: The overall color “signature” of the image, not just the average color but the distribution.

2. Local Binary Pattern (LBP) Texture Histogram**

* **What it is:** LBP is a simple but powerful method to describe the local texture of an image (how “rough” or “smooth” an area looks). It works by looking at each pixel and its neighborhood and encoding if each neighbor is lighter/darker than the center.
* **Why useful:** Some biomes have smooth textures (ocean, plains), others are rough or patterned (forests, mountains). LBP helps distinguish these.
* **What it captures:** Texture patterns—how pixel intensities change across the image.

3. Mean Color

* **What it is:** The average value of each color channel (R, G, B) across the whole image.
* **Why useful:** Gives a quick summary of the dominant color. For instance, if the mean green is very high, it might be a forest; if blue dominates, it might be ocean.
* **What it captures:** The “center of mass” of the color distribution—very quick summary information.


In [26]:
# def extract_features(image):
#     features = []
#     # Color histogram (flattened)
#     hist = cv2.calcHist([image], [0, 1, 2], None, [8,8,8], [0,256]*3)
#     features.extend(hist.flatten())
#     # LBP texture
#     gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#     lbp = local_binary_pattern(gray, P=8, R=1, method="uniform")
#     lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
#     features.extend(lbp_hist)
#     # Mean color
#     features.extend(np.mean(image, axis=(0,1)))
#     return np.array(features)

from skimage.measure import shannon_entropy
from sklearn.cluster import KMeans

def get_dominant_colors(image, k=3):
    img = image.reshape((-1, 3))
    kmeans = KMeans(n_clusters=k, n_init=10)
    kmeans.fit(img)
    centers = kmeans.cluster_centers_.flatten()
    return centers


def extract_features(image):
    features = []
    # Color histogram
    hist = cv2.calcHist([image], [0, 1, 2], None, [8,8,8], [0,256]*3)
    features.extend(hist.flatten())
    # LBP texture
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    lbp = local_binary_pattern(gray, P=8, R=1, method="uniform")
    lbp_hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
    features.extend(lbp_hist)
    # Mean color
    features.extend(np.mean(image, axis=(0,1)))
    # Stddev of color
    features.extend(np.std(image, axis=(0,1)))
    # # # Dominant colors
    # # features.extend(get_dominant_colors(image, k=3))
    # # Edge density
    # edges = cv2.Canny(gray, 100, 200)
    # edge_density = np.sum(edges > 0) / edges.size
    # features.append(edge_density)
    # # Entropy
    # features.append(shannon_entropy(gray))
    # # Brightness
    # brightness = np.mean(cv2.cvtColor(image, cv2.COLOR_BGR2HSV)[:,:,2])
    # features.append(brightness)
    # # Green pixel ratio
    # hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
    # lower_green = np.array([36, 25, 25])
    # upper_green = np.array([86, 255,255])
    # mask = cv2.inRange(hsv, lower_green, upper_green)
    # green_ratio = np.sum(mask > 0) / mask.size
    # features.append(green_ratio)
    return np.array(features)

In [27]:
# Replace X with feature vectors, using tqdm for progress bar
X_features = [extract_features(img) for img in tqdm(X, desc="Extracting features")]
X_features = np.array(X_features)

Extracting features:   0%|          | 0/22169 [00:00<?, ?it/s]

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)
clf = RandomForestClassifier(verbose=1)
clf.fit(X_train, y_train)
print("Train Accuracy:", clf.score(X_train, y_train))
print("Test Accuracy:", clf.score(X_test, y_test))

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    3.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    6.9s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s


Train Accuracy: 1.0
Test Accuracy: 0.7708615245827695


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.3s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished


Performance:
- Random Forest with color histogram, mean color, and LBP: 0.7682
- Random Forest with color histogram, mean & std color, and LBP: 0.7709
- Random Forest with 8 features: 0.7693

In [23]:
# For demonstration, let's extract each feature separately and build a model for each

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Color histogram (flattened, 512 features)
color_hist = np.array([cv2.calcHist([img], [0,1,2], None, [8,8,8], [0,256]*3).flatten() for img in tqdm(X, desc="Color hist")])

# LBP texture histogram (10 features)
lbp_hist = []
for img in tqdm(X, desc="LBP"):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    lbp = local_binary_pattern(gray, P=8, R=1, method="uniform")
    hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
    lbp_hist.append(hist)
lbp_hist = np.array(lbp_hist)

# Mean color (3 features)
mean_color = np.array([np.mean(img, axis=(0,1)) for img in tqdm(X, desc="Mean color")])

# Stddev color (3 features)
std_color = np.array([np.std(img, axis=(0,1)) for img in tqdm(X, desc="Std color")])

# Edge density (1 feature)
edge_density = []
for img in tqdm(X, desc="Edge density"):
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    edges = cv2.Canny(gray, 100, 200)
    edge_density.append(np.sum(edges > 0) / edges.size)
edge_density = np.array(edge_density).reshape(-1, 1)

# Entropy (1 feature)
from skimage.measure import shannon_entropy
entropy = np.array([shannon_entropy(cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)) for img in tqdm(X, desc="Entropy")]).reshape(-1, 1)

# Brightness (1 feature)
brightness = np.array([np.mean(cv2.cvtColor(img, cv2.COLOR_BGR2HSV)[:,:,2]) for img in tqdm(X, desc="Brightness")]).reshape(-1, 1)

# Green pixel ratio (1 feature)
green_ratio = []
for img in tqdm(X, desc="Green ratio"):
    hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
    lower_green = np.array([36, 25, 25])
    upper_green = np.array([86, 255,255])
    mask = cv2.inRange(hsv, lower_green, upper_green)
    green_ratio.append(np.sum(mask > 0) / mask.size)
green_ratio = np.array(green_ratio).reshape(-1, 1)

Color hist:   0%|          | 0/22169 [00:00<?, ?it/s]

LBP:   0%|          | 0/22169 [00:00<?, ?it/s]

Mean color:   0%|          | 0/22169 [00:00<?, ?it/s]

Std color:   0%|          | 0/22169 [00:00<?, ?it/s]

Edge density:   0%|          | 0/22169 [00:00<?, ?it/s]

Entropy:   0%|          | 0/22169 [00:00<?, ?it/s]

Brightness:   0%|          | 0/22169 [00:00<?, ?it/s]

Green ratio:   0%|          | 0/22169 [00:00<?, ?it/s]

In [25]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Build and evaluate a RandomForest model for each feature, record accuracy


# Color histogram
X_train_hist, X_test_hist, y_train_hist, y_test_hist = train_test_split(color_hist, y, test_size=0.2, random_state=42)
clf_hist = RandomForestClassifier(verbose=1)
clf_hist.fit(X_train_hist, y_train_hist)
acc_hist = accuracy_score(y_test_hist, clf_hist.predict(X_test_hist))

# LBP histogram
X_train_lbp, X_test_lbp, y_train_lbp, y_test_lbp = train_test_split(lbp_hist, y, test_size=0.2, random_state=42)
clf_lbp = RandomForestClassifier(verbose=1)
clf_lbp.fit(X_train_lbp, y_train_lbp)
acc_lbp = accuracy_score(y_test_lbp, clf_lbp.predict(X_test_lbp))

# Mean color
X_train_mean, X_test_mean, y_train_mean, y_test_mean = train_test_split(mean_color, y, test_size=0.2, random_state=42)
clf_mean = RandomForestClassifier(verbose=1)
clf_mean.fit(X_train_mean, y_train_mean)
acc_mean = accuracy_score(y_test_mean, clf_mean.predict(X_test_mean))

# Stddev color
X_train_std, X_test_std, y_train_std, y_test_std = train_test_split(std_color, y, test_size=0.2, random_state=42)
clf_std = RandomForestClassifier(verbose=1)
clf_std.fit(X_train_std, y_train_std)
acc_std = accuracy_score(y_test_std, clf_std.predict(X_test_std))

# Edge density
X_train_edge, X_test_edge, y_train_edge, y_test_edge = train_test_split(edge_density, y, test_size=0.2, random_state=42)
clf_edge = RandomForestClassifier(verbose=1)
clf_edge.fit(X_train_edge, y_train_edge)
acc_edge = accuracy_score(y_test_edge, clf_edge.predict(X_test_edge))

# Entropy
X_train_entropy, X_test_entropy, y_train_entropy, y_test_entropy = train_test_split(entropy, y, test_size=0.2, random_state=42)
clf_entropy = RandomForestClassifier(verbose=1)
clf_entropy.fit(X_train_entropy, y_train_entropy)
acc_entropy = accuracy_score(y_test_entropy, clf_entropy.predict(X_test_entropy))

# Brightness
X_train_bright, X_test_bright, y_train_bright, y_test_bright = train_test_split(brightness, y, test_size=0.2, random_state=42)
clf_bright = RandomForestClassifier(verbose=1)
clf_bright.fit(X_train_bright, y_train_bright)
acc_bright = accuracy_score(y_test_bright, clf_bright.predict(X_test_bright))

# Green pixel ratio
X_train_green, X_test_green, y_train_green, y_test_green = train_test_split(green_ratio, y, test_size=0.2, random_state=42)
clf_green = RandomForestClassifier(verbose=1)
clf_green.fit(X_train_green, y_train_green)
acc_green = accuracy_score(y_test_green, clf_green.predict(X_test_green))
accuracies = [acc_hist, acc_lbp, acc_mean, acc_std, acc_edge, acc_entropy, acc_bright, acc_green]
print(accuracies)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.8s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.0s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    2.8s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    5.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.3s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.7s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    1.4s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    2.8s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: 

[0.7625169147496617, 0.36400541271989173, 0.5155615696887687, 0.4158773116824538, 0.10847992783040145, 0.08750563824988723, 0.13035633739287325, 0.16080288678394228]


[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    3.1s finished
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.1s
[Parallel(n_jobs=1)]: Done 100 out of 100 | elapsed:    0.1s finished
