In [None]:
import os
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from skimage.feature import hog
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


ModuleNotFoundError: No module named 'cv2'

In [None]:
# Locations of the raw inputs, relative to the notebook's working directory.
DATASET_DIR = "../data/raw/dataset/images"
LABELS_PATH = "../data/raw/dataset/labels.csv"

# Load the label table; later cells read its "id" and "genus" columns.
df = pd.read_csv(LABELS_PATH)
# Preview the first rows as the cell output.
df.head()


In [None]:
# Lookup table from image id to genus label. If an id appears in several rows,
# the last occurrence wins.
label_map = {img_id: genus for img_id, genus in zip(df["id"], df["genus"])}


In [None]:
IMG_SIZE = (128, 128)  # smaller is better for classical ML


def extract_hog_features(image_path):
    """Load an image and return its HOG descriptor.

    The image is read with OpenCV, converted from BGR to grayscale, resized
    to ``IMG_SIZE`` and passed to ``skimage.feature.hog`` with 9 orientations,
    8x8-pixel cells, 2x2-cell blocks and L2-Hys block normalisation.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        The feature array returned by ``hog`` (a flattened vector with the
        default ``feature_vector=True``).

    Raises
    ------
    FileNotFoundError
        If ``image_path`` does not point to an existing file.
    ValueError
        If the file exists but OpenCV cannot decode it as an image.
    """
    # cv2.imread does not raise on failure; it returns None. Check up front so
    # the error names the offending path instead of surfacing later as an
    # opaque OpenCV assertion inside cvtColor.
    if not os.path.isfile(image_path):
        raise FileNotFoundError(f"Image file not found: {image_path}")

    img = cv2.imread(image_path)
    if img is None:
        raise ValueError(f"Could not decode image: {image_path}")

    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Every image is brought to the same size so all feature vectors have the
    # same length and can be stacked into a single matrix.
    img = cv2.resize(img, IMG_SIZE)

    features = hog(
        img,
        orientations=9,
        pixels_per_cell=(8, 8),
        cells_per_block=(2, 2),
        block_norm='L2-Hys'
    )
    return features


In [None]:
X = []
y = []

# os.listdir() returns entries in arbitrary order. Sorting fixes the sample
# order, so a seeded train/test split of X and y gives the same partition on
# every machine and run.
for file in sorted(os.listdir(DATASET_DIR)):
    # Case-insensitive match so files such as "12.JPG" are not dropped silently.
    if not file.lower().endswith(".jpg"):
        continue

    # File names are expected to be "<integer id>.jpg". A stray file with a
    # non-numeric stem has no id to look up, so skip it instead of letting
    # int() abort the whole loop with a ValueError.
    try:
        image_id = int(os.path.splitext(file)[0])
    except ValueError:
        continue

    # Images without an entry in the label table cannot be used for training.
    if image_id not in label_map:
        continue

    img_path = os.path.join(DATASET_DIR, file)

    features = extract_hog_features(img_path)

    X.append(features)
    y.append(label_map[image_id])

# Stack the per-image feature vectors into one matrix with a parallel label array.
X = np.array(X)
y = np.array(y)

print("Feature matrix shape:", X.shape)
print("Labels shape:", y.shape)


In [None]:
# Hold out 20% of the samples for testing. The fixed random_state seeds the
# shuffle, and stratify=y asks for the label proportions of y to be preserved
# in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [None]:
# Fit the scaler on the training split only, then apply that same fitted
# transform to both splits so no statistics from the test data are used.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)


In [None]:
# np.save does not create missing parent directories, so make sure the output
# folder exists first (a fresh checkout may not contain it yet).
os.makedirs("../data/processed", exist_ok=True)

# Persist the scaled feature matrices and their labels for later notebooks.
np.save("../data/processed/X_train.npy", X_train)
np.save("../data/processed/X_test.npy", X_test)
np.save("../data/processed/y_train.npy", y_train)
np.save("../data/processed/y_test.npy", y_test)
