# Logistic Regression

### Step 1 — Imports + paths (repo-safe)

In [1]:
# Step 1 — Imports + project paths
# Goal: set everything relative to the repository root (works on any machine).

import os
from pathlib import Path

import numpy as np
import pandas as pd

from PIL import Image

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score

import joblib

# The notebook is expected to be started from the repository folder, so the
# current working directory is used as the project root.
ROOT = Path.cwd()
print("CWD:", ROOT)

# One folder per split and class, all resolved relative to ROOT.
TRAIN_DOG_DIR = ROOT.joinpath("data", "dog_chicken", "train", "dog")
TRAIN_CHICKEN_DIR = ROOT.joinpath("data", "dog_chicken", "train", "chicken")

TEST_DOG_DIR = ROOT.joinpath("data", "dog_chicken", "test", "dog")
TEST_CHICKEN_DIR = ROOT.joinpath("data", "dog_chicken", "test", "chicken")

# Report up front whether each expected folder is actually present.
for p in (TRAIN_DOG_DIR, TRAIN_CHICKEN_DIR, TEST_DOG_DIR, TEST_CHICKEN_DIR):
    print(f"{p} exists? ->", p.exists())

CWD: /Users/albadepradahernandez/Bioimage/dog-learning
/Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/train/dog exists? -> True
/Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/train/chicken exists? -> True
/Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/test/dog exists? -> True
/Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/test/chicken exists? -> True


In [2]:
# Step 2 — Collect image paths
# Goal: create lists of filepaths for train and test images.

def list_images(folder: Path, exts=(".jpg", ".jpeg", ".png")):
    """
    Recursively collect the image files stored under `folder`.

    Parameters
    ----------
    folder : str or Path
        Directory to search, including all of its sub-directories.
    exts : str or iterable of str
        Accepted file extensions, each including the leading dot.
        Matching is case-insensitive. A single extension may be given as a
        plain string.

    Returns
    -------
    list of Path
        Sorted paths of the matching regular files. The list is empty when
        `folder` is missing or is not a directory.
    """
    folder = Path(folder)
    if not folder.is_dir():
        return []

    # A bare string would otherwise be treated as a container of characters
    # (substring matching), which also matches files with no suffix at all.
    if isinstance(exts, str):
        exts = (exts,)
    # Lower-case both sides so that ".JPG" in `exts` matches as expected.
    wanted = {ext.lower() for ext in exts}

    # is_file() skips directories that merely happen to be named like images.
    return sorted(
        p for p in folder.rglob("*")
        if p.suffix.lower() in wanted and p.is_file()
    )

# Gather the image files of every split/class folder.
train_dog_paths = list_images(TRAIN_DOG_DIR)
train_chicken_paths = list_images(TRAIN_CHICKEN_DIR)
test_dog_paths = list_images(TEST_DOG_DIR)
test_chicken_paths = list_images(TEST_CHICKEN_DIR)

# Report how many images were found in each folder.
print(f"Train dog images: {len(train_dog_paths)}")
print(f"Train chicken images: {len(train_chicken_paths)}")
print(f"Test dog images: {len(test_dog_paths)}")
print(f"Test chicken images: {len(test_chicken_paths)}")

# Show the first training file of each class, when there is one.
if len(train_dog_paths) > 0:
    print("Example train dog:", train_dog_paths[0])
if len(train_chicken_paths) > 0:
    print("Example train chicken:", train_chicken_paths[0])

Train dog images: 500
Train chicken images: 500
Test dog images: 200
Test chicken images: 200
Example train dog: /Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/train/dog/dog_0001.jpg
Example train chicken: /Users/albadepradahernandez/Bioimage/dog-learning/data/dog_chicken/train/chicken/chicken_0001.jpg


In [3]:
# Step 3 — Define a simple, robust feature extractor
# Goal: produce one fixed-length numeric vector per image.

def load_image_rgb(path: Path, size=(128, 128)):
    """
    Load an image file as a fixed-size RGB array scaled to [0, 1].

    Parameters
    ----------
    path : str or Path
        Location of the image file.
    size : tuple of int
        Target size passed to `Image.resize`.

    Returns
    -------
    np.ndarray
        float32 array of the resized RGB image, with pixel values divided
        by 255.
    """
    # Open inside a context manager so the underlying file handle is released
    # deterministically, even when decoding fails part-way through.
    with Image.open(path) as img:
        # convert() decodes the pixels into a new image object, so the result
        # stays usable after the source file has been closed.
        resized = img.convert("RGB").resize(size)
    return np.asarray(resized, dtype=np.float32) / 255.0  # normalize to [0,1]

def extract_features(img_rgb: np.ndarray):
    """
    Summarise an RGB image (height x width x 3, values in [0, 1]) as a
    20-element feature vector.

    Layout of the returned vector:
    - [0:3]   per-channel mean (R, G, B)
    - [3:6]   per-channel standard deviation (R, G, B)
    - [6:8]   grayscale mean and standard deviation
    - [8:10]  mean and standard deviation of the gradient magnitude
    - [10:20] 10-bin density histogram of grayscale values over [0, 1]
    """
    # Per-channel colour statistics over all pixels.
    channel_means = np.mean(img_rgb, axis=(0, 1))
    channel_stds = np.std(img_rgb, axis=(0, 1))

    # Weighted sum of the three channels gives the grayscale image.
    red, green, blue = img_rgb[..., 0], img_rgb[..., 1], img_rgb[..., 2]
    luma = 0.299*red + 0.587*green + 0.114*blue

    # Finite differences along each axis (numpy only, no extra deps).
    horizontal = np.diff(luma, axis=1)
    vertical = np.diff(luma, axis=0)
    # Each difference is one element shorter along its own axis; crop the
    # other axis of each so both cover the same (H-1, W-1) grid.
    magnitude = np.sqrt(horizontal[:-1, :]**2 + vertical[:, :-1]**2)

    summary = [luma.mean(), luma.std(), magnitude.mean(), magnitude.std()]

    # Brightness distribution; only the bin densities are kept.
    brightness_hist = np.histogram(luma, bins=10, range=(0.0, 1.0), density=True)[0]

    return np.concatenate([channel_means, channel_stds, summary, brightness_hist])

# Column names for the feature vector, in the same order in which
# extract_features concatenates its values.
FEATURE_NAMES = [
    *(f"rgb_mean_{channel}" for channel in "rgb"),
    *(f"rgb_std_{channel}" for channel in "rgb"),
    "gray_mean",
    "gray_std",
    "edge_mean",
    "edge_std",
    *(f"gray_hist_bin_{bin_idx}" for bin_idx in range(10)),
]

def features_from_paths(image_paths):
    """
    Build a feature table with one row per image.

    Parameters
    ----------
    image_paths : iterable of str or Path
        Image files to process. Any iterable is accepted; it is materialised
        once so that progress can be reported against the total.

    Returns
    -------
    pd.DataFrame
        One column per entry of FEATURE_NAMES, followed by "filename" and
        "path". The columns are present even when `image_paths` is empty, so
        downstream column selection keeps working.
    """
    image_paths = list(image_paths)
    total = len(image_paths)

    rows = []
    for i, p in enumerate(image_paths, 1):
        feats = extract_features(load_image_rgb(p))
        row = dict(zip(FEATURE_NAMES, feats))
        row["filename"] = Path(p).name
        row["path"] = str(Path(p))
        rows.append(row)
        # Lightweight progress report every 100 images.
        if i % 100 == 0:
            print(f"Processed {i}/{total} images...")

    # Pin the schema explicitly: an empty `rows` list would otherwise give a
    # DataFrame with no columns at all.
    return pd.DataFrame(rows, columns=[*FEATURE_NAMES, "filename", "path"])

# Report how many named feature columns each image row will carry.
print("Feature dimension:", len(FEATURE_NAMES))

Feature dimension: 20


In [4]:
# Step 4 — Extract TRAIN features and attach labels
# label convention: dog=1, chicken=0

# Dogs are the positive class (label 1), chickens the negative class (label 0).
dog_train_df = features_from_paths(train_dog_paths).assign(
    **{"label": 1, "class": "dog"}
)
chicken_train_df = features_from_paths(train_chicken_paths).assign(
    **{"label": 0, "class": "chicken"}
)

# Stack both classes into one table with a fresh 0..n-1 index.
train_df = pd.concat((dog_train_df, chicken_train_df), ignore_index=True)

print("Train feature table shape:", train_df.shape)
train_df.head()

Processed 100/500 images...
Processed 200/500 images...
Processed 300/500 images...
Processed 400/500 images...
Processed 500/500 images...
Processed 100/500 images...
Processed 200/500 images...
Processed 300/500 images...
Processed 400/500 images...
Processed 500/500 images...
Train feature table shape: (1000, 24)


Unnamed: 0,rgb_mean_r,rgb_mean_g,rgb_mean_b,rgb_std_r,rgb_std_g,rgb_std_b,gray_mean,gray_std,edge_mean,edge_std,...,gray_hist_bin_4,gray_hist_bin_5,gray_hist_bin_6,gray_hist_bin_7,gray_hist_bin_8,gray_hist_bin_9,filename,path,label,class
0,0.600365,0.492232,0.43365,0.243382,0.308378,0.327455,0.517888,0.288559,0.045996,0.044884,...,0.875855,0.754394,0.547486,1.030884,1.252442,1.2854,dog_0001.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,1,dog
1,0.455141,0.389829,0.331983,0.227285,0.207572,0.208576,0.402766,0.209728,0.047281,0.048446,...,1.724854,1.390991,0.861206,0.172119,0.339966,0.289307,dog_0002.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,1,dog
2,0.548253,0.631957,0.359717,0.134691,0.18903,0.131444,0.575894,0.159815,0.050676,0.035775,...,0.687256,2.018432,4.078981,1.658325,0.108643,0.0,dog_0003.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,1,dog
3,0.580894,0.516685,0.471282,0.345343,0.369963,0.355135,0.530701,0.360092,0.060587,0.065498,...,0.303955,0.244751,0.369263,0.57373,1.16333,2.69226,dog_0004.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,1,dog
4,0.663931,0.541427,0.461156,0.221088,0.30617,0.361483,0.5689,0.286132,0.041851,0.048678,...,0.846558,0.441894,0.461426,0.852661,0.938111,2.100219,dog_0005.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,1,dog


In [5]:
# Step 5 — Quick sanity checks
# Goal: ensure numeric columns look OK and no missing values.

# Count NaNs per column and show the 15 columns with the most of them.
missing_per_column = train_df.isna().sum()
print("Missing values per column (top 15):")
display(missing_per_column.sort_values(ascending=False).head(15))

# The feature columns should be numeric; the metadata columns are text.
print("\nDtypes:")
display(train_df.dtypes)

# Optional: rows containing NaNs (not expected here) could be removed with
# train_df = train_df.dropna().reset_index(drop=True)

Missing values per column (top 15):


rgb_mean_r         0
rgb_mean_g         0
label              0
path               0
filename           0
gray_hist_bin_9    0
gray_hist_bin_8    0
gray_hist_bin_7    0
gray_hist_bin_6    0
gray_hist_bin_5    0
gray_hist_bin_4    0
gray_hist_bin_3    0
gray_hist_bin_2    0
gray_hist_bin_1    0
gray_hist_bin_0    0
dtype: int64


Dtypes:


rgb_mean_r         float64
rgb_mean_g         float64
rgb_mean_b         float64
rgb_std_r          float64
rgb_std_g          float64
rgb_std_b          float64
gray_mean          float64
gray_std           float64
edge_mean          float64
edge_std           float64
gray_hist_bin_0    float64
gray_hist_bin_1    float64
gray_hist_bin_2    float64
gray_hist_bin_3    float64
gray_hist_bin_4    float64
gray_hist_bin_5    float64
gray_hist_bin_6    float64
gray_hist_bin_7    float64
gray_hist_bin_8    float64
gray_hist_bin_9    float64
filename               str
path                   str
label                int64
class                  str
dtype: object

In [6]:
# Step 6 — Define features (X) and target (y)

# Name of the target column, and the columns that must not be used as features
# (the target itself plus the text/metadata columns).
TARGET = "label"
DROP_COLS = [TARGET, "class", "filename", "path"]

# Target vector as integers; feature matrix is whatever remains after
# removing the non-feature columns.
y = train_df[TARGET].astype(int)
X = train_df.drop(columns=DROP_COLS, errors="ignore")

print("X shape:", X.shape)
print("y distribution:\n", y.value_counts())

X shape: (1000, 20)
y distribution:
 label
1    500
0    500
Name: count, dtype: int64


In [7]:
# Step 7 — Train and evaluate (cross-validation)
# Goal: get an estimate of performance.

# Standardise the features, then fit a logistic-regression classifier.
pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("clf", LogisticRegression(max_iter=2000)),
    ]
)

# Stratified 5-fold CV with a fixed seed, scored with F1.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_f1 = cross_val_score(pipe, X, y, scoring="f1", cv=cv)

print("F1 scores per fold:", np.round(cv_f1, 4))
print("Mean F1:", np.round(cv_f1.mean(), 4), "+/-", np.round(cv_f1.std(), 4))

F1 scores per fold: [0.8911 0.8958 0.8842 0.8641 0.901 ]
Mean F1: 0.8872 +/- 0.0128


In [8]:
# Step 8 — Fit final model on all training data and save it

# The fitted pipeline is stored next to the notebook, under the project root.
MODEL_PATH = ROOT.joinpath("logreg_dog_vs_chicken.joblib")

# Refit on the complete training table, then persist scaler + classifier together.
pipe.fit(X, y)
joblib.dump(pipe, MODEL_PATH)

print("Saved model to:", MODEL_PATH)

Saved model to: /Users/albadepradahernandez/Bioimage/dog-learning/logreg_dog_vs_chicken.joblib


In [9]:
# Step 9 — Extract TEST features and generate predictions
# Note: here we ALSO know the true label by folder, so we can evaluate too.

# Build the test feature table; the true class is known from the folder name.
dog_test_df = features_from_paths(test_dog_paths).assign(
    true_label=1, true_class="dog"
)
chicken_test_df = features_from_paths(test_chicken_paths).assign(
    true_label=0, true_class="chicken"
)
test_df = pd.concat((dog_test_df, chicken_test_df), ignore_index=True)

# Split into ground truth and the feature matrix fed to the model.
y_test = test_df["true_label"].astype(int)
X_test = test_df.drop(
    columns=["true_label", "true_class", "filename", "path"],
    errors="ignore",
)

# Column 1 of predict_proba is the probability of label 1 (dog);
# a probability of at least 0.5 is predicted as dog.
proba_dog = pipe.predict_proba(X_test)[:, 1]
pred_label = (proba_dog >= 0.5).astype(int)
pred_class = np.where(pred_label == 1, "dog", "chicken")

print("Test accuracy:", accuracy_score(y_test, pred_label))
print("Test F1:", f1_score(y_test, pred_label))
print("\nClassification report:\n", classification_report(y_test, pred_label, target_names=["chicken","dog"]))

# One row per test image: identity, ground truth, prediction and probability.
pred_out = pd.DataFrame({
    "filename": test_df["filename"],
    "path": test_df["path"],
    "true_label": y_test,
    "true_class": test_df["true_class"],
    "pred_label": pred_label,
    "pred_class": pred_class,
    "prob_dog": proba_dog
})
pred_out = pred_out.sort_values("filename").reset_index(drop=True)

OUT_CSV = ROOT / "test_predictions.csv"
pred_out.to_csv(OUT_CSV, index=False)
print("Saved predictions to:", OUT_CSV)

pred_out.head(20)

Processed 100/200 images...
Processed 200/200 images...
Processed 100/200 images...
Processed 200/200 images...
Test accuracy: 0.79
Test F1: 0.7653631284916201

Classification report:
               precision    recall  f1-score   support

     chicken       0.74      0.90      0.81       200
         dog       0.87      0.69      0.77       200

    accuracy                           0.79       400
   macro avg       0.80      0.79      0.79       400
weighted avg       0.80      0.79      0.79       400

Saved predictions to: /Users/albadepradahernandez/Bioimage/dog-learning/test_predictions.csv


Unnamed: 0,filename,path,true_label,true_class,pred_label,pred_class,prob_dog
0,chicken_0501.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.026454
1,chicken_0505.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.031258
2,chicken_0507.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.070749
3,chicken_0508.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.17722
4,chicken_0512.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.319131
5,chicken_0514.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.051182
6,chicken_0517.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.087299
7,chicken_0518.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.180324
8,chicken_0519.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,0,chicken,0.013055
9,chicken_0520.jpg,/Users/albadepradahernandez/Bioimage/dog-learn...,0,chicken,1,dog,0.664562


In [10]:
# Step 10 — Reproducibility note (optional)
# If teammates run this, they need the same basic Python deps.

# Quick check:
import sys
import sklearn

# Print the interpreter and core library versions so environments can be compared.
print("Python:", sys.version)
for lib_name, lib in (("scikit-learn", sklearn), ("numpy", np), ("pandas", pd)):
    print(f"{lib_name}:", lib.__version__)

Python: 3.11.14 | packaged by conda-forge | (main, Jan 27 2026, 00:01:01) [Clang 19.1.7 ]
scikit-learn: 1.8.0
numpy: 2.4.2
pandas: 3.0.1
