### Project Structure
# twin_classification/
# ├── data/                  # Input images
# │   ├── you/
# │   └── twin/
# ├── models/                # Saved trained models
# │   ├── random_forest.pkl
# │   └── cnn_model.h5
# ├── notebooks/             # Notebooks for experimentation
# │   └── exploration.ipynb
# ├── src/                   # Python scripts
# │   ├── preprocess.py
# │   ├── train.py
# │   ├── evaluate.py
# │   └── predict.py
# ├── web/                   # Optional web interface
# │   ├── index.html
# │   ├── style.css
# │   └── app.py
# ├── requirements.txt       # Project dependencies
# └── README.md              # This file


# Twin Image Classifier Project 
**Goal:** Build a simple machine learning classifier that can tell the difference between
me and my identical twin using basic ML concepts (no deep learning).

## Step 1: Project Overview
- **Type:** Classification
- **Input (Features):** Images of me and my twin
- **Output (Targets):** "Neema" or "Amani"
- **Algorithms:** Decision Tree, Logistic Regression, KNN (planned); the code below currently trains a Random Forest baseline and a small CNN
- **Evaluation Metric:** Accuracy

In [None]:
# ------------------------------
# 1. IMPORT LIBRARIES
# ------------------------------

# For general use
import os
import numpy as np
import matplotlib.pyplot as plt

# For image processing
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# For CNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.optimizers import Adam

# For classical ML
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# For saving/loading models
import joblib
from tensorflow.keras.models import load_model

# ------------------------------
# 2. SET PATHS AND PARAMETERS
# ------------------------------

# Dataset root: holds one sub-folder per class ('you/' and 'twin/')
DATA_DIR = "data"

# Image size in pixels (height, width) and number of images per batch
IMG_HEIGHT, IMG_WIDTH = 224, 224
BATCH_SIZE = 16

# ------------------------------
# 3. DATA LOADING AND PREPROCESSING
# ------------------------------

# A single ImageDataGenerator multiplies every pixel by 1/255 and holds back
# 20% of the images for validation; both subsets below are drawn from it.
datagen = ImageDataGenerator(validation_split=0.2, rescale=1./255)

# Training subset (the remaining 80%), served in shuffled order
train_generator = datagen.flow_from_directory(
    directory=DATA_DIR,
    subset='training',
    class_mode='binary',     # one 0/1 label per image: You vs Twin
    shuffle=True,
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
)

# Validation subset, left unshuffled so that predictions made on it line up
# with val_generator.classes
val_generator = datagen.flow_from_directory(
    directory=DATA_DIR,
    subset='validation',
    class_mode='binary',
    shuffle=False,
    batch_size=BATCH_SIZE,
    target_size=(IMG_HEIGHT, IMG_WIDTH),
)

# ------------------------------
# 4. OPTIONAL: PREPARE DATA FOR RANDOM FOREST
# ------------------------------

# Random Forest requires flat feature vectors, not image tensors, so every
# image is reshaped into a single 1-D row before training.
def _flatten_generator(generator):
    """Collect one full pass over *generator* into flat NumPy arrays.

    Args:
        generator: An indexable batch iterator where ``generator[i]`` returns
            an ``(images, labels)`` pair and ``len(generator)`` is the number
            of batches in one pass.

    Returns:
        A ``(features, labels)`` tuple of NumPy arrays. ``features`` has one
        flattened row per image; ``labels`` holds the matching targets.
    """
    features, labels = [], []
    # Walk the batches by index: len(generator) bounds the loop to exactly one
    # pass, so no manual sample-count check against BATCH_SIZE is needed.
    for batch_index in range(len(generator)):
        batch_images, batch_labels = generator[batch_index]
        # Keep the batch axis, collapse everything else into one vector
        features.extend(batch_images.reshape(batch_images.shape[0], -1))
        labels.extend(batch_labels)
    return np.array(features), np.array(labels)


X_train_rf, y_train_rf = _flatten_generator(train_generator)
X_val_rf, y_val_rf = _flatten_generator(val_generator)

# ------------------------------
# 5. TRAIN RANDOM FOREST CLASSIFIER
# ------------------------------

# Initialize Random Forest
rf_model = RandomForestClassifier(
    n_estimators=100,       # number of trees
    random_state=42         # fixed seed so runs are reproducible
)

# Train the model on the flattened training images
rf_model.fit(X_train_rf, y_train_rf)

# Save the model. joblib.dump does not create missing directories, so make
# sure 'models/' exists first; otherwise a fresh checkout fails here after
# training has already finished.
os.makedirs("models", exist_ok=True)
joblib.dump(rf_model, "models/random_forest.pkl")

# Predict on validation set
y_pred_rf = rf_model.predict(X_val_rf)

# Evaluate: per-class precision/recall/F1, then the raw confusion counts
print("Random Forest Classification Report:")
print(classification_report(y_val_rf, y_pred_rf))
print("Random Forest Confusion Matrix:")
print(confusion_matrix(y_val_rf, y_pred_rf))

# ------------------------------
# 6. BUILD AND TRAIN A SIMPLE CNN
# ------------------------------

# CNN architecture, assembled layer by layer: three conv + max-pool stages
# with 32, 64 and 128 filters, followed by a dense classification head.
cnn_model = Sequential()
cnn_model.add(Conv2D(32, (3,3), activation='relu', input_shape=(IMG_HEIGHT, IMG_WIDTH, 3)))
cnn_model.add(MaxPooling2D(2,2))
cnn_model.add(Conv2D(64, (3,3), activation='relu'))
cnn_model.add(MaxPooling2D(2,2))
cnn_model.add(Conv2D(128, (3,3), activation='relu'))
cnn_model.add(MaxPooling2D(2,2))

# Classification head
cnn_model.add(Flatten())
cnn_model.add(Dense(128, activation='relu'))
cnn_model.add(Dropout(0.5))
cnn_model.add(Dense(1, activation='sigmoid'))  # single sigmoid unit: binary output

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked
# so it can be plotted from the training history afterwards.
cnn_model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Fit for 10 epochs, scoring on the validation generator after each epoch
history = cnn_model.fit(
    train_generator,
    epochs=10,
    validation_data=val_generator,
)

# Persist the trained network in HDF5 format
cnn_model.save("models/cnn_model.h5")

# ------------------------------
# 7. EVALUATE CNN
# ------------------------------

# Ground-truth class indices as recorded by the validation generator
y_true_cnn = val_generator.classes

# Reset the validation generator, then predict over it
val_generator.reset()
y_pred_cnn_prob = cnn_model.predict(val_generator)

# Probabilities above 0.5 become class 1, everything else class 0
y_pred_cnn = (y_pred_cnn_prob > 0.5).astype(int)

# Classification report first, then the confusion matrix
print("CNN Classification Report:", classification_report(y_true_cnn, y_pred_cnn), sep="\n")
print("CNN Confusion Matrix:", confusion_matrix(y_true_cnn, y_pred_cnn), sep="\n")

# ------------------------------
# 8. PLOT TRAINING HISTORY (CNN)
# ------------------------------

plt.figure(figsize=(12,4))

# One panel per tracked quantity: (history key, legend suffix, display name).
# Accuracy goes in the left panel, loss in the right.
for panel, (metric, short, pretty) in enumerate(
    (("accuracy", "acc", "Accuracy"), ("loss", "loss", "Loss")),
    start=1,
):
    plt.subplot(1, 2, panel)
    # Training curve first, then the validation curve for the same metric
    plt.plot(history.history[metric], label=f"train_{short}")
    plt.plot(history.history[f"val_{metric}"], label=f"val_{short}")
    plt.title(f"CNN {pretty}")
    plt.xlabel("Epoch")
    plt.ylabel(pretty)
    plt.legend()

plt.show()