# Cats and Dogs Classification
<h4> Traditional Machine Learning With Logistic Regression </h4>

### 1. Packages 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from PIL import Image
import os
import pwd


### 2. Preprocessing Data

#### Load data

In [8]:
# Dataset root; it is expected to contain one sub-folder per class.
# Set the PET_IMAGES_DIR environment variable to point at your own copy
# instead of editing this machine-specific default path.
data_folder = os.environ.get(
    'PET_IMAGES_DIR',
    '/Users/minhquang/Documents/Learning/AI/data/kagglecatsanddogs_5340/PetImages',
)

In [9]:
# Show the entries of the dataset root; each sub-folder name is used as a class label.
os.listdir(data_folder)

['Cat', 'Dog']

In [5]:
def load_image_lazy(data_folder, target_size=(128, 128)):
    '''
    Lazily yield every readable image found under data_folder.

    Every sub-directory of data_folder is treated as one class and its name
    is used as the label. Entries that are not directories are ignored.
    Files that cannot be opened or converted are reported with print() and
    skipped, so one corrupt file does not abort the whole load.

    data_folder: folder that contains one sub-folder per label
    target_size: size passed to Image.resize, so all images share one shape
    -yield: (img_array, label) pairs, where img_array is the RGB-converted,
            resized image as a numpy array and label is the sub-folder name
    '''
    # os.listdir returns entries in arbitrary order; sorting makes the
    # iteration order (and any label numbering derived from it) deterministic.
    for label in sorted(os.listdir(data_folder)):
        folder_path = os.path.join(data_folder, label)  # directory of this label
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            try:
                with Image.open(file_path) as img:
                    img_array = np.asarray(img.convert('RGB').resize(target_size))
            except Exception as e:
                # Best-effort loading: report the bad file and move on.
                print(f"Can't process the file image: {e}")
                continue
            # Yield only after the file handle is closed, so the consumer
            # never runs while the image file is still open.
            yield img_array, label
                    

In [6]:
def load_data(data_folder, target_size=(128, 128)):
    '''
    Load the whole dataset into memory as numpy arrays.

    data_folder: folder that contains one sub-folder per label
    target_size: forwarded to load_image_lazy; fixed size for all images
    -return: (X, y, classes) where
             X is the array of stacked images, one per readable file,
             y is the array of integer class indices aligned with X,
             classes maps each label name to its integer index, numbered
             in the order the labels are first encountered
    '''
    X, y = [], []
    classes = {}
    for img_array, label in load_image_lazy(data_folder, target_size):
        X.append(img_array)
        # setdefault assigns the next free index the first time a label is seen.
        y.append(classes.setdefault(label, len(classes)))
    return np.array(X), np.array(y), classes

In [7]:
# Read every image into memory; files that cannot be processed are reported below and skipped.
X_orig, y_orig, classes = load_data(data_folder)

Can't process the file image: cannot identify image file '/Users/minhquang/Documents/Learning/AI/MachineLearning/LogisticRegression/kagglecatsanddogs_5340/PetImages/Cat/Thumbs.db'
Can't process the file image: cannot identify image file '/Users/minhquang/Documents/Learning/AI/MachineLearning/LogisticRegression/kagglecatsanddogs_5340/PetImages/Cat/666.jpg'




Can't process the file image: cannot identify image file '/Users/minhquang/Documents/Learning/AI/MachineLearning/LogisticRegression/kagglecatsanddogs_5340/PetImages/Dog/Thumbs.db'
Can't process the file image: cannot identify image file '/Users/minhquang/Documents/Learning/AI/MachineLearning/LogisticRegression/kagglecatsanddogs_5340/PetImages/Dog/11702.jpg'


In [8]:
# Sanity check: array shapes and the label-name -> class-index mapping.
print("X_orig shape:", X_orig.shape)
print("y_orig shape:", y_orig.shape)
print("Classes mapping:", classes)

X_orig shape: (24998, 128, 128, 3)
y_orig shape: (24998,)
Classes mapping: {'Cat': 0, 'Dog': 1}


#### Process data

In [9]:
# Scale pixel values by 1/255.
# NOTE(review): assumes 8-bit pixel values in [0, 255], giving a [0, 1] range — confirm.
X_standardize = X_orig/255.
# Flatten each image into one vector, then transpose so every column is one sample.
X = X_standardize.reshape(X_standardize.shape[0], -1).T
# Labels as a single row so they line up with the columns of X.
y = y_orig.reshape(1, -1)

In [10]:
# Sanity check: X should be (features, samples) and y should be (1, samples).
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (49152, 24998)
y shape: (1, 24998)


### 3. Logistic Regression

In [11]:
class LogisticRegression:
    """Binary logistic regression trained with batch gradient descent.

    Data layout used throughout the class:
      * X has shape (n_features, m): one column per sample.
      * y has shape (1, m) and holds 0/1 labels.
      * w has shape (n_features, 1); b is a scalar bias.
    """

    def __init__(self):
        self.X = None      # training features, set by fit()
        self.y = None      # training labels, set by fit()
        self.w = None      # weight column vector, initialised by fit()
        self.b = None      # bias, initialised by fit()
        self.costs = None  # per-epoch cost history of the last fit() call

    @staticmethod
    def sigmoid(z):
        """Return 1 / (1 + exp(-z)) element-wise.

        z is clipped to [-500, 500] first so that np.exp never overflows;
        inside that range the result is identical to the plain formula.
        """
        return 1 / (1 + np.exp(-np.clip(z, -500, 500)))

    def propagate(self):
        """Run one forward/backward pass over the stored training data.

        Returns:
            (grads, cost): grads is a dict with "dw" (same shape as w) and
            "db" (scalar); cost is the mean binary cross-entropy.
        """
        m = self.X.shape[1]
        z = self.w.T @ self.X + self.b
        yhat = self.sigmoid(z)

        # Keep probabilities strictly inside (0, 1) for the log only, so a
        # saturated prediction cannot turn the cost into inf or NaN.
        eps = 1e-15
        yhat_safe = np.clip(yhat, eps, 1 - eps)
        cost = (-1/m) * np.sum(
            self.y * np.log(yhat_safe) + (1 - self.y) * np.log(1 - yhat_safe)
        )

        # Gradients use the unclipped predictions.
        error = yhat - self.y
        dw = (1/m) * (self.X @ error.T)
        db = (1/m) * np.sum(error)

        grads = {
            "dw": dw,
            "db": db
        }
        return grads, cost

    def optimize(self, epochs=1000, lr=1e-3, decay_rate=0.9):
        """Update w and b in place with gradient descent.

        Args:
            epochs: number of full passes over the training data.
            lr: initial learning rate.
            decay_rate: factor the learning rate is multiplied by after
                every epoch. After k epochs the step size is
                lr * decay_rate**k, so values well below 1 shrink it to
                almost nothing quickly; pass 1.0 to disable the decay.

        Returns:
            List with the cost measured at each epoch.
        """
        costs = []
        for i in range(epochs):
            grads, cost = self.propagate()
            self.w -= lr * grads["dw"]
            self.b -= lr * grads["db"]
            costs.append(cost)
            if i % 100 == 0:
                print(f"Cost at epoch {i+1}: {cost}")
            lr *= decay_rate
        return costs

    def predict(self, X):
        """Return a (1, m) float array of 0/1 predictions for the columns of X.

        A sample is predicted as 1 (dog) when its probability is at least
        0.5, otherwise as 0 (cat).
        """
        threshold = 0.5
        yhat = self.sigmoid(self.w.T @ X + self.b)
        # Vectorised thresholding; float dtype matches a np.zeros-based result.
        return (yhat >= threshold).astype(float)

    def score(self, X, y):
        """Return the accuracy on (X, y) as a percentage in [0, 100]."""
        prediction = self.predict(X)
        accuracy = 100 * np.sum(y == prediction) / y.shape[1]
        return accuracy

    def fit(self, X, y, lr=1e-3, decay_rate=0.9, epochs=1000):
        """Train on X (n_features, m) and y (1, m), starting from zero weights.

        Any previously learned w and b are discarded. The per-epoch cost
        history is kept in self.costs for later inspection.
        """
        self.X = X
        self.y = y
        self.w = np.zeros((X.shape[0], 1))
        self.b = 0
        self.costs = self.optimize(epochs, lr, decay_rate)
        

In [12]:
from sklearn.model_selection import train_test_split
# train_test_split expects one sample per row, hence the transposes.
# A fixed random_state makes the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X.T, y.T, train_size=0.8, random_state=42)

In [18]:
# Confirm the layout the model expects: y is (1, samples), X is (features, samples).
print(y.shape)
print(X.shape)

(1, 24998)
(49152, 24998)


In [14]:
# Train on the raw pixel features with fit's defaults (lr=1e-3, decay_rate=0.9, epochs=1000).
# NOTE(review): the learning rate is multiplied by decay_rate after every epoch, so with
# 0.9 it is about 1e-3 * 0.9**k after k epochs and becomes negligible early in training.
# Consider passing a decay_rate close to 1.0 if the cost stops improving.
lr = LogisticRegression()
lr.fit(X_train.T, y_train.T)

Cost at epoch 1: 0.6931471805599453


In [15]:
# Accuracy (in percent) on the held-out test split.
lr.score(X_test.T, y_test.T)

np.float64(50.12)

In [16]:
from sklearn.decomposition import PCA

# Project the flattened pixels onto the first 100 principal components.
# random_state pins the result when PCA selects a randomized SVD solver.
# NOTE(review): PCA is fitted on the full dataset before the train/test split, so
# test samples influence the projection. Fit it on the training split only to
# avoid leaking test information.
n_components = 100
pca = PCA(n_components=n_components, random_state=42)
X_reduced = pca.fit_transform(X.T).T

# The printed label is Vietnamese for "X shape after PCA".
print("X shape sau PCA:", X_reduced.shape)

X shape sau PCA: (100, 24998)


In [19]:
# Re-split using the PCA-reduced features; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_reduced.T, y.T, train_size=0.8, random_state=42)

In [21]:
# Re-train the same model object on the PCA features; fit() resets w and b to zero first.
lr.fit(X_train.T, y_train.T)

Cost at epoch 1: 0.6931471805599453


In [23]:
# Accuracy (in percent) on the held-out split of the PCA-reduced data.
lr.score(X_test.T, y_test.T)

np.float64(56.28)

=> The accuracy is still very low (close to the 50% chance level), even after using PCA for dimensionality reduction.

<b><i> Let's try it again with Deep Learning </i></b> <br><br>
-> DeepLearning/Project/cats_and_dogs_classification