In [1]:
import os
import cv2
import numpy as np
from skimage.io import imread
from skimage.transform import resize
from skimage.feature import hog
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as transforms
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier

In [2]:
import os
from zipfile import ZipFile


!pip install kaggle


os.environ['KAGGLE_USERNAME'] = "majisouvik1099"
os.environ['KAGGLE_KEY'] = "9464225218d56f8bfea9dd9cc437489f"


!kaggle datasets download -d jessicali9530/lfw-dataset


with ZipFile('lfw-dataset.zip', 'r') as zip_ref:
    zip_ref.extractall('lfw-dataset')


os.remove('lfw-dataset.zip')


Downloading lfw-dataset.zip to /content
 93% 105M/112M [00:01<00:00, 58.6MB/s] 
100% 112M/112M [00:01<00:00, 70.0MB/s]


In [3]:

def extract_features(image_path, model):
    image = Image.open(image_path).convert('RGB')
    preprocess = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    image = preprocess(image)
    image = image.unsqueeze(0)
    with torch.no_grad():
        features = model(image)
    features = features.squeeze(0)
    return features


# def compute_hog(img):
#     resized_img = resize(img, (128*4, 64*4))
#     fd, hog_image = hog(resized_img, orientations=9, pixels_per_cell=(8, 8),
#                     cells_per_block=(2, 2), visualize=True, multichannel=True)
#     return fd


def get_pixel(img, center, x, y):
    new_value = 0
    try:
        if img[x][y] >= center:
            new_value = 1
    except:
        pass
    return new_value

# def lbp_calculated_pixel(img, x, y):
#     center = img[x][y]
#     val_ar = []
#     val_ar.append(get_pixel(img, center, x-1, y+1))
#     val_ar.append(get_pixel(img, center, x, y+1))
#     val_ar.append(get_pixel(img, center, x+1, y+1))
#     val_ar.append(get_pixel(img, center, x+1, y))
#     val_ar.append(get_pixel(img, center, x+1, y-1))
#     val_ar.append(get_pixel(img, center, x, y-1))
#     val_ar.append(get_pixel(img, center, x-1, y-1))
#     val_ar.append(get_pixel(img, center, x-1, y))

#     power_val = [1, 2, 4, 8, 16, 32, 64, 128]
#     val = 0
#     for i in range(len(val_ar)):
#         val += val_ar[i] * power_val[i]
#     return val

# def calcLBP(img):
#     height, width, channel = img.shape
#     img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#     img_lbp = np.zeros((height, width,3), np.uint8)
#     for i in range(0, height):
#         for j in range(0, width):
#              img_lbp[i, j] = lbp_calculated_pixel(img_gray, i, j)
#     hist_lbp = cv2.calcHist([img_lbp], [0], None, [256], [0, 256])
#     return hist_lbp.flatten()


In [4]:
resnet = models.resnet50(pretrained=True)
resnet = nn.Sequential(*list(resnet.children())[:-1])
resnet.eval()

Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:01<00:00, 82.8MB/s]


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [6]:
lfw_folder = '/content/lfw-dataset/lfw-deepfunneled/lfw-deepfunneled'
X, y = [], []
for folder_name in os.listdir(lfw_folder):
    folder_path = os.path.join(lfw_folder, folder_name)
    if os.path.isdir(folder_path):
        num_images = len(os.listdir(folder_path))
        if num_images > 70:
            for image_name in os.listdir(folder_path):
                image_path = os.path.join(folder_path, image_name)
                image = imread(image_path)
                # hog_feature = compute_hog(image)
                # lbp_feature = calcLBP(image)
                cnn_feature = extract_features(image_path, resnet).numpy()
                # hog_feature = hog_feature.reshape(-1)
                # lbp_feature = lbp_feature.reshape(-1)
                cnn_feature = cnn_feature.flatten()
                # combined_feature = np.concatenate((hog_feature, lbp_feature, cnn_feature))

                X.append(cnn_feature)
                y.append(folder_name)


In [10]:
X

array([[4.46720541e-01, 1.28536057e+00, 6.44453347e-01, ...,
        6.56331107e-02, 1.60527021e-01, 1.08737595e-01],
       [2.96980768e-01, 1.08681452e+00, 1.95027459e+00, ...,
        1.56677827e-01, 9.73988846e-02, 1.68455258e-01],
       [2.84780651e-01, 9.44002450e-01, 9.02758598e-01, ...,
        1.63003162e-03, 1.31817728e-01, 1.19959459e-01],
       ...,
       [3.60536426e-01, 5.89500487e-01, 7.05947876e-01, ...,
        3.96938361e-02, 1.01420663e-01, 2.91724920e-01],
       [1.68791384e-01, 9.93731916e-01, 1.06215811e+00, ...,
        1.31587954e-02, 1.08631611e-01, 1.77908331e-01],
       [2.46273249e-01, 1.13266075e+00, 6.98997498e-01, ...,
        1.55119365e-02, 5.14365267e-03, 1.02916092e-01]], dtype=float32)

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

In [12]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

Decision Tree without PCA or LDA

In [13]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)
print("Decision Tree Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))

Decision Tree Classifier
Accuracy: 0.45348837209302323
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.55      0.38      0.44        16
     Colin_Powell       0.54      0.48      0.51        60
  Donald_Rumsfeld       0.16      0.19      0.17        21
    George_W_Bush       0.62      0.58      0.60       108
Gerhard_Schroeder       0.11      0.11      0.11        19
      Hugo_Chavez       0.14      0.17      0.15        12
       Tony_Blair       0.32      0.50      0.39        22

         accuracy                           0.45       258
        macro avg       0.35      0.34      0.34       258
     weighted avg       0.47      0.45      0.46       258



  Grid Search for Decision Tree

In [22]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_features': ['auto', 'sqrt', 'log2'],
              'ccp_alpha': [0.1, .01, .001],
              'max_depth' : [5, 6, 7, 8, 9],
              'criterion' :['gini', 'entropy']
             }
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(estimator=tree_clas, param_grid=param_grid, cv=5, verbose=True)
grid_search.fit(X_train, y_train_encoded)

Fitting 5 folds for each of 90 candidates, totalling 450 fits




In [23]:
final_model = grid_search.best_estimator_
final_model

In [28]:
dt_clf = DecisionTreeClassifier(ccp_alpha=0.01, class_weight=None, criterion='entropy',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=1024, splitter='best')
dt_clf.fit(X_train, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)

print("Decision Tree Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))


Decision Tree Classifier
Accuracy: 0.4069767441860465
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.25      0.12      0.17        16
     Colin_Powell       0.51      0.38      0.44        60
  Donald_Rumsfeld       0.33      0.10      0.15        21
    George_W_Bush       0.50      0.64      0.56       108
Gerhard_Schroeder       0.08      0.05      0.06        19
      Hugo_Chavez       0.18      0.25      0.21        12
       Tony_Blair       0.16      0.23      0.19        22

         accuracy                           0.41       258
        macro avg       0.29      0.25      0.25       258
     weighted avg       0.40      0.41      0.39       258



Decision Tree with PCA

In [19]:
from sklearn.decomposition import PCA
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)


dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_pca, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test_pca)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)

print("Decision Tree Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))

Decision Tree Classifier
Accuracy: 0.36046511627906974
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.15      0.12      0.14        16
     Colin_Powell       0.44      0.32      0.37        60
  Donald_Rumsfeld       0.16      0.19      0.17        21
    George_W_Bush       0.51      0.52      0.51       108
Gerhard_Schroeder       0.22      0.26      0.24        19
      Hugo_Chavez       0.12      0.17      0.14        12
       Tony_Blair       0.18      0.23      0.20        22

         accuracy                           0.36       258
        macro avg       0.26      0.26      0.25       258
     weighted avg       0.38      0.36      0.36       258



In [29]:
dt_clf = DecisionTreeClassifier(ccp_alpha=0.01, class_weight=None, criterion='entropy',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=1024, splitter='best')
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)
dt_clf.fit(X_train_pca, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test_pca)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)

print("Decision Tree Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))


Decision Tree Classifier
Accuracy: 0.43023255813953487
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.45      0.31      0.37        16
     Colin_Powell       0.33      0.13      0.19        60
  Donald_Rumsfeld       0.38      0.14      0.21        21
    George_W_Bush       0.45      0.81      0.58       108
Gerhard_Schroeder       1.00      0.11      0.19        19
      Hugo_Chavez       0.33      0.42      0.37        12
       Tony_Blair       0.00      0.00      0.00        22

         accuracy                           0.43       258
        macro avg       0.42      0.28      0.27       258
     weighted avg       0.42      0.43      0.36       258



Decision Tree with LDA

In [31]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=None)
X_train_lda = lda.fit_transform(X_train, y_train_encoded)
X_test_lda = lda.transform(X_test)
dt_clf = DecisionTreeClassifier()
dt_clf.fit(X_train_pca, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test_pca)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)
print("KNN Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))


KNN Classifier
Accuracy: 0.3488372093023256
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.17      0.12      0.14        16
     Colin_Powell       0.47      0.33      0.39        60
  Donald_Rumsfeld       0.10      0.14      0.12        21
    George_W_Bush       0.52      0.51      0.51       108
Gerhard_Schroeder       0.17      0.21      0.19        19
      Hugo_Chavez       0.00      0.00      0.00        12
       Tony_Blair       0.19      0.27      0.22        22

         accuracy                           0.35       258
        macro avg       0.23      0.23      0.22       258
     weighted avg       0.37      0.35      0.36       258



In [32]:
dt_clf = DecisionTreeClassifier(ccp_alpha=0.01, class_weight=None, criterion='entropy',
                       max_depth=5, max_features='sqrt', max_leaf_nodes=None,
                       min_impurity_decrease=0.0,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0,
                       random_state=1024, splitter='best')

dt_clf.fit(X_train_lda, y_train_encoded)
y_pred_dt = dt_clf.predict(X_test_lda)
accuracy_dt = accuracy_score(y_test_encoded, y_pred_dt)

print("Decision Tree Classifier")
print("Accuracy:", accuracy_dt)
print("Classification Report:")
print(classification_report(y_test_encoded, y_pred_dt, target_names=label_encoder.classes_))


Decision Tree Classifier
Accuracy: 0.624031007751938
Classification Report:
                   precision    recall  f1-score   support

     Ariel_Sharon       0.41      0.81      0.54        16
     Colin_Powell       0.89      0.52      0.65        60
  Donald_Rumsfeld       0.45      0.43      0.44        21
    George_W_Bush       0.70      0.81      0.75       108
Gerhard_Schroeder       0.45      0.26      0.33        19
      Hugo_Chavez       0.29      0.58      0.39        12
       Tony_Blair       0.82      0.41      0.55        22

         accuracy                           0.62       258
        macro avg       0.57      0.55      0.52       258
     weighted avg       0.68      0.62      0.62       258

