In [None]:
# Import required libraries for image processing, feature extraction, and machine learning
import cv2
import dlib
import landmarks as l2
import numpy as np
from sklearn.metrics import classification_report,accuracy_score
from sklearn import svm
from sklearn.pipeline import Pipeline

In [None]:
# Function to extract features and labels from the dataset
# Returns training and testing features and labels for gender and smile classification

def get_all_data():
    """Load the train and test splits and flatten every sample's features.

    Calls ``l2.extract_features_labels`` once with ``"train"`` and once with
    ``"test"``. Each call yields a feature array plus two label arrays
    (gender, then smile). The feature arrays are reshaped to two dimensions:
    the first axis is kept and all remaining axes are merged into one.

    Returns:
        The 6-tuple ``(tr_X, tr_Y_gender, tr_Y_smile, te_X, te_Y_gender,
        te_Y_smile)`` with the two feature arrays flattened as described.
    """
    def _flatten(features):
        # Keep the leading (per-sample) axis; collapse everything after it.
        n_rows = features.shape[0]
        return features.reshape(n_rows, -1)

    train_feats, train_gender, train_smile = l2.extract_features_labels("train")
    test_feats, test_gender, test_smile = l2.extract_features_labels("test")

    return (
        _flatten(train_feats), train_gender, train_smile,
        _flatten(test_feats), test_gender, test_smile,
    )
    

In [None]:
# Load the flattened train/test feature arrays together with the gender and
# smile label arrays returned by get_all_data(); later cells read these
# notebook-level variables directly.
tr_X, tr_Y_gender, tr_Y_smile, te_X, te_Y_gender, te_Y_smile = get_all_data()

In [None]:
# Sanity check: print each loaded array's shape, training split first and
# then the test split (features, gender labels, smile labels in each).
for loaded_array in (tr_X, tr_Y_gender, tr_Y_smile, te_X, te_Y_gender, te_Y_smile):
    print(loaded_array.shape)

(4795, 136)
(4795,)
(4795,)
(969, 136)
(969,)
(969,)


In [None]:
# Import various classifiers and metrics from scikit-learn
from sklearn import svm, metrics
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Function to train and evaluate a classifier on the CelebA dataset
# Supports multiple classifier types

def train_and_evaluate_classifier(tr_X, tr_Y, te_X, te_Y, classifier_type):
    """
    Fits one classifier with fixed settings and prints its train/test metrics.

    This is the baseline version: no feature scaling and no hyperparameter
    search. NOTE: a later cell in this notebook defines another function with
    the same name; once that cell has run, it replaces this definition.

    :tr_X: Training features, shape (n_train_samples, n_features)
    :tr_Y: Training labels, shape (n_train_samples,)
    :te_X: Testing features, shape (n_test_samples, n_features)
    :te_Y: Testing labels, shape (n_test_samples,)
    :classifier_type: Type of classifier to use ('SVM', 'LogisticRegression',
        'DecisionTree', 'RandomForest', 'KNN', 'LDA', 'QDA', 'NearestCentroid')
    :raises ValueError: If classifier_type is not one of the names above
    :return: None; all results are printed
    """
    # Factories are lambdas so that only the requested estimator is built.
    factories = {
        'SVM': lambda: svm.SVC(kernel='linear', probability=True),
        'LogisticRegression': lambda: LogisticRegression(max_iter=10000),
        'DecisionTree': lambda: DecisionTreeClassifier(),
        'RandomForest': lambda: RandomForestClassifier(),
        'KNN': lambda: KNeighborsClassifier(),
        'LDA': lambda: LinearDiscriminantAnalysis(),
        'QDA': lambda: QuadraticDiscriminantAnalysis(),
        'NearestCentroid': lambda: NearestCentroid(),
    }

    # Fail early with a clear message; previously an unknown name fell through
    # the if/elif chain and crashed later with an UnboundLocalError on `clf`.
    if classifier_type not in factories:
        raise ValueError(
            f"Unsupported classifier_type {classifier_type!r}; "
            f"expected one of {list(factories)}"
        )
    clf = factories[classifier_type]()

    # Train the classifier
    clf.fit(tr_X, tr_Y)

    # Training-set accuracy, reported next to test accuracy to expose overfitting
    train_predictions = clf.predict(tr_X)
    train_accuracy = metrics.accuracy_score(tr_Y, train_predictions)

    # Test-set accuracy
    predictions = clf.predict(te_X)
    test_accuracy = metrics.accuracy_score(te_Y, predictions)

    # Per-class metrics and confusion matrix on the test set
    report = metrics.classification_report(te_Y, predictions)
    confusion = metrics.confusion_matrix(te_Y, predictions)

    print(f"Classifier: {classifier_type}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(confusion)


In [None]:
# Import additional tools for model selection and preprocessing
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler

# Function to train and evaluate a classifier with hyperparameter tuning and feature scaling
# Uses GridSearchCV for parameter optimization

def train_and_evaluate_classifier(tr_X, tr_Y, te_X, te_Y, classifier_type):
    """
    Tunes one classifier with a 5-fold grid search on standardized features,
    then prints the best parameters, train/test accuracy, the classification
    report, and the confusion matrix.

    Standardization runs inside a Pipeline so that every cross-validation
    split fits the scaler on its own training portion only. Scaling the whole
    training set before the search would let validation-fold statistics leak
    into model selection.

    :tr_X: Training features
    :tr_Y: Training labels
    :te_X: Testing features
    :te_Y: Testing labels
    :classifier_type: Type of classifier to use ('SVM', 'LogisticRegression',
        'DecisionTree', 'KNN', 'LDA', 'QDA', 'NearestCentroid')
    :raises ValueError: If classifier_type is not one of the names above
    :return: None; all results are printed
    """
    # classifier name -> (estimator factory, hyperparameter grid).
    # Factories are lambdas so that only the requested estimator is built.
    search_spaces = {
        'SVM': (
            lambda: svm.SVC(),
            {
                'C': [0.1],
                'kernel': ['linear'],  # Fixed values for faster grid search
            },
        ),
        'LogisticRegression': (
            lambda: LogisticRegression(max_iter=10000),  # Increased max_iter for convergence
            {
                'C': [0.01, 0.1, 1, 10],
                'penalty': ['l2'],
            },
        ),
        'DecisionTree': (
            lambda: DecisionTreeClassifier(),
            {
                'max_depth': [5, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
            },
        ),
        'KNN': (
            lambda: KNeighborsClassifier(),
            {
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance'],
            },
        ),
        # An empty grid makes the search evaluate the default estimator once.
        'LDA': (lambda: LinearDiscriminantAnalysis(), {}),
        'QDA': (lambda: QuadraticDiscriminantAnalysis(), {}),
        'NearestCentroid': (
            lambda: NearestCentroid(),
            {
                'metric': ['euclidean', 'manhattan'],
            },
        ),
    }

    # Fail early with a clear message; previously an unknown name fell through
    # the if/elif chain and crashed later with an UnboundLocalError on `clf`.
    if classifier_type not in search_spaces:
        raise ValueError(
            f"Unsupported classifier_type {classifier_type!r}; "
            f"expected one of {list(search_spaces)}"
        )
    make_estimator, param_grid = search_spaces[classifier_type]

    # Scale data for better convergence; the scaler is refit per CV split.
    step_name = 'clf'
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        (step_name, make_estimator()),
    ])
    # Pipeline parameters are addressed as "<step>__<parameter>".
    pipeline_grid = {
        f'{step_name}__{param}': values for param, values in param_grid.items()
    }

    # Hyperparameter tuning with grid search
    grid_search = GridSearchCV(pipeline, pipeline_grid, cv=5, n_jobs=-1)
    grid_search.fit(tr_X, tr_Y)
    # Refit on the full training set: scaler and classifier together.
    best_clf = grid_search.best_estimator_
    # Strip the step prefix so the report shows plain estimator parameter names.
    best_params = {
        name.partition('__')[2]: value
        for name, value in grid_search.best_params_.items()
    }

    # Training set predictions to check for overfitting
    train_predictions = best_clf.predict(tr_X)
    train_accuracy = accuracy_score(tr_Y, train_predictions)

    # Test set predictions
    predictions = best_clf.predict(te_X)
    test_accuracy = accuracy_score(te_Y, predictions)

    # Evaluation metrics
    report = classification_report(te_Y, predictions)
    confusion = metrics.confusion_matrix(te_Y, predictions)

    print(f"Classifier: {classifier_type}")
    print(f"Best Parameters: {best_params}")
    print(f"Training Accuracy: {train_accuracy:.4f}")
    print(f"Testing Accuracy: {test_accuracy:.4f}")
    print("Classification Report:")
    print(report)
    print("Confusion Matrix:")
    print(confusion)

In [None]:
# Run every classifier on both tasks: for each classifier name, print a
# banner, evaluate on the gender labels, evaluate on the smile labels, and
# close with a separator line.
for clf_type in ('SVM', 'LogisticRegression', 'DecisionTree', 'KNN', 'LDA', 'QDA', 'NearestCentroid'):
    print(f"\n                                                Training and Evaluating: {clf_type}")

    # (banner, training labels, testing labels) for each task, in print order.
    for task_banner, train_labels, test_labels in (
        ("\n                                                    Gender Classification", tr_Y_gender, te_Y_gender),
        ("\n                                                     Smile Classification", tr_Y_smile, te_Y_smile),
    ):
        print(task_banner)
        train_and_evaluate_classifier(tr_X, train_labels, te_X, test_labels, classifier_type=clf_type)

    print("_________________________________________________________________________________________________________________________________________________________")


                                                Training and Evaluating: SVM

                                                    Gender Classification


Classifier: SVM
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Training Accuracy: 0.9328
Testing Accuracy: 0.9092
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.92      0.91       488
         1.0       0.92      0.90      0.91       481

    accuracy                           0.91       969
   macro avg       0.91      0.91      0.91       969
weighted avg       0.91      0.91      0.91       969

Confusion Matrix:
[[449  39]
 [ 49 432]]

                                                     Smile Classification
Classifier: SVM
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Training Accuracy: 0.9047
Testing Accuracy: 0.9061
Classification Report:
              precision    recall  f1-score   support

         0.0       0.90      0.91      0.90       472
         1.0       0.91      0.90      0.91       497

    accuracy                           0.91       969
   macro avg       0.91      0.91      0.91       969
weighted 

In [None]:
# Example: train and evaluate only the SVM classifier on the gender labels.
# This is the same call the loop in the previous cell makes for 'SVM' on the
# gender task; the function prints its metrics and returns nothing.
train_and_evaluate_classifier(tr_X, tr_Y_gender, te_X, te_Y_gender, classifier_type="SVM")

Classifier: SVM
Best Parameters: {'C': 0.1, 'kernel': 'linear'}
Training Accuracy: 0.9343
Testing Accuracy: 0.9051
Classification Report:
              precision    recall  f1-score   support

         0.0       0.89      0.92      0.91       488
         1.0       0.92      0.89      0.90       481

    accuracy                           0.91       969
   macro avg       0.91      0.90      0.91       969
weighted avg       0.91      0.91      0.91       969

Confusion Matrix:
[[450  38]
 [ 54 427]]
