# This notebook loads the extracted LBP features, applies PCA to them, trains several models on the reduced features for testing, and stores the trained models in .pkl files


In [2]:
import torch

In [3]:
import numpy as np

In [4]:
# Load the pre-extracted LBP features from disk. torch.load relies on pickle,
# so only open files that come from a trusted source.
# NOTE(review): presumably a dict mapping image path -> feature array (it is
# consumed that way by FacesData.get_X_y) — confirm against the extraction step.
lbp_features = torch.load('extracted_lbp_features.pt')

In [5]:
import numpy as np

class FacesData:
    """Wrap a ``{file path: feature array}`` mapping and expose it as arrays."""

    def __init__(self, data_dict):
        # Keep a reference to the mapping; nothing is copied or validated here.
        self.data_dict = data_dict

    def get_X_y(self):
        """Return ``(X, y)`` as NumPy arrays.

        ``X`` holds one flattened feature vector per entry of the mapping and
        ``y`` the matching label, taken from the second ``/``-separated
        component of each key.
        """
        # For every entry the label is resolved first, then the features are
        # flattened, in the mapping's iteration order.
        samples = [
            (path.split('/')[1], features.flatten())
            for path, features in self.data_dict.items()
        ]
        X = np.array([flat for _, flat in samples])
        y = np.array([label for label, _ in samples])
        return X, y



# Build the feature matrix X and label vector y from the loaded LBP feature dict.
faces_data = FacesData(lbp_features)
X, y = faces_data.get_X_y()

# Sanity check: X has one row and y one entry per item in the feature dict.
print('Features (X):', X.shape)
print('Labels (y):', y.shape)


Features (X): (13233, 256)
Labels (y): (13233,)


In [11]:
# Count the distinct labels present in y (i.e. the number of classes).
print(np.unique(y).shape)

(5749,)


### The cells above contain the imports and the code that builds the face dataset (X, y) from the LBP features

### Code for PCA

In [6]:
from sklearn.decomposition import PCA

# Number of leading principal components kept for each feature set.
n_features_list = [32, 64, 128]

# Maps "number of components" -> PCA-transformed data with that many columns.
transformed_data_dict = {}

# Fit PCA once with the largest number of components needed; the smaller
# feature sets are taken as the leading columns of this single projection.
# random_state is pinned because PCA may pick the randomized SVD solver for
# this problem size, which would otherwise give a slightly different projection
# on every run and make the models trained on it non-reproducible.
# NOTE(review): the fitted `pca` object is not persisted in this cell; new
# samples must be projected with the same fitted PCA before they can be fed to
# the pickled models — confirm whether it is saved elsewhere.
pca = PCA(n_components=max(n_features_list), random_state=42)
X_pca = pca.fit_transform(X)

# Slice out the first n components for each requested size.
for n_features in n_features_list:
    transformed_data_dict[n_features] = X_pca[:, :n_features]

# Convenience names for the individual feature sets.
transformed_data_32 = transformed_data_dict[32]
transformed_data_64 = transformed_data_dict[64]
transformed_data_128 = transformed_data_dict[128]

### Random Forest Model

In [7]:
from sklearn.ensemble import RandomForestClassifier
import joblib
import time

# Training-set accuracy of the random forest, keyed by number of PCA components.
accuracies_RF = {}
# Random forest with a fixed seed; n_jobs=-1 uses all available cores.
rf_model = RandomForestClassifier(n_estimators=10, max_depth=20, max_features='sqrt', random_state=42, n_jobs=-1)

# Time the fit with perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()

# Train on the 32-component PCA features.
rf_model.fit(transformed_data_32, y)

elapsed_time = time.perf_counter() - start_time

# Save the trained random forest model.
joblib.dump(rf_model, 'LBP_random_forest_model_32_features.pkl')

# Store (not print) the accuracy measured on the training data itself; this is
# an optimistic figure because no held-out set is used.
accuracies_RF['32'] = rf_model.score(transformed_data_32, y)

# Report how long the fit took.
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")

Time taken to fit the model: 23.12 seconds


In [8]:
# Training-set accuracy of the random forest on the 32-component PCA features.
print(accuracies_RF['32'])

0.3895564120003023


### XGBoost Model

In [8]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import time
# Uses transformed_data_32 (32-component PCA features) and the label array 'y'
# produced by the earlier cells.

# XGBoost needs integer class ids, so encode the string labels in 'y'.
# NOTE(review): label_encoder is not persisted in this cell; it is required to
# map predictions of the pickled model back to the original labels — confirm
# whether it is saved elsewhere.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Training-set accuracy of the XGBoost models, keyed by number of PCA components.
accuracies_XGB = {}

# n_jobs=-1 uses all available cores.
xgb_model_32 = xgb.XGBClassifier(n_estimators=20, max_depth=20, random_state=42, n_jobs=-1)

# Time the fit with perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()
xgb_model_32.fit(transformed_data_32, y_encoded)
elapsed_time = time.perf_counter() - start_time

# Save the trained XGBoost model.
joblib.dump(xgb_model_32, 'LBP_xgb_model_32_features.pkl')

# Accuracy is measured on the training data itself (no held-out set).
predictions_32 = xgb_model_32.predict(transformed_data_32)
accuracies_XGB['32'] = accuracy_score(y_encoded, predictions_32)
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")

Time taken to fit the model: 129.59 seconds


In [9]:
# Training-set accuracy of the XGBoost model on the 32-component PCA features.
print(accuracies_XGB['32'])

0.040051386684803146


### Logistic Regression Model

In [10]:
from sklearn.linear_model import LogisticRegression
import joblib
import time
# Training-set accuracy of the logistic regression models, keyed by number of
# PCA components.
accuracies = {}

# Logistic regression on the 32-component PCA features. The saga solver
# shuffles the data internally, so random_state is pinned to make the fitted
# (and pickled) model reproducible, like the other seeded models in this notebook.
clf_32 = LogisticRegression(solver='saga', penalty='l2', max_iter=1000, tol=0.01, random_state=42, n_jobs=-1)
# Time the fit with perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()
clf_32.fit(transformed_data_32, y)
elapsed_time = time.perf_counter() - start_time
# Save the trained model, then record its accuracy on the training data itself.
joblib.dump(clf_32, 'LBP_logistic_regression_32_features.pkl')
accuracies['32'] = clf_32.score(transformed_data_32, y)
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")

Time taken to fit the model: 1643.13 seconds


In [11]:
# Training-set accuracy of the logistic regression model on the 32-component
# PCA features.
print(accuracies['32'])

0.03317463915967657


### Testing out metrics: top-5 accuracy of the saved Random Forest model

In [15]:
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.metrics import top_k_accuracy_score

# Path to the saved random forest model trained on 32 PCA components.
model_path = 'LBP_random_forest_model_32_features.pkl'

# Reload the random forest from disk (despite the variable name, this is not
# the logistic regression model). joblib.load unpickles the file, so only load
# files from a trusted source.
clf_32_loaded = joblib.load(model_path)

# Class-probability estimates for the training data.
probabilities = clf_32_loaded.predict_proba(transformed_data_32)

# Top-5 accuracy on the training data. The columns of predict_proba follow
# clf_32_loaded.classes_, so pass that ordering explicitly instead of letting
# the metric infer it from the labels that happen to occur in y.
top_5_accuracy = top_k_accuracy_score(y, probabilities, k=5, labels=clf_32_loaded.classes_)
print("Top-5 Accuracy:", top_5_accuracy)

Top-5 Accuracy: 0.5091060228217336


### LinearSVC Model

In [7]:
from sklearn.svm import LinearSVC
import joblib
import time

# Training-set accuracy of the LinearSVC models, keyed by number of PCA components.
accuracies_SVM = {}

# Linear SVM with a fixed seed.
# NOTE(review): the scikit-learn docs recommend dual=False when
# n_samples > n_features, which appears to be the case here — worth trying
# given the very long fit time.
linear_svm_model = LinearSVC(C=1.0, random_state=42, max_iter=1000)

# Time the fit with perf_counter(): a monotonic, high-resolution clock meant
# for measuring intervals (time.time() follows the wall clock and can jump).
start_time = time.perf_counter()

# Train on the 32-component PCA features.
linear_svm_model.fit(transformed_data_32, y)

end_time = time.perf_counter()
elapsed_time = end_time - start_time
# Save the trained LinearSVC model.
joblib.dump(linear_svm_model, 'LBP_linear_svc_32_features.pkl')

# Store (not print) the accuracy measured on the training data itself.
accuracies_SVM['32'] = linear_svm_model.score(transformed_data_32, y)

# Report how long the fit took.
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")




Time taken to fit the model: 19080.54 seconds


### KNN Model

In [7]:
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Training-set accuracy of the KNN models, keyed by number of PCA components.
accuracies_KNN = {}

# 5-nearest-neighbours classifier; n_jobs=-1 uses all available cores.
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Fit on the 32-component PCA features.
# (transformed_data_64 / transformed_data_128 can be substituted here.)
knn_model.fit(transformed_data_32, y)

# Save the trained KNN model.
joblib.dump(knn_model, 'LBP_knn_model_32_features.pkl')

# Store (not print) the accuracy measured on the training data itself.
accuracies_KNN['32'] = knn_model.score(transformed_data_32, y)

In [8]:
# Training-set accuracy of the KNN model on the 32-component PCA features.
print(accuracies_KNN['32'])

0.2185445477216051


### Testing out metrics: regular and top-5 accuracy of the saved KNN model

In [9]:
import numpy as np
import joblib
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import accuracy_score

# Reload the trained KNN model from disk. joblib.load unpickles the file, so
# only load files from a trusted source.
knn_model = joblib.load('LBP_knn_model_32_features.pkl')  # Update path as needed

# Uses 'transformed_data_32' (features) and 'y' (labels) from the earlier cells.

# KNN has no decision_function, so predict_proba supplies the scores for top-k.
probabilities = knn_model.predict_proba(transformed_data_32)
# The columns of predict_proba follow knn_model.classes_, so pass that ordering
# explicitly instead of letting the metric infer it from the labels in y.
# NOTE(review): this is evaluated on the same data the model was fitted on with
# n_neighbors=5. Each sample is then (barring exact duplicates) one of its own
# 5 neighbours, and at most 5 classes get non-zero probability, so a top-5
# score of 1.00 is expected by construction and says nothing about
# generalisation — use held-out data for a meaningful figure.
sklearn_top5_accuracy = top_k_accuracy_score(y, probabilities, k=5, labels=knn_model.classes_)

# Regular (top-1) accuracy from the hard predictions, also on the training data.
predictions = knn_model.predict(transformed_data_32)
accuracy = accuracy_score(y, predictions)
print(f"Regular Accuracy for KNN_32 : {accuracy:.2f}")
print(f"Top-5 Accuracy for KNN_32: {sklearn_top5_accuracy:.2f}")

Regular Accuracy for KNN_32 : 0.22
Top-5 Accuracy for KNN_32: 1.00
