# This notebook loads pre-extracted CNN features, applies PCA to them, trains several models on the reduced features for testing, and stores the trained models in .pkl files


In [2]:
import torch

In [3]:
import numpy as np

In [4]:
import sklearn.model_selection 

In [5]:
# Load the pre-extracted CNN features; FacesData below consumes the result as a
# mapping of file path -> feature array.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code,
# so only load feature files that come from a trusted source.
cnn_features = torch.load('extracted_resnet_features.pt')

In [6]:
import numpy as np

class FacesData:
    """Adapter that turns a ``{file_path: feature_array}`` mapping into arrays.

    The class label of each sample is the second '/'-separated component of
    its key (for example ``'<root>/<label>/<file>'``).
    """

    def __init__(self, data_dict):
        # Keep a reference to the mapping; nothing is copied or validated here.
        self.data_dict = data_dict

    def get_X_y(self):
        """Return ``(X, y)`` as numpy arrays.

        ``X`` stacks the flattened feature array of every entry row-wise and
        ``y`` holds the matching labels; rows follow the iteration order of
        ``data_dict``.
        """
        # Build (label, flattened features) pairs in one pass over the mapping.
        pairs = [
            (path.split('/')[1], features.flatten())
            for path, features in self.data_dict.items()
        ]
        rows = [row for _, row in pairs]
        labels = [label for label, _ in pairs]
        return np.array(rows), np.array(labels)



# Convert the loaded feature mapping into the feature matrix X and label vector y
# that the cells below train on.
faces_data = FacesData(cnn_features)
X, y = faces_data.get_X_y()

# Quick sanity check: report the dimensions of both arrays.
print('Features (X):', np.shape(X))
print('Labels (y):', np.shape(y))


Features (X): (13233, 2048)
Labels (y): (13233,)


### The cells above contain the imports and the code that builds the face data from pre-extracted CNN features

### Code for PCA

In [7]:
from sklearn.decomposition import PCA

# Numbers of leading principal components to expose as separate feature sets.
n_features_list = [32, 64, 128]

# Dictionary to store the transformed data for each feature set.
transformed_data_dict = {}

# Fit PCA once with the largest component count needed; the smaller feature
# sets are taken as the leading columns of this single projection.
# random_state is fixed because PCA's default svd_solver='auto' may select the
# randomized SVD solver, which would otherwise give a slightly different
# projection (and therefore different downstream models) on every run.
# NOTE(review): the fitted `pca` is not saved in this cell; applying any of the
# pickled models to new data needs this same fitted transform - confirm it is
# persisted somewhere.
pca = PCA(n_components=max(n_features_list), random_state=42)
X_pca = pca.fit_transform(X)

# Slice the projection into the different feature sets (views, no copies).
for n_features in n_features_list:
    transformed_data_dict[n_features] = X_pca[:, :n_features]

# Convenience names for the individual feature sets used by the model cells.
transformed_data_32 = transformed_data_dict[32]
transformed_data_64 = transformed_data_dict[64]
transformed_data_128 = transformed_data_dict[128]

### Logistic Regression model

In [7]:
from sklearn.linear_model import LogisticRegression
import joblib

# Accuracy results for this model, keyed by number of PCA components.
accuracies = {}

# L2-regularised logistic regression on the 64-component PCA features.
# The 'saga' solver shuffles the data while optimising, so the seed is fixed
# (42, the same value the other models in this notebook use) to make the fitted
# model and the reported accuracy reproducible across runs.
clf_64 = LogisticRegression(
    solver='saga',
    penalty='l2',
    max_iter=1000,
    tol=0.01,
    n_jobs=-1,
    random_state=42,
)
clf_64.fit(transformed_data_64, y)

# Persist the fitted model.
joblib.dump(clf_64, 'logistic_regression_64_features.pkl')

# NOTE: this is accuracy on the same data the model was fitted on, not a
# held-out estimate.
accuracies['64'] = clf_64.score(transformed_data_64, y)

In [8]:
# Accuracy of the logistic regression on the 64-component features, measured on
# the same data it was fitted on (training accuracy, not a held-out score).
print(accuracies['64'])

0.6956850298496183


### Random Forest Model

In [9]:
from sklearn.ensemble import RandomForestClassifier
import joblib
# Accuracy results for the random forest, keyed by number of PCA components.
accuracies_RF = {}

# A small, depth-limited forest: n_estimators and max_features are the knobs to
# turn when trading accuracy against training time. It is fitted on the
# 64-component PCA features; swap in transformed_data_32 or
# transformed_data_128 to try a different number of components.
rf_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    max_features='sqrt',
    n_jobs=-1,
    random_state=42,
).fit(transformed_data_64, y)

# Persist the fitted forest.
joblib.dump(rf_model, 'random_forest_model_64_features.pkl')

# Accuracy measured on the training data itself.
accuracies_RF['64'] = rf_model.score(transformed_data_64, y)

In [10]:
# Accuracy of the random forest on the 64-component features, measured on the
# same data it was fitted on (training accuracy, not a held-out score).
print(accuracies_RF['64'])

0.5502909393183707


### XGBoost Model

In [9]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import time
# Inputs: transformed_data_64 (64-component PCA features) and y (string class
# labels), both produced by the cells above.

# The classifier is trained on integer-encoded labels, so map the string labels
# in 'y' to integer class ids first.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Accuracy results for XGBoost, keyed by number of PCA components.
accuracies_XGB = {}

# n_jobs=-1 uses all available cores.
xgb_model_64 = xgb.XGBClassifier(n_estimators=20, max_depth=20, random_state=42, n_jobs=-1)

# Time only the fit.
start_time = time.time()
xgb_model_64.fit(transformed_data_64, y_encoded)
end_time = time.time()
# Fixed: this was assigned to the misspelled name `elaspsed_time`, so the print
# below either raised NameError on a fresh kernel or reported a stale
# `elapsed_time` left over from a different cell.
elapsed_time = end_time - start_time

# Persist the fitted model.
joblib.dump(xgb_model_64, 'xgb_model_64_features.pkl')

# Accuracy measured on the training data itself, against the encoded labels.
predictions_64 = xgb_model_64.predict(transformed_data_64)
accuracies_XGB['64'] = accuracy_score(y_encoded, predictions_64)
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")

Time taken to fit the model: 456.22 seconds


In [11]:
# Accuracy of the XGBoost model on the 64-component features, measured on the
# same data it was fitted on (training accuracy, not a held-out score).
print(accuracies_XGB['64'])

0.22270082369832994


### LinearSVC Model

In [12]:
from sklearn.svm import LinearSVC
import joblib
import time

# Accuracy results for the linear SVM, keyed by number of PCA components.
accuracies_SVM = {}

# Linear support-vector classifier with a fixed seed.
linear_svm_model = LinearSVC(max_iter=1000, C=1.0, random_state=42)

# Time only the fit on the 64-component PCA features.
start_time = time.time()
linear_svm_model.fit(transformed_data_64, y)
end_time = time.time()
elapsed_time = end_time - start_time

# Persist the fitted model, then record its accuracy on the training data.
joblib.dump(linear_svm_model, 'linear_svc_64_features.pkl')
accuracies_SVM['64'] = linear_svm_model.score(transformed_data_64, y)

# Report how long training took.
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")




Time taken to fit the model: 1081.61 seconds


In [13]:
# Accuracy of the LinearSVC model on the 64-component features, measured on the
# same data it was fitted on (training accuracy, not a held-out score).
print(accuracies_SVM['64'])

0.8703241895261845


### Balanced Logistic Regression

In [15]:
from sklearn.linear_model import LogisticRegression
import joblib

# Accuracy results for this model, keyed by number of PCA components.
# NOTE: this rebinds `accuracies` and `clf_64` from the unbalanced logistic
# regression cell earlier in the notebook.
accuracies = {}

# Logistic regression on the 64 PCA components with class_weight='balanced',
# i.e. classes are re-weighted inversely to their frequency in y.
# The 'saga' solver shuffles the data while optimising, so the seed is fixed
# (42, the same value the other models in this notebook use) to make the fitted
# model and the reported accuracy reproducible across runs.
clf_64 = LogisticRegression(
    solver='saga',
    penalty='l2',
    max_iter=1000,
    tol=0.01,
    n_jobs=-1,
    class_weight='balanced',
    random_state=42,
)
clf_64.fit(transformed_data_64, y)

# Persist the fitted model.
joblib.dump(clf_64, 'balanced_logistic_regression_64_features.pkl')

# NOTE: this is accuracy on the same data the model was fitted on, not a
# held-out estimate.
accuracies['64'] = clf_64.score(transformed_data_64, y)

In [16]:
# Accuracy of the class-balanced logistic regression on the 64-component
# features, measured on the same data it was fitted on (training accuracy).
print(accuracies['64'])

0.6103680193455755


### KNN Model

In [8]:
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Accuracy results for k-nearest neighbours, keyed by number of PCA components.
accuracies_KNN = {}

# 5-nearest-neighbour classifier fitted on the 64-component PCA features;
# 'n_neighbors' is the main knob for trading accuracy against run time.
knn_model = KNeighborsClassifier(n_jobs=-1, n_neighbors=5).fit(transformed_data_64, y)

# Persist the fitted model.
joblib.dump(knn_model, 'knn_model_64_features.pkl')

# Accuracy measured on the training data itself.
accuracies_KNN['64'] = knn_model.score(transformed_data_64, y)

In [9]:
# Accuracy of the KNN model on the 64-component features, measured on the same
# data it was fitted on (training accuracy, not a held-out score).
print(accuracies_KNN['64'])

0.31209854152497546
