# This notebook consists of loading extracted CNN features , applying PCA on it and then some models for testing and then storing in .pkl files


In [1]:
import torch

In [2]:
import numpy as np

In [3]:
import sklearn.model_selection 

In [4]:
cnn_features = torch.load('extracted_resnet_features.pt')

In [5]:
import numpy as np


class FacesData:
    def __init__(self, data_dict):
        self.data_dict = data_dict

    def get_X_y(self):
        X = []
        y = []
        for key, value in self.data_dict.items():
            # Extract the label from the file path
            label = key.split('/')[1]
            X.append(value.flatten())  # Flatten the feature array
            y.append(label)
        return np.array(X), np.array(y)



faces_data = FacesData(cnn_features)
X, y = faces_data.get_X_y()

print('Features (X):', X.shape)
print('Labels (y):', y.shape)


Features (X): (13233, 2048)
Labels (y): (13233,)


### Till above are imports and code for face data using pre extracted CNN

In [22]:
print(X[0].shape)

(2048,)


### Code for PCA

In [12]:
from sklearn.decomposition import PCA

# Define the number of features for each split
n_features_list = [32, 64, 128]

# Dictionary to store the transformed data for each feature set
transformed_data_dict = {}
# Fit PCA with the maximum number of components needed
pca = PCA(n_components=max(n_features_list) )
X_pca = pca.fit_transform(X)

# Split the data into different feature sets
for n_features in n_features_list:
    transformed_data_dict[n_features] = X_pca[:, :n_features]

# Access the transformed data for a specific number of features
transformed_data_32 = transformed_data_dict[32]
transformed_data_64 = transformed_data_dict[64]
transformed_data_128 = transformed_data_dict[128]

### Logisitic Regression model

In [10]:
from sklearn.linear_model import LogisticRegression
import joblib

accuracies = {}

# Train logistic regression for 32 PCA components
clf_32 = LogisticRegression(solver='saga', penalty='l2', max_iter=1000, tol=0.01, n_jobs=-1)
clf_32.fit(transformed_data_32, y)
joblib.dump(clf_32, 'logistic_regression_32_features.pkl')
accuracies['32'] = clf_32.score(transformed_data_32, y)

In [11]:
print(accuracies['32'])

0.5230106551802313


### Random Forest Model

In [16]:
from sklearn.ensemble import RandomForestClassifier
import joblib
accuracies_RF = {}
# Initialize the Random Forest model
# You can adjust n_estimators and max_features to balance between performance and training time
rf_model = RandomForestClassifier(n_estimators=10, max_depth = 20 , max_features='sqrt', random_state=42, n_jobs=-1)

# Train the Random Forest model on the dataset
# Use transformed_data_32, transformed_data_64, or transformed_data_128 depending on the PCA components you want to use
rf_model.fit(transformed_data_32, y)

# Save the trained Random Forest model
joblib.dump(rf_model, 'random_forest_model_32_features.pkl')
# Print accuracy on the training dataset
accuracies_RF['32'] = rf_model.score(transformed_data_32, y)

In [17]:
print(accuracies_RF['32'])

0.6315272424998111


### XGBoost Model

In [18]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Assuming transformed_data_32, transformed_data_64, and transformed_data_128 are your feature sets
# and 'y' is your target variable with class labels

# Encode the class labels in 'y'
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Initialize a dictionary to store accuracies for different feature sets
accuracies_XGB = {}

# Set n_jobs to -1 to use all available cores
xgb_model_32 = xgb.XGBClassifier(n_estimators=20, max_depth=20, random_state=42, n_jobs=-1)
xgb_model_32.fit(transformed_data_32, y_encoded)
joblib.dump(xgb_model_32, 'xgb_model_32_features.pkl')
predictions_32 = xgb_model_32.predict(transformed_data_32)
accuracies_XGB['32'] = accuracy_score(y_encoded, predictions_32)

In [30]:
print(accuracies_XGB['32'])

0.210685407692889


### Testing out a few metrics to write in report

In [9]:
from sklearn.linear_model import LogisticRegression
import joblib
from sklearn.metrics import top_k_accuracy_score

# Path to the saved model file
model_path = 'logistic_regression_32_features.pkl'

# Load the logistic regression model from the pickle file
clf_32_loaded = joblib.load(model_path)

# Use the loaded model to predict the probabilities of the training data
probabilities = clf_32_loaded.predict_proba(transformed_data_32)

# Evaluate the top-5 accuracy of the model on the training data
top_5_accuracy = top_k_accuracy_score(y, probabilities, k=1)
print("Top-5 Accuracy:", top_5_accuracy)

Top-5 Accuracy: 0.5230106551802313


### LinearSVC Model

In [7]:
from sklearn.svm import LinearSVC
import joblib
import time

accuracies_SVM = {}

# Initialize the LinearSVC
linear_svm_model = LinearSVC(C=1.0, random_state=42, max_iter=1000)

# Start timing
start_time = time.time()

# Train the LinearSVC model on the dataset with 32 PCA components
linear_svm_model.fit(transformed_data_32, y)

# Calculate elapsed time
elapsed_time = time.time() - start_time

# Save the trained SVM model
joblib.dump(linear_svm_model, 'linear_svc_32_features.pkl')

# Print accuracy on the training dataset
accuracies_SVM['32'] = linear_svm_model.score(transformed_data_32, y)

# Print the time taken to train the model
print(f"Time taken to fit the model: {elapsed_time:.2f} seconds")




Time taken to fit the model: 796.62 seconds


In [9]:
print(accuracies_SVM['32'])

0.6727121589964483


### Testing out metrics

In [13]:
import numpy as np
import joblib
from sklearn.metrics import top_k_accuracy_score

# Load the trained LinearSVC model
linear_svm_model = joblib.load('linear_svc_32_features.pkl')

# Assuming 'transformed_data_32' is your feature data and 'y' are your labels
decision_scores = linear_svm_model.decision_function(transformed_data_32)
sklearn_top5_accuracy = top_k_accuracy_score(y, decision_scores, k=5)
print("Sklearn Top-5 Accuracy: {:.2f}".format(sklearn_top5_accuracy))

# Regular accuracy
predictions = linear_svm_model.predict(transformed_data_32)
accuracy = accuracy_score(y, predictions)
print(f"Regular Accuracy: {accuracy:.2f}")

Sklearn Top-5 Accuracy: 0.82
Regular Accuracy: 0.67


### Trying out a balanced version of Logistic Regression to check its results

In [20]:
from sklearn.linear_model import LogisticRegression
import joblib

accuracies = {}

# Train logistic regression for 32 PCA components
clf_32 = LogisticRegression(solver='saga', penalty='l2', max_iter=1000, tol=0.01, n_jobs=-1 , class_weight = 'balanced')
clf_32.fit(transformed_data_32, y)
joblib.dump(clf_32, 'balanced_logistic_regression_32_features.pkl')
accuracies['32'] = clf_32.score(transformed_data_32, y)

In [21]:
print(accuracies['32'])

0.5422806619814101


### KNN Model code

In [19]:
from sklearn.neighbors import KNeighborsClassifier
import joblib

accuracies_KNN = {}

# Initialize the KNN model
# You can adjust 'n_neighbors' and other parameters to balance between performance and training time
knn_model = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Train the KNN model on the dataset using 32 PCA components
# You can change to transformed_data_64 or transformed_data_128 as needed
knn_model.fit(transformed_data_32, y)

# Save the trained KNN model
joblib.dump(knn_model, 'knn_model_32_features.pkl')

# Print accuracy on the training dataset
accuracies_KNN['32'] = knn_model.score(transformed_data_32, y)

In [16]:
print(accuracies_KNN['32'])

0.29486888838509784


In [17]:
import numpy as np
import joblib
from sklearn.metrics import top_k_accuracy_score
from sklearn.metrics import accuracy_score

# Load the trained KNN model
knn_model = joblib.load('knn_model_32_features.pkl')  # Update path as needed

# Assuming 'transformed_data_32' is your feature data and 'y' are your labels

# KNN does not have a decision_function method, but we can use predict_proba for top-k accuracy
probabilities = knn_model.predict_proba(transformed_data_32)
# Calculate top-k accuracy using the probabilities
sklearn_top5_accuracy = top_k_accuracy_score(y, probabilities, k=5)

# Regular accuracy using predictions
predictions = knn_model.predict(transformed_data_32)
accuracy = accuracy_score(y, predictions)
print(f"Regular Accuracy for KNN_32 : {accuracy:.2f}")
print(f"Top-5 Accuracy for KNN_32: {sklearn_top5_accuracy:.2f}")

Regular Accuracy for KNN_32 : 0.29
Top-5 Accuracy for KNN_32: 1.00


In [20]:
import sklearn
print(sklearn.__version__)


1.3.0
