# This notebook loads the extracted HoG features, applies PCA to them, trains several models on the reduced features for testing, and stores the fitted models in .pkl files


In [1]:
import numpy as np
import torch

class FacesData:
    """Load feature dictionaries from ``.pt`` files and expose them as arrays.

    Every file in ``data_paths`` is read with ``torch.load`` and merged into a
    single dictionary, ``data_dict``, whose keys are '/'-separated paths and
    whose values are the feature arrays.
    """

    def __init__(self, data_paths):
        """Remember ``data_paths`` and immediately load all of them."""
        self.data_paths = data_paths
        self.data_dict = self.load_data()

    def load_data(self):
        """Return one dictionary containing the entries of every file.

        Files are merged in list order with ``dict.update``, so when the same
        key occurs in more than one file the value from the later file wins.
        """
        combined_data = {}
        for path in self.data_paths:
            # NOTE(review): torch.load is pickle-based; only load trusted files.
            data = torch.load(path)
            combined_data.update(data)
        return combined_data

    def get_X_y(self):
        """Build the feature matrix and label vector from ``data_dict``.

        Returns:
            tuple: ``(X, y)`` where ``X`` stacks one flattened feature vector
            per entry and ``y`` holds the matching labels, both as numpy
            arrays in the iteration order of ``data_dict``.

        Raises:
            IndexError: if a key contains no '/' separator, because the label
                is taken from the second '/'-separated component of the key.
        """
        X = []
        y = []
        for sample_count, (key, value) in enumerate(self.data_dict.items(), start=1):
            label = key.split('/')[1]  # second path component is the label
            # np.asarray accepts ndarrays, tensors and nested lists alike, so
            # the value does not need to provide its own .flatten() method.
            X.append(np.asarray(value).reshape(-1))
            y.append(label)
            if sample_count % 200 == 0:  # progress report every 200 samples
                print(f"{sample_count} samples processed.")
        return np.array(X), np.array(y)

# Feature shards to merge; each .pt file holds one slice of the dataset
data_paths = [
    'extracted_hog_features_0_2000.pt',
    'extracted_hog_features_2000_4000.pt',
    'extracted_hog_features_4000_6000.pt',
    'extracted_hog_features_6000_8000.pt',
    'extracted_hog_features_8000_10000.pt',
    'extracted_hog_features_10000_12000.pt',
    'extracted_hog_features_12000_end.pt',
]

# Load every shard, then build the feature matrix X and the label vector y
faces_data = FacesData(data_paths)
X, y = faces_data.get_X_y()

# Report the resulting array shapes as a quick sanity check
print('Features (X):', X.shape)
print('Labels (y):', y.shape)

200 samples processed.
400 samples processed.
600 samples processed.
800 samples processed.
1000 samples processed.
1200 samples processed.
1400 samples processed.
1600 samples processed.
1800 samples processed.
2000 samples processed.
2200 samples processed.
2400 samples processed.
2600 samples processed.
2800 samples processed.
3000 samples processed.
3200 samples processed.
3400 samples processed.
3600 samples processed.
3800 samples processed.
4000 samples processed.
4200 samples processed.
4400 samples processed.
4600 samples processed.
4800 samples processed.
5000 samples processed.
5200 samples processed.
5400 samples processed.
5600 samples processed.
5800 samples processed.
6000 samples processed.
6200 samples processed.
6400 samples processed.
6600 samples processed.
6800 samples processed.
7000 samples processed.
7200 samples processed.
7400 samples processed.
7600 samples processed.
7800 samples processed.
8000 samples processed.
8200 samples processed.
8400 samples process

In [2]:
# Shape of the array of distinct labels, i.e. how many classes y contains
distinct_labels = np.unique(y)
print(distinct_labels.shape)

(5749,)


In [3]:
import numpy as np
from sklearn.decomposition import PCA
import joblib

# Reduce the feature matrix X (built in an earlier cell) with PCA and save
# both the fitted model and the reduced features.

# Numbers of leading principal components to export as separate feature sets
n_features_list = [32, 64, 128]

# Fit once with the largest requested size; the smaller feature sets are then
# taken as the leading columns of the same projection.
# random_state pins the result for the case where scikit-learn selects its
# randomized SVD solver, so re-running the notebook reproduces the saved files.
pca = PCA(n_components=max(n_features_list), random_state=42)
X_pca = pca.fit_transform(X)

# Store the PCA model
joblib.dump(pca, 'pca_model.joblib')

# Transformed data for each feature-set size, keyed by the number of components
transformed_data_dict = {
    n_features: X_pca[:, :n_features] for n_features in n_features_list
}

# Save each transformed feature set to its own file
for n_features, data in transformed_data_dict.items():
    joblib.dump(data, f'HoG_transformed_data_{n_features}.joblib')


In [4]:
import joblib

# Persist the label array y (built in an earlier cell) so it can be reloaded
# together with the saved PCA features.
labels_path = 'labels.joblib'
joblib.dump(y, labels_path)


['labels.joblib']

### PCA was applied to the extracted HoG features, and the transformed data was saved for future use (see the code above).

### Loading PCA transformed data

In [7]:
# Restore the fitted PCA model from disk
pca_loaded = joblib.load('pca_model.joblib')

# Restore the three reduced feature matrices (32, 64 and 128 components);
# the model cells below train on these arrays.
transformed_data_32, transformed_data_64, transformed_data_128 = (
    joblib.load('HoG_transformed_data_32.joblib'),
    joblib.load('HoG_transformed_data_64.joblib'),
    joblib.load('HoG_transformed_data_128.joblib'),
)


### Random Forest Model

In [10]:
from sklearn.ensemble import RandomForestClassifier
import joblib
import time

# Training-set accuracy of the random forest, keyed by PCA component count
accuracies_RF = {}

# Small forest: n_estimators and max_depth trade accuracy for training time
rf_model = RandomForestClassifier(
    n_estimators=10,
    max_depth=20,
    max_features='sqrt',
    random_state=42,
    n_jobs=-1,
)

# Fit on the 32-component features and measure how long the fit takes
# (swap in transformed_data_64 / transformed_data_128 for other sizes)
start_time = time.time()
rf_model.fit(transformed_data_32, y)
end_time = time.time()
elapsed_time = end_time - start_time
print("Time taken to fit model :", elapsed_time)

# Persist the fitted model, then record its accuracy on the data it was fit on
joblib.dump(rf_model, 'HoG_random_forest_model_32_features.pkl')
accuracies_RF['32'] = rf_model.score(transformed_data_32, y)

Time taken to fit model : 22.662403106689453


In [11]:
# Show the random forest accuracy stored under the 32-component key
rf_accuracy_32 = accuracies_RF['32']
print(rf_accuracy_32)

0.41509861709362955


### XGBoost Model

In [13]:
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import joblib
import time
# Inputs: transformed_data_32 (PCA features) and y (class labels) from the
# earlier cells.

# The labels are turned into integer codes before being passed to XGBoost
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Training-set accuracy of the XGBoost models, keyed by PCA component count
accuracies_XGB = {}

# n_jobs=-1 lets XGBoost use all available cores
xgb_model_32 = xgb.XGBClassifier(
    n_estimators=20,
    max_depth=20,
    random_state=42,
    n_jobs=-1,
)

# Fit on the 32-component features and measure how long the fit takes
start_time = time.time()
xgb_model_32.fit(transformed_data_32, y_encoded)
end_time = time.time()
elapsed_time = end_time - start_time
print("Time taken to fit model :", elapsed_time)

# Persist the fitted model, then score it on the data it was fit on
joblib.dump(xgb_model_32, 'HoG_xgb_model_32_features.pkl')
predictions_32 = xgb_model_32.predict(transformed_data_32)
accuracies_XGB['32'] = accuracy_score(y_encoded, predictions_32)

Time taken to fit model : 228.61008739471436


In [14]:
# Show the XGBoost accuracy stored under the 32-component key
xgb_accuracy_32 = accuracies_XGB['32']
print(xgb_accuracy_32)

0.16496637194891559


### Logistic Regression Model

In [15]:
from sklearn.linear_model import LogisticRegression
import joblib
import time
# Training-set accuracy of the logistic regression models, keyed by PCA
# component count
accuracies = {}

# Train logistic regression for 32 PCA components.
# The 'saga' solver is stochastic, so random_state is fixed (42, the same seed
# as the other model cells) to make the fit reproducible between runs.
clf_32 = LogisticRegression(
    solver='saga',
    penalty='l2',
    max_iter=1000,
    tol=0.01,
    random_state=42,
    n_jobs=-1,
)

# Fit on the 32-component features and measure how long the fit takes
start_time = time.time()
clf_32.fit(transformed_data_32, y)
end_time = time.time()
elapsed_time = end_time - start_time
print("Time taken to fit model :", elapsed_time)

# Persist the fitted model, then record its accuracy on the data it was fit on
joblib.dump(clf_32, 'HoG_logistic_regression_32_features.pkl')
accuracies['32'] = clf_32.score(transformed_data_32, y)

Time taken to fit model : 1059.140419960022


In [16]:
# Show the logistic regression accuracy stored under the 32-component key
lr_accuracy_32 = accuracies['32']
print(lr_accuracy_32)

0.5748507519081085


### LinearSVC Model

In [17]:
from sklearn.svm import LinearSVC
import joblib
import time

# Training-set accuracy of the LinearSVC models, keyed by PCA component count
accuracies_SVM = {}

# Initialize the LinearSVC
linear_svm_model = LinearSVC(C=1.0, random_state=42, max_iter=1000)

# Fit on the 32-component features and measure how long the fit takes
start_time = time.time()
linear_svm_model.fit(transformed_data_32, y)
end_time = time.time()
elapsed_time = end_time - start_time

# Report the fit time once, in the same format as the other model cells
print("Time taken to fit model :", elapsed_time)

# Save the trained SVM model
joblib.dump(linear_svm_model, 'HoG_linear_svc_32_features.pkl')

# Record (not print) the accuracy on the data the model was fit on
accuracies_SVM['32'] = linear_svm_model.score(transformed_data_32, y)




Time taken to fit model : 1208.382797241211
Time taken to fit the model: 1208.38 seconds


In [18]:
# Show the LinearSVC accuracy stored under the 32-component key
svm_accuracy_32 = accuracies_SVM['32']
print(svm_accuracy_32)

0.5842968336733922


### KNN Model

In [21]:
from sklearn.neighbors import KNeighborsClassifier
import joblib

# Training-set accuracy of the KNN models, keyed by PCA component count
accuracies_KNN = {}

# 5-nearest-neighbours classifier; n_jobs=-1 uses all available cores.
# n_neighbors can be tuned to trade accuracy against run time.
knn_model = KNeighborsClassifier(
    n_neighbors=5,
    n_jobs=-1,
)

# Fit on the 32-component features
# (swap in transformed_data_64 / transformed_data_128 for other sizes)
knn_model.fit(transformed_data_32, y)

# Persist the fitted model, then record its accuracy on the data it was fit on
joblib.dump(knn_model, 'HoG_knn_model_32_features.pkl')
accuracies_KNN['32'] = knn_model.score(transformed_data_32, y)

In [20]:
# Show the KNN accuracy stored under the 32-component key
knn_accuracy_32 = accuracies_KNN['32']
print(knn_accuracy_32)

0.2362276127862163
