In [3]:
import cv2
import numpy as np
import os
from sklearn import preprocessing
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.decomposition import IncrementalPCA
import pickle
import random

# Define the path to your "train" folder
train_folder_path = 'TRAIN'

# Image resize dimensions
resize_width = 50
resize_height = 50

# Maximum number of samples to load
max_samples = 30000

# Number of SIFT features to extract per image
num_sift_features = 100  

# Initialize lists to store features and labels
X = []
y = []

# Get the list of subfolders (class labels) in the "train" folder
class_labels = os.listdir(train_folder_path)

# Load and preprocess a random sample of images from each class
for label in class_labels:
    class_folder = os.path.join(train_folder_path, label)
    image_files = [os.path.join(class_folder, filename) for filename in os.listdir(class_folder) if filename.endswith('.jpg')]
    
    # Shuffle the list of image files
    random.shuffle(image_files)
    
    for image_file in image_files[:max_samples // len(class_labels)]:
        image = cv2.imread(image_file)
        image = cv2.resize(image, (resize_width, resize_height))

        # Feature extraction using SIFT
        sift = cv2.SIFT_create()
        keypoints, descriptors = sift.detectAndCompute(image, None)
        
        if descriptors is not None:
            # Ensure a consistent number of SIFT features for each image
            if descriptors.shape[0] < num_sift_features:
                # If fewer features are detected, add zero rows
                zero_rows = np.zeros((num_sift_features - descriptors.shape[0], descriptors.shape[1]))
                descriptors = np.vstack((descriptors, zero_rows))
            elif descriptors.shape[0] > num_sift_features:
                # If more features are detected, truncate to the desired number
                descriptors = descriptors[:num_sift_features, :]

            X.extend(descriptors)
            y.extend([class_labels.index(label)] * num_sift_features)  # Assign labels based on class folders

print("--------------done processing-------------------")

## Store the values of X and Y in nump
X = np.array(X) if len(X) > 0 else np.empty((0, 128))  # Assuming SIFT descriptors have 128 dimensions
y = np.array(y)

# Normalization of features using scaler
# Reference : https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
scaler_norm = preprocessing.StandardScaler()
X = scaler_norm.fit_transform(X)


# Reducing the dimension of features using PCA which will help us to extract latent features and reduce dimensions

# Define variables required for PCA
total_samples, total_features = X.shape
number_of_components = min(total_samples, total_features) 

# Define PCA (Incremental PCA)
incremental_pca = IncrementalPCA(n_components=number_of_components)

# Process in batches otherwise the run time will be forever

# Reference - https://stackoverflow.com/questions/66716370/batch-size-and-training-time
batch_size = total_samples  
for i in range(0, X.shape[0], batch_size):
    X_batch_pca = X[i:i + batch_size]
    incremental_pca.partial_fit(X_batch_pca)

# Reference : https://scikit-learn.org/stable/auto_examples/decomposition/plot_incremental_pca.html
X_pca = incremental_pca.transform(X)  


print("--------------done pca-------------------")



--------------done processing-------------------
--------------done scaling-------------------
--------------done pca-------------------


In [4]:
import pandas as pd
data = np.column_stack((X_pca, y))
column_names = [f"Feature_{i}" for i in range(X_pca.shape[1])] + ["Label"]
df = pd.DataFrame(data, columns=column_names)

shuffled_df_3 = df.sample(frac=1, random_state=42)  # "frac=1" shuffles all rows

# Display the DataFrame
#Select the first 10 rows
selected_df_6 = shuffled_df_3.head(10000)

# Display the selected DataFrame
print(selected_df_6)




# Initialize an SVM classifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

X = selected_df_6.iloc[:, :-1]  # All columns except the last one
y = selected_df_6.iloc[:, -1]   # The last column


# Split the dataset into a training set and a testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize an SVM classifier
clf = SVC(C=1.0, kernel='rbf', gamma='auto')

# Train the SVM model on the training data
clf.fit(X_train, y_train)


print("--------------done SVM-------------------")

# Make predictions on the test set
y_pred = clf.predict(X_test)

    # Calculate accuracy for this fold
accuracy = accuracy_score(y_test, y_pred)


# Calculate test accuracy
test_accuracy = accuracy_score(y_test, y_pred)

print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

# Save the trained model and PCA to a single pickle file
model_and_pca = {
    'model': clf,
    'pca': ipca,
    'scaler': scaler
}

model_path = 'svm_model_pca.pkl'
with open(model_path, 'wb') as model_file:
    pickle.dump(model_and_pca, model_file)


         Feature_0  Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  \
435200   13.831749  -7.929849  -0.058160  -2.698314  -0.868951  -0.644826   
539910   -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
1369603   8.924907  -4.055584   5.103814  -4.658314  -0.702178  -2.208232   
1455361  -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
2014138  -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
...            ...        ...        ...        ...        ...        ...   
2354380  -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
2456894  -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
11810    16.448621  -2.116250 -10.686191  -0.677939  -5.790890   4.822818   
918957   -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   
1938778  -3.402186  -0.092564  -0.056431   0.000413   0.035653   0.071824   

         Feature_6  Feature_7  Feature_8  Feature_9  ...  Feature_119  \
43

In [5]:
from sklearn.metrics import classification_report,confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, cohen_kappa_score, matthews_corrcoef
# Calculate test accuracy of SVM model
# Reference : https://scikit-learn.org/stable/modules/model_evaluation.html
test_accuracy = accuracy_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
f1_score = f1_score(y_test, y_pred)
cohen_kappa = cohen_kappa_score(y_test, y_pred)
mcc = matthews_corrcoef(y_test, y_pred)
      


# Evaluation metrics
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")
print(f'Sensitivity is {sensitivity * 100:.2f}%')
print(f'Precision is {precision * 100:.2f}%')
print(f'F1-score is {f1_score * 100:.2f}%')
print(f'Cohen - kappa statistic is {cohen_kappa * 100:.2f}%')
print(f'MCC is {mcc * 100:.2f}%')
print('Confusion Matrix: \n', confusion_matrix(y_test,y_pred))
print('Classification Report: \n', classification_report(y_test,y_pred))

Test Accuracy: 55.80%
Sensitivity is 1.36%
Precision is 46.15%
F1-score is 2.64%
Cohen - kappa statistic is 0.12%
MCC is 0.47%
Confusion Matrix: 
 [[1104   14]
 [ 870   12]]
Classification Report: 
               precision    recall  f1-score   support

         0.0       0.56      0.99      0.71      1118
         1.0       0.46      0.01      0.03       882

    accuracy                           0.56      2000
   macro avg       0.51      0.50      0.37      2000
weighted avg       0.52      0.56      0.41      2000

