# Main Script

In [23]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import os
import utils
import models
import process
import pickle

In [24]:
""" 
SET UP EXPERIMENTAL DESIGN HERE
Choose whether to load the data directly from a pickle file or to build the dictionaries by running feature extraction
"""
# When True, the loading cell rebuilds the train/test dictionaries with
# process.build_dictionaries(); when False, it loads them with process.pickle_in().
make_dicts = False
# Only consulted when make_dicts is True: if set, the freshly built dictionaries
# are handed to process.pickle_out().
want_to_pickle = False

In [25]:
# Obtain the train/test dictionaries, either from the pickle loader or by
# rebuilding them from the image directory.
if not make_dicts:
    # Fast path: reuse whatever process.pickle_in() returns.
    data_train_loaded, data_test_loaded = process.pickle_in()
else:
    # NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
    path = '/Users/georgienahass/Desktop/fall_classes_PhD/CS415/clean_images.nosync'
    data_train_loaded, data_test_loaded = process.build_dictionaries(path)
    # Optionally persist the freshly built dictionaries for later runs.
    if want_to_pickle:
        process.pickle_out(data_train_loaded, data_test_loaded)

### Build thresholded dictionaries from the un-normalized data (this is faster and can be used as its own experiment)

In [26]:
# NOTE(review): absolute, machine-specific path to the original images --
# adjust before running on another machine.
orig_images = '/Users/georgienahass/Desktop/fall_classes_PhD/CS415/CXR_png.nosync'

# Build the train and test dictionaries of white-pixel percentages
# (threshold features) from the original images.
data_train_thresh, data_test_thresh = process.build_thresh_dicts(orig_images)

# Convert the threshold dictionaries into the train/test splits (X_*, y_*)
# that are passed to models.threshold_engine in a later cell. This path is
# simpler than the descriptor-based experiments -- presumably because far
# less has to be computed per image; confirm in process.build_thresh_vectors.
X_train, X_test, y_train, y_test = process.build_thresh_vectors(data_train_thresh, data_test_thresh)
# Sanity check: number of training samples.
print(len(X_train))

Training set size: 529
Testing set size: 133
Train image: 0
Train image: 1
Train image: 2
Train image: 3
Train image: 4
Train image: 5
Train image: 6
Train image: 7
Train image: 8
Train image: 9
Train image: 10
Train image: 11
Train image: 12
Train image: 13
Train image: 14
Train image: 15
Train image: 16
Train image: 17
Train image: 18
Train image: 19
Train image: 20
Train image: 21
Train image: 22
Train image: 23
Train image: 24
Train image: 25
Train image: 26
Train image: 27
Train image: 28
Train image: 29
Train image: 30
Train image: 31
Train image: 32
Train image: 33
Train image: 34
Train image: 35
Train image: 36
Train image: 37
Train image: 38
Train image: 39
Train image: 40
Train image: 41
Train image: 42
Train image: 43
Train image: 44
Train image: 45
Train image: 46
Train image: 47
Train image: 48
Train image: 49
Train image: 50
Train image: 51
Train image: 52
Train image: 53
Train image: 54
Train image: 55
Train image: 56
Train image: 57
Train image: 58
Train image: 59
Train

In [27]:
# Sanity check: sizes of the test/train features and labels (space-separated,
# in the order X_test, X_train, y_test, y_train) ...
print(*map(len, (X_test, X_train, y_test, y_train)))
# ... followed by the number of labels in the test threshold dictionary.
print(len(data_test_thresh['label']))

133 529 133 529
133


In [28]:
# Run the threshold-feature experiment. models.threshold_engine returns four
# results, unpacked in order; the names suggest one performance-metrics
# dictionary per classifier (SVC, RF, NB, MLP) -- see models.threshold_engine.
(
    thresh_performance_metrics_svc,
    thresh_performance_metrics_rf,
    thresh_performance_metrics_nb,
    thresh_performance_metrics_mlp,
) = models.threshold_engine(X_train, X_test, y_train, y_test)



In [29]:
def dict_to_csv(performance_dict, filename, columns=None):
    """Write a ``{key: metrics}`` dictionary to a CSV file, one row per key.

    The dictionary keys become the first CSV column and the metric values of
    each entry fill the remaining columns. Column names are assigned
    positionally from ``columns``.

    Parameters
    ----------
    performance_dict : dict
        Maps a key (e.g. a k value) to that entry's metric values.
    filename : str or path-like
        Destination passed to ``DataFrame.to_csv``.
    columns : sequence of str, optional
        Header names, key column first. Defaults to the seven-column layout
        ``k_value, accuracy, precision, recall, f1_score, specificity,
        auroc_score``. Pass e.g. ``['k_value', 'accuracy']`` for
        single-metric results.

    Raises
    ------
    ValueError
        If the number of columns in the data (key column included) does not
        match ``len(columns)``.
    """
    if columns is None:
        columns = ['k_value', 'accuracy', 'precision', 'recall', 'f1_score', 'specificity', 'auroc_score']
    columns = list(columns)

    if len(performance_dict) == 0:
        # An empty dictionary yields a frame with only the index column, so
        # renaming would fail with a length mismatch; write just the header.
        df = pd.DataFrame(columns=columns)
    else:
        # orient='index' puts one dictionary key per row; reset_index() then
        # turns those keys into the first regular column (no in-place mutation).
        df = pd.DataFrame.from_dict(performance_dict, orient='index').reset_index()
        if len(df.columns) != len(columns):
            raise ValueError(
                f'Expected {len(columns)} columns {columns} but the data has '
                f'{len(df.columns)} (key column included)'
            )
        # Rename columns for clarity
        df.columns = columns

    # Write to CSV without the (now meaningless) positional index
    df.to_csv(filename, index=False)

extractor = 'threshold'
# Save each classifier's performance-metrics dictionary to its own CSV file,
# named '<classifier>_<extractor>.csv'.
for model_tag, model_metrics in (
    ('svc', thresh_performance_metrics_svc),
    ('rf', thresh_performance_metrics_rf),
    ('nb', thresh_performance_metrics_nb),
    ('mlp', thresh_performance_metrics_mlp),
):
    dict_to_csv(model_metrics, f'{model_tag}_{extractor}.csv')

In [30]:
# def dict_to_csv(performance_dict, filename):
#     # Convert dictionary to DataFrame
#     df = pd.DataFrame.from_dict(performance_dict, orient='index')
#     # Reset index to make it a column in the DataFrame
#     df.reset_index(inplace=True)
#     # Rename columns for clarity
#     df.columns = ['k_value', 'accuracy', 'precision', 'recall', 'f1_score', 'specificity', 'auroc_score']
#     # Write to CSV
#     df.to_csv(filename, index=False)

### Run the actual experiments. See the documentation in the other project files (e.g. `utils`, `models`) for details on the arguments

In [31]:
# extractors = ['sift', 'orb', 'hog']

# for extractor in extractors:
#     print(len(data_train_loaded['label']))


#     training_stacked, train_labels = utils.prepare_data(data_train_loaded, extractor_type=extractor)
#     k_means_model, accuracies, performance_svc, performance_rf, performance_nb,performance_mlp=  models.engine(np.array(training_stacked), train_labels, data_train_loaded, data_test_loaded, extractor=extractor,  k_predicting=False, bow=True, tfidf=False )
    
#     print(performance_mlp)
#     # Convert and save each performance metrics dictionary to a CSV file
#     dict_to_csv(performance_svc, f'high_k_base_performance_svc_{extractor}.csv')
#     dict_to_csv(performance_rf, f'high_k_base_performance_rf_{extractor}.csv')
#     dict_to_csv(performance_nb, f'high_k_base_performance_nb_{extractor}.csv')
#     dict_to_csv(performance_mlp, f'high_k_base_performance_mlp_{extractor}.csv')

### Run experiments that classify based on proximity to the k-means cluster centers


In [32]:
# extractors = ['sift', 'orb', 'hog']
# def dict_to_csv_k_classification(performance_dict, filename):
#     # Convert dictionary to DataFrame
#     df = pd.DataFrame.from_dict(performance_dict, orient='index')
#     # Reset index to make it a column in the DataFrame
#     df.reset_index(inplace=True)
#     # Rename columns for clarity
#     df.columns = ['k_value', 'accuracy']
#     # Write to CSV
#     df.to_csv(filename, index=False)
# sampling_percent = 100
# print(len(data_train_loaded['label']))

# data_train_loaded = utils.sample_data(data_train_loaded, sampling_percent)
# data_test_loaded = utils.sample_data(data_test_loaded, sampling_percent)
# for extractor in extractors:
#     training_stacked, train_labels = utils.prepare_data(data_train_loaded, extractor_type=extractor)
#     k_means_model, accuracies, performance_svc, performance_rf, performance_nb,performance_mlp=  models.engine(np.array(training_stacked), train_labels, data_train_loaded, data_test_loaded, extractor=extractor,  k_predicting=True, bow=False, tfidf=False )
#     # Convert and save each performance metrics dictionary to a CSV file
#     print(accuracies)
#     dict_to_csv_k_classification(accuracies, f'k_class_{extractor}.csv')



In [33]:

# accuracies = {}  

# for k in k_list:
#     k=int(k)
#     print(k)
#     # Perform k-means on the dataset
#     # Define criteria = ( type, max_iter = 10 , epsilon = 1.0 )
#     criteria = (cv2.TERM_CRITERIA_EPS + cv2.TERM_CRITERIA_MAX_ITER, 10, 1.0)
#     # Set flags (Just to avoid line break in the code)
#     flags = cv2.KMEANS_RANDOM_CENTERS
#     # Apply KMeans with number of centers, k=3
#     compactness,labels,centers = cv2.kmeans(sift_training_stacked,k,None,criteria,10,flags)
    
#     # Determine majority label for each cluster- first make a blank dictionary with k subdicts 
#     cluster_lists = {i: {'healthy': 0, 'tb': 0} for i in range(k)}
    
#     # iterate through all of the labels from k means
#     for idx, label in enumerate(labels):
#         # find the associated ground truth label
#         gt = sift_train_labels[idx]
        
#         # count number of correct predictions in each cluster
#         if gt == 0:
#             cluster_lists[label[0]]['healthy'] += 1
#         else:
#             cluster_lists[label[0]]['tb'] += 1

#     # assign majority labels to all of the clusters
#     majority_labels = {}
#     for cluster, counts in cluster_lists.items():
#         if counts['healthy'] > counts['tb']:
#             majority_labels[cluster] = 0
#         else:
#             majority_labels[cluster] = 1
                
#     # Evaluate the testing data using the clusters with a given K
#     correct_predictions = 0
    
#     correct_predictions = 0
#     for idx, image_descriptors in enumerate(data_test['sift']):  
#         # Loop over the list of descriptors for each image
#         image_labels = [assign_label_to_descriptor(desc, centers, majority_labels) for desc in image_descriptors]
        
#         # Assign the majority label to the image
#         predicted_label = max(image_labels, key=image_labels.count)
        
#         # grab the ground truth label from initial labels and compare
#         ground_truth = data_test['label'][idx]
#         if predicted_label == ground_truth:
#             correct_predictions += 1
    
    
#     accuracy = correct_predictions / len(data_test['sift'])
#     print(accuracy)
#     accuracies[k]=accuracy
        
        
# plt.plot(list(accuracies.keys()), list(accuracies.values()), '-o')
# plt.xlabel('Number of Clusters (k)')
# plt.ylabel('Accuracy')
# plt.title('Accuracy vs. Number of Clusters')
# plt.savefig('k_results.png')
# plt.show()


In [34]:
                # HCD tuning
                # blocks = [10, 15,20,30, 50]
                # ks = [3,5,11,15,21]
                # for bs in blocks:
                #     for k_size in ks:            
                #         print(bs, k_size)
                #         harris = harris_detection(cont_image, blocksize=bs, ksize=k_size )
                #         print(harris.nonzero())
                #         # plot hcd on color image
                #         rgb_img = cv2.cvtColor(cont_image,cv2.COLOR_GRAY2BGR)
                #         # rgb_img[harris > 0.01 * harris.max()] = [0,0,255]
                #         lower_percentile = np.percentile(harris, 99.7)   # Change 1 to your desired lower percentile
                #         upper_percentile = np.percentile(harris, 99.9)  # Change 20 to your desired upper percentile

                #         # Find coordinates of strong corners
                #         corners = np.argwhere((harris > np.percentile(harris, 99.7)) & (harris < np.percentile(harris, 99.9)))
                    
                #         print(len(corners))
                #         for y, x in corners:
                #             cv2.circle(rgb_img, (x, y), 20, (255, 0, 255), -1)

                #         plt.imshow(rgb_img)
                #         plt.show()
                        
                # print(harris)