In [37]:
import pandas as pd
import os
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, f1_score, roc_auc_score
import pickle
import Functions2
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [38]:
# Load the data
data_path = r"/Users/bogdancristianmihaila/Desktop/2nd Semester/Github/project2/Project-2-Medical-Imaging/data/full_data.csv"
df = pd.read_csv(data_path)

# Preprocess the diagnostic column
df['diagnostic'] = df['diagnostic'].map({'BCC': 1, 'MEL': 1, 'SCC': 1, 'ACK': 0, 'NEV': 0, 'SEK': 0})

In [39]:
# Define the function to extract features
def extract_features(folder_path):
    feature_1 = []
    feature_2 = []
    feature_3 = []
    feature_4 = []
    feature_5 = []
    feature_6 = []
    feature_7 = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg') or filename.endswith('.png'):
            image_path = os.path.join(folder_path, filename)
            original = io.imread(image_path)

            # Ignore the alpha channel (e.g. transparency)
            if original.shape[-1] == 4:
                original = original[..., :3]

            feature_1.append(Functions2.measure_pigment_network(original))
            feature_2.append(Functions2.measure_blue_veil(original))
            feature_3.append(Functions2.measure_vascular(original))
            feature_4.append(Functions2.measure_globules(original))
            feature_5.append(Functions2.measure_streaks(original))
            feature_6.append(Functions2.measure_irregular_pigmentation(original))
            feature_7.append(Functions2.measure_regression(original))

    return feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7

In [40]:
# Define the folder path for image processing
folder_path_in = r"/Users/bogdancristianmihaila/Desktop/2nd Semester/Github/project2/Project-2-Medical-Imaging/data/images/Masks/Color_mask/107images"

# Extract features from the images
feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7 = extract_features(folder_path_in)

In [41]:
# Create a DataFrame for the features
features_df = pd.DataFrame()
features_df["img_id"] = [filename for filename in os.listdir(folder_path_in) if filename.endswith(('.jpg', '.png'))]
features_df["1: pigment network"] = feature_1
features_df["2: Blue veil"] = feature_2
features_df["3: Vascular"] = feature_3
features_df["4: Globules"] = feature_4
features_df["5: Streaks"] = feature_5
features_df["6: Pigmentation"] = feature_6
features_df["7: Regression"] = feature_7

# Merge the features DataFrame with the diagnostic column from the original DataFrame
df_merged = pd.merge(df[['img_id', 'diagnostic']], features_df, on='img_id', how='inner')

In [42]:
# Split the data into training and testing sets
X = df_merged.drop(['img_id', 'diagnostic'], axis=1)
Y = df_merged['diagnostic']
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)

### Training classifiers without PCA

In [47]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score

classifiers = [LR(), KNN(), DTC()]  # Replace with your trained classifiers

# Perform cross-validation for each classifier
for classifier in classifiers:
    y_pred = cross_val_predict(classifier, X, Y, cv=5)  # Change cv value as per your requirement
    
    # Calculate evaluation metrics
    f1 = f1_score(Y, y_pred)
    precision = precision_score(Y, y_pred)
    recall = recall_score(Y, y_pred)
    tn, fp, fn, tp = confusion_matrix(Y, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    print(f"Classifier: {type(classifier).__name__}")
    print(f"F1 score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall/Sensitivity: {recall}")
    print(f"Specificity: {specificity}")
    print("Confusion Matrix:")
    print(confusion_matrix(Y, y_pred))
    print()


Classifier: LogisticRegression
F1 score: 0.616822429906542
Precision: 0.6226415094339622
Recall/Sensitivity: 0.6111111111111112
Specificity: 0.6226415094339622
Confusion Matrix:
[[33 20]
 [21 33]]

Classifier: KNeighborsClassifier
F1 score: 0.6666666666666666
Precision: 0.6862745098039216
Recall/Sensitivity: 0.6481481481481481
Specificity: 0.6981132075471698
Confusion Matrix:
[[37 16]
 [19 35]]

Classifier: DecisionTreeClassifier
F1 score: 0.5142857142857143
Precision: 0.5294117647058824
Recall/Sensitivity: 0.5
Specificity: 0.5471698113207547
Confusion Matrix:
[[29 24]
 [27 27]]



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Training classifiers with PCA

In [53]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from sklearn.decomposition import PCA

classifiers = [LR(), KNN(), DTC()]  # Replace with your trained classifiers

# Perform PCA
pca = PCA(n_components=2)  # Adjust the number of components as per your requirement
X_pca = pca.fit_transform(X)

# Perform cross-validation for each classifier
for classifier in classifiers:
    y_pred = cross_val_predict(classifier, X_pca, Y, cv=5)  # Change cv value as per your requirement
    
    # Calculate evaluation metrics
    f1 = f1_score(Y, y_pred)
    precision = precision_score(Y, y_pred)
    recall = recall_score(Y, y_pred)
    tn, fp, fn, tp = confusion_matrix(Y, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    print(f"Classifier: {type(classifier).__name__}")
    print(f"F1 score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall/Sensitivity: {recall}")
    print(f"Specificity: {specificity}")
    print("Confusion Matrix:")
    print(confusion_matrix(Y, y_pred))
    print()

Classifier: LogisticRegression
F1 score: 0.5333333333333333
Precision: 0.6666666666666666
Recall/Sensitivity: 0.4444444444444444
Specificity: 0.7735849056603774
Confusion Matrix:
[[41 12]
 [30 24]]

Classifier: KNeighborsClassifier
F1 score: 0.6666666666666666
Precision: 0.6862745098039216
Recall/Sensitivity: 0.6481481481481481
Specificity: 0.6981132075471698
Confusion Matrix:
[[37 16]
 [19 35]]

Classifier: DecisionTreeClassifier
F1 score: 0.5555555555555556
Precision: 0.5555555555555556
Recall/Sensitivity: 0.5555555555555556
Specificity: 0.5471698113207547
Confusion Matrix:
[[29 24]
 [24 30]]



***Saving the models***

In [30]:
# import pickle

# # In the path where you want to save the models
# path = r'/Users/bogdancristianmihaila/Desktop/2nd Semester/Github/project2/Project-2-Medical-Imaging/Pickle/pickle_107images/no_pca_models'

# # Save the models
# pickle.dump(LR, open(path + r'\LR.pkl', 'wb'))
# pickle.dump(KNN, open(path + r'\KNN.pkl', 'wb'))
# pickle.dump(DTC, open(path + r'\DTC.pkl', 'wb'))
# #pickle.dump(xgb.XGBRFClassifier, open(path + r'\xgb.XGBRFClassifier.pkl', 'wb'))
