In [1]:
import pandas as pd
import os
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, f1_score, roc_auc_score
import pickle
import Functions2
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [2]:
# Load the data
data_path = r"C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\data\full_data.csv"
df = pd.read_csv(data_path)

# Preprocess the diagnostic column
df['diagnostic'] = df['diagnostic'].map({'BCC': 1, 'MEL': 1, 'SCC': 1, 'ACK': 0, 'NEV': 0, 'SEK': 0})

In [3]:
# Define the function to extract features
def extract_features(folder_path):
    feature_1 = []
    feature_2 = []
    feature_3 = []
    feature_4 = []
    feature_5 = []
    feature_6 = []
    feature_7 = []

    for filename in os.listdir(folder_path):
        if filename.endswith('.jpg') or filename.endswith('.png'):
            image_path = os.path.join(folder_path, filename)
            original = io.imread(image_path)

            # Ignore the alpha channel (e.g. transparency)
            if original.shape[-1] == 4:
                original = original[..., :3]

            feature_1.append(Functions2.measure_pigment_network(original))
            feature_2.append(Functions2.measure_blue_veil(original))
            feature_3.append(Functions2.measure_vascular(original))
            feature_4.append(Functions2.measure_globules(original))
            feature_5.append(Functions2.measure_streaks(original))
            feature_6.append(Functions2.measure_irregular_pigmentation(original))
            feature_7.append(Functions2.measure_regression(original))

    return feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7

In [4]:
# Define the folder path for image processing
folder_path_in = r"C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\data\ColorMask\Training"

# Extract features from the images
feature_1, feature_2, feature_3, feature_4, feature_5, feature_6, feature_7 = extract_features(folder_path_in)

In [5]:
# Create a DataFrame for the features
features_df = pd.DataFrame()
features_df["img_id"] = [filename for filename in os.listdir(folder_path_in) if filename.endswith(('.jpg', '.png'))]
features_df["1: pigment network"] = feature_1
features_df["2: Blue veil"] = feature_2
features_df["3: Vascular"] = feature_3
features_df["4: Globules"] = feature_4
features_df["5: Streaks"] = feature_5
features_df["6: Pigmentation"] = feature_6
features_df["7: Regression"] = feature_7

# Merge the features DataFrame with the diagnostic column from the original DataFrame
df_merged = pd.merge(df[['img_id', 'diagnostic']], features_df, on='img_id', how='inner')

In [6]:
# Split the data into training and testing sets
X = df_merged.drop(['img_id', 'diagnostic'], axis=1)
Y = df_merged['diagnostic']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5)

***PCA trained with LR***

In [7]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np

# Handle missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Check for constant features or features with zero variance
non_zero_var_indices = np.var(X_imputed, axis=0) != 0
if not np.any(non_zero_var_indices):
    raise ValueError("All features have zero variance. Cannot perform PCA.")

# Standardize the feature matrix
X_std = StandardScaler().fit_transform(X_imputed[:, non_zero_var_indices])

# Perform PCA and retain the first four principal components
pca = PCA(0.99)
X_pca = pca.fit_transform(X_std)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.5, random_state=42)

# Initialize and train the model using the reduced feature space
model = LR()
model.fit(X_train, y_train)

# Evaluate the performance of the model
accuracy = model.score(X_test, y_test)
print("Model accuracy:", accuracy)

#how many features are we left with?
print("Number of features:", X_pca.shape[1])

# F1 score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# sensitivity and specificity
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Model accuracy: 0.5909090909090909
Number of features: 6
F1 score: 0.5263157894736842
Confusion matrix:
 [[16  3]
 [15 10]]
Sensitivity: 0.4
Specificity: 0.8421052631578947


***PCA TRAINED WITH KNN***

In [8]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np

# Handle missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Check for constant features or features with zero variance
non_zero_var_indices = np.var(X_imputed, axis=0) != 0
if not np.any(non_zero_var_indices):
    raise ValueError("All features have zero variance. Cannot perform PCA.")

# Standardize the feature matrix
X_std = StandardScaler().fit_transform(X_imputed[:, non_zero_var_indices])

# Perform PCA and retain the first four principal components
pca = PCA(0.99)
X_pca = pca.fit_transform(X_std)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.5, random_state=42)

# Initialize and train the model using the reduced feature space
model = KNN()
model.fit(X_train, y_train)

# Evaluate the performance of the model
accuracy = model.score(X_test, y_test)
print("Model accuracy:", accuracy)

#how many features are we left with?
print("Number of features:", X_pca.shape[1])

# F1 score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# sensitivity and specificity
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)


Model accuracy: 0.5681818181818182
Number of features: 6
F1 score: 0.48648648648648657
Confusion matrix:
 [[16  3]
 [16  9]]
Sensitivity: 0.36
Specificity: 0.8421052631578947


***PCA TRAINED WITH DTC***

In [9]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np


# Handle missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Check for constant features or features with zero variance
non_zero_var_indices = np.var(X_imputed, axis=0) != 0
if not np.any(non_zero_var_indices):
    raise ValueError("All features have zero variance. Cannot perform PCA.")

# Standardize the feature matrix
X_std = StandardScaler().fit_transform(X_imputed[:, non_zero_var_indices])

# Perform PCA and retain the first four principal components
pca = PCA(0.99)
X_pca = pca.fit_transform(X_std)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.5, random_state=42)

# Initialize and train the model using the reduced feature space
model = DTC()
model.fit(X_train, y_train)

# Evaluate the performance of the model
accuracy = model.score(X_test, y_test)
print("Model accuracy:", accuracy)

#how many features are we left with?
print("Number of features:", X_pca.shape[1])

# F1 score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# sensitivity and specificity
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Model accuracy: 0.6136363636363636
Number of features: 6
F1 score: 0.6666666666666666
Confusion matrix:
 [[10  9]
 [ 8 17]]
Sensitivity: 0.68
Specificity: 0.5263157894736842


***PCA TRAINED WITH XGB***

In [15]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import xgboost as xgb

# Handle missing values in X
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Check for constant features or features with zero variance
non_zero_var_indices = np.var(X_imputed, axis=0) != 0
if not np.any(non_zero_var_indices):
    raise ValueError("All features have zero variance. Cannot perform PCA.")

# Standardize the feature matrix
X_std = StandardScaler().fit_transform(X_imputed[:, non_zero_var_indices])

# Perform PCA and retain the first four principal components
pca = PCA(0.99)
X_pca = pca.fit_transform(X_std)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, Y, test_size=0.5, random_state=42)

# Initialize and train the model using the reduced feature space
model = xgb.XGBClassifier()
model.fit(X_train, y_train)

# Evaluate the performance of the model
accuracy = model.score(X_test, y_test)
print("Model accuracy:", accuracy)

#how many features are we left with?
print("Number of features:", X_pca.shape[1])

# F1 score
y_pred = model.predict(X_test)
f1 = f1_score(y_test, y_pred)
print("F1 score:", f1)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion matrix:\n", cm)

# sensitivity and specificity
tn, fp, fn, tp = cm.ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
print("Sensitivity:", sensitivity)
print("Specificity:", specificity)

Model accuracy: 0.6590909090909091
Number of features: 6
F1 score: 0.6666666666666665
Confusion matrix:
 [[14  5]
 [10 15]]
Sensitivity: 0.6
Specificity: 0.7368421052631579


In [22]:
import numpy as np
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score

classifiers = [LR(), KNN(), DTC(), xgb.XGBRFClassifier()]  # Replace with your trained classifiers

# Perform cross-validation for each classifier
for classifier in classifiers:
    y_pred = cross_val_predict(classifier, X, Y, cv=5)  # Change cv value as per your requirement
    
    # Calculate evaluation metrics
    f1 = f1_score(Y, y_pred)
    precision = precision_score(Y, y_pred)
    recall = recall_score(Y, y_pred)
    tn, fp, fn, tp = confusion_matrix(Y, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    print(f"Classifier: {type(classifier).__name__}")
    print(f"F1 score: {f1}")
    print(f"Precision: {precision}")
    print(f"Recall/Sensitivity: {recall}")
    print(f"Specificity: {specificity}")
    print("Confusion Matrix:")
    print(confusion_matrix(Y, y_pred))
    print()


Classifier: LogisticRegression
F1 score: 0.5679012345679012
Precision: 0.6052631578947368
Recall/Sensitivity: 0.5348837209302325
Specificity: 0.6666666666666666
Confusion Matrix:
[[30 15]
 [20 23]]

Classifier: KNeighborsClassifier
F1 score: 0.5952380952380952
Precision: 0.6097560975609756
Recall/Sensitivity: 0.5813953488372093
Specificity: 0.6444444444444445
Confusion Matrix:
[[29 16]
 [18 25]]

Classifier: DecisionTreeClassifier
F1 score: 0.5393258426966292
Precision: 0.5217391304347826
Recall/Sensitivity: 0.5581395348837209
Specificity: 0.5111111111111111
Confusion Matrix:
[[23 22]
 [19 24]]

Classifier: XGBRFClassifier
F1 score: 0.5679012345679012
Precision: 0.6052631578947368
Recall/Sensitivity: 0.5348837209302325
Specificity: 0.6666666666666666
Confusion Matrix:
[[30 15]
 [20 23]]



In [27]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score

classifiers = [LR(), KNN(), DTC(), xgb.XGBRFClassifier()]  # Replace with your trained classifiers

# Initialize a list to store the results
results = []

# Perform cross-validation for each classifier
for classifier in classifiers:
    y_pred = cross_val_predict(classifier, X, Y, cv=5)  # Change cv value as per your requirement

    # Calculate evaluation metrics
    f1 = f1_score(Y, y_pred)
    precision = precision_score(Y, y_pred)
    recall = recall_score(Y, y_pred)
    tn, fp, fn, tp = confusion_matrix(Y, y_pred).ravel()
    specificity = tn / (tn + fp)
    
    # Create a dictionary of the results for the current classifier
    result = {
        'Classifier': type(classifier).__name__,
        'F1 Score': f1,
        'Precision': precision,
        'Recall/Sensitivity': recall,
        'Specificity': specificity,
        'Confusion Matrix': confusion_matrix(Y, y_pred)
    }
    results.append(result)

# Convert the results list into a pandas DataFrame
results_df = pd.DataFrame(results)

# Print the results table
print(results_df)


               Classifier  F1 Score  Precision  Recall/Sensitivity  \
0      LogisticRegression  0.567901   0.605263            0.534884   
1    KNeighborsClassifier  0.595238   0.609756            0.581395   
2  DecisionTreeClassifier  0.539326   0.521739            0.558140   
3         XGBRFClassifier  0.567901   0.605263            0.534884   

   Specificity      Confusion Matrix  
0     0.666667  [[30, 15], [20, 23]]  
1     0.644444  [[29, 16], [18, 25]]  
2     0.511111  [[23, 22], [19, 24]]  
3     0.666667  [[30, 15], [20, 23]]  
