In [72]:
import pandas as pd
import os
from skimage import io
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, precision_score, recall_score, classification_report, f1_score, roc_auc_score
import pickle
import Functions2
from sklearn.linear_model import LogisticRegression as LR
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, roc_auc_score, classification_report

In [27]:
# Read the metadata table that carries the img_id and diagnostic columns
data_path = r"C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\data\full_data.csv"

# Collapse the six diagnostic labels into a binary target:
# BCC / MEL / SCC -> 1 and ACK / NEV / SEK -> 0.
# Series.map yields NaN for any label that is not listed here.
diagnostic_codes = {
    **dict.fromkeys(('BCC', 'MEL', 'SCC'), 1),
    **dict.fromkeys(('ACK', 'NEV', 'SEK'), 0),
}
df = pd.read_csv(data_path).assign(
    diagnostic=lambda frame: frame['diagnostic'].map(diagnostic_codes)
)

In [28]:
# Define the function to extract features
def extract_features(folder_path):
    """Run the seven Functions2 measurements on every image in a folder.

    Files are visited in the order returned by ``os.listdir(folder_path)``;
    only names ending in ``.jpg`` or ``.png`` (case-sensitive) are read.

    Parameters
    ----------
    folder_path : str
        Directory containing the image files.

    Returns
    -------
    tuple of 7 lists
        ``(feature_1, ..., feature_7)`` holding, per processed image, the
        results of ``measure_pigment_network``, ``measure_blue_veil``,
        ``measure_vascular``, ``measure_globules``, ``measure_streaks``,
        ``measure_irregular_pigmentation`` and ``measure_regression``
        respectively. All lists are empty when no image matches.
    """
    # One accumulator per measurement, in return order.
    columns = [[] for _ in range(7)]

    for filename in os.listdir(folder_path):
        if not filename.endswith(('.jpg', '.png')):
            continue

        original = io.imread(os.path.join(folder_path, filename))

        # Ignore the alpha channel (e.g. transparency). The ndim guard keeps a
        # 2-D grayscale image that happens to be 4 pixels wide from being
        # mistaken for RGBA and losing a pixel column.
        if original.ndim == 3 and original.shape[-1] == 4:
            original = original[..., :3]

        measurements = (
            Functions2.measure_pigment_network(original),
            Functions2.measure_blue_veil(original),
            Functions2.measure_vascular(original),
            Functions2.measure_globules(original),
            Functions2.measure_streaks(original),
            Functions2.measure_irregular_pigmentation(original),
            Functions2.measure_regression(original),
        )
        for column, value in zip(columns, measurements):
            column.append(value)

    return tuple(columns)

In [29]:
# Folder with the training images that will be measured
folder_path_in = r"C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\data\ColorMask\Training"

# Run the extractor once, then unpack its seven per-image lists
training_features = extract_features(folder_path_in)
(feature_1, feature_2, feature_3, feature_4,
 feature_5, feature_6, feature_7) = training_features

In [31]:
# Image file names, selected with the same extension filter extract_features uses.
# The rows only line up with the feature lists as long as this os.listdir call
# returns the files in the same order as the call inside extract_features.
image_names = [name for name in os.listdir(folder_path_in) if name.endswith(('.jpg', '.png'))]

# One row per image: its file name followed by the seven measurements
features_df = pd.DataFrame({
    "img_id": image_names,
    "1: pigment network": feature_1,
    "2: Blue veil": feature_2,
    "3: Vascular": feature_3,
    "4: Globules": feature_4,
    "5: Streaks": feature_5,
    "6: Pigmentation": feature_6,
    "7: Regression": feature_7,
})

# Attach the binary diagnostic label; the inner join keeps only images present in both tables
df_merged = df[['img_id', 'diagnostic']].merge(features_df, on='img_id', how='inner')

In [32]:
# Split the data into training and testing sets
# X holds the seven raw feature columns, Y the binary diagnostic label.
X = df_merged.drop(['img_id', 'diagnostic'], axis=1)
Y = df_merged['diagnostic']
# Fixed seed so the split (and every metric computed from it) is the same on
# each run; 42 matches the other splits in this notebook.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

In [71]:
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np

# Handle missing values in X (each NaN is replaced by its column mean)
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Check for constant features or features with zero variance
non_zero_var_indices = np.var(X_imputed, axis=0) != 0
if not np.any(non_zero_var_indices):
    raise ValueError("All features have zero variance. Cannot perform PCA.")

# NOTE(review): the imputer, scaler and PCA are all fitted on every row before
# the split below, so the held-out rows influence the preprocessing. Consider
# fitting them on the training part only.

# Standardize the feature matrix (constant columns are dropped first)
X_std = StandardScaler().fit_transform(X_imputed[:, non_zero_var_indices])

# Perform PCA, keeping as many components as are needed to explain 99% of the variance
pca = PCA(0.99)
X_pca = pca.fit_transform(X_std)

# Split the PCA-reduced dataset into training and testing sets.
# The *_pca names keep this experiment from overwriting X_train / X_test of the
# raw-feature split: the logistic-regression cell further down fits
# (X_train, Y_train) and needs both to come from the same split.
X_train_pca, X_test_pca, y_train_pca, y_test_pca = train_test_split(
    X_pca, Y, test_size=0.3, random_state=42
)

# Initialize and train the model using the reduced feature space
model = LR()
model.fit(X_train_pca, y_train_pca)

# Evaluate the performance of the model
accuracy = model.score(X_test_pca, y_test_pca)
print("Model accuracy:", accuracy)

#how many features are we left with?
print("Number of features:", X_pca.shape[1])

# F1 score
y_pred = model.predict(X_test_pca)
f1 = f1_score(y_test_pca, y_pred)
print("F1 score:", f1)

# Confusion matrix
cm = confusion_matrix(y_test_pca, y_pred)
print("Confusion matrix:\n", cm)

Model accuracy: 0.6666666666666666
Number of features: 6
F1 score: 0.6666666666666666
Confusion matrix:
 [[9 4]
 [5 9]]


In [None]:
# Logistic Regression
logreg_classifier = LogisticRegression()
logreg_classifier.fit(X_train, Y_train)
logreg_prediction = logreg_classifier.predict(X_test)
# Predicted probability of the positive class (label 1). ROC AUC ranks samples
# by score; feeding it the hard 0/1 predictions collapses the curve to a
# single operating point.
logreg_positive_proba = logreg_classifier.predict_proba(X_test)[:, 1]

# Evaluate Logistic Regression
logreg_cm = confusion_matrix(Y_test, logreg_prediction)
logreg_precision = precision_score(Y_test, logreg_prediction)
logreg_recall = recall_score(Y_test, logreg_prediction)
logreg_f1 = f1_score(Y_test, logreg_prediction)
logreg_auc_roc = roc_auc_score(Y_test, logreg_positive_proba)
logreg_classification_rep = classification_report(Y_test, logreg_prediction)

# Print Logistic Regression evaluation metrics
print("Logistic Regression:")
print("Confusion Matrix:")
print(logreg_cm)
print("Precision:", logreg_precision)
print("Recall:", logreg_recall)
print("F1 Score:", logreg_f1)
print("AUC-ROC Score:", logreg_auc_roc)
print("Classification Report:")
print(logreg_classification_rep)

In [None]:
# Save LR model
with open(r'C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\Pickle\logreg_model.pkl', 'wb') as model_file:
    pickle.dump(logreg_classifier, model_file)

# Train the KNN model before persisting it. No other cell of this notebook
# defines knn_classifier, so on a fresh kernel the dump below used to raise
# NameError *after* open(..., 'wb') had already truncated knn_model.pkl.
# It is fitted on the same split as the logistic regression, with the default
# settings also used by the permutation-importance cell.
knn_classifier = KNeighborsClassifier()
knn_classifier.fit(X_train, Y_train)

# Save KNN model
with open(r'C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\Pickle\knn_model.pkl', 'wb') as model_file:
    pickle.dump(knn_classifier, model_file)

In [None]:
# Measure every image found in the test folder
test_path = r"C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\data\ColorMask\Test"
(test_feature_1, test_feature_2, test_feature_3, test_feature_4,
 test_feature_5, test_feature_6, test_feature_7) = extract_features(test_path)

# File names, selected with the same extension filter extract_features applies.
# Rows match the feature lists only while os.listdir keeps returning the same order.
test_image_names = [name for name in os.listdir(test_path) if name.endswith(('.jpg', '.png'))]
test_features_df = pd.DataFrame({
    "img_id": test_image_names,
    "1: pigment network": test_feature_1,
    "2: Blue veil": test_feature_2,
    "3: Vascular": test_feature_3,
    "4: Globules": test_feature_4,
    "5: Streaks": test_feature_5,
    "6: Pigmentation": test_feature_6,
    "7: Regression": test_feature_7,
})

# Restore the pickled logistic-regression model from disk.
# pickle.load can execute code embedded in the file, so only load pickles you created yourself.
with open(r'C:\Users\serru\OneDrive\Documents\Project2\Project-2-Medical-Imaging\Pickle\logreg_model.pkl', 'rb') as model_file:
    logreg_model = pickle.load(model_file)

# Everything except the file name is a model input
test_X = test_features_df.drop(columns=['img_id'])
test_Y = logreg_model.predict(test_X)

# Pair each file name with its predicted label and show the table
test_predictions_df = pd.DataFrame({
    "img_id": test_features_df["img_id"],
    "diagnostic": test_Y,
})
print(test_predictions_df)

In [None]:
# Find the mean of the features for each diagnostic
# df_merged still contains the string column img_id; numeric_only=True leaves it
# out of the aggregation (pandas >= 2.0 raises TypeError on it otherwise).
df_mean = df_merged.groupby('diagnostic').mean(numeric_only=True)
print(df_mean)

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.inspection import permutation_importance

# X (feature table) and Y (binary diagnostic) come from the earlier split cell.
# Half of the rows are held out; importances are measured on that held-out half.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.5, random_state=42)

# Fit both classifiers with their default settings
lr = LogisticRegression()
lr.fit(X_train, y_train)
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

# Permutation importance: each feature is shuffled 10 times on the held-out rows
result_lr = permutation_importance(lr, X_test, y_test, n_repeats=10, random_state=42)
result_knn = permutation_importance(knn, X_test, y_test, n_repeats=10, random_state=42)

# Mean importance per feature, and the feature indices ordered from most to least important
importance_scores_lr = result_lr.importances_mean
importance_scores_knn = result_knn.importances_mean
sorted_indices_lr = np.flip(np.argsort(importance_scores_lr))
sorted_indices_knn = np.flip(np.argsort(importance_scores_knn))

# Report both rankings; features are numbered from 1 in column order
for heading, ranking, scores in (
    ("Feature Importance Scores for Logistic Regression:", sorted_indices_lr, importance_scores_lr),
    ("Feature Importance Scores for K-Nearest Neighbors:", sorted_indices_knn, importance_scores_knn),
):
    print(heading)
    for idx in ranking:
        print(f"Feature {idx+1}: {scores[idx]}")


In [None]:
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.datasets import make_classification

# Generate some example data (replace this with your own dataset).
# Stored under *_demo names so the notebook's real feature matrix X is not
# overwritten by synthetic data.
# TODO: run the selection on the real X / Y; k must then not exceed the number
# of feature columns in X.
X_demo, y_demo = make_classification(n_samples=100, n_features=20, random_state=42)

# Create a SelectKBest object with the desired score function (f_classif for classification)
k_best = SelectKBest(score_func=f_classif, k=10)

# Perform feature selection
X_selected = k_best.fit_transform(X_demo, y_demo)

# Get the selected feature indices
selected_indices = k_best.get_support(indices=True)

# Print the selected feature indices
print("Selected feature indices:", selected_indices)
