# Importing Libraries

In [None]:
!pip install imbalanced-learn


In [None]:
import requests
import zipfile
import os
from imblearn.over_sampling import SMOTE
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Importing Dataset

In [None]:
# Address of the zipped dataset; the download cell below fetches it with requests.
url = "https://github.com/mohsley/skin-cancer-detection/raw/refs/heads/main/data.zip"

In [None]:
# Download the dataset archive and store it next to the notebook.
zip_file_path = "data.zip"
# A timeout keeps the cell from blocking indefinitely on a stalled connection.
response = requests.get(url, timeout=60)
if response.status_code == 200:
    with open(zip_file_path, "wb") as file:
        file.write(response.content)
    print("ZIP file downloaded successfully.")
else:
    # Raise instead of calling exit(): exit() ends the interpreter session and
    # discards its state, whereas an exception only stops the run at this cell.
    raise RuntimeError(
        f"Failed to download the ZIP file. Status code: {response.status_code}"
    )

In [None]:
# Unpack the downloaded archive into its own folder (created if missing).
extract_folder = "extracted_data"
os.makedirs(extract_folder, exist_ok=True)
try:
    with zipfile.ZipFile(zip_file_path, mode='r') as zip_ref:
        zip_ref.extractall(path=extract_folder)
except zipfile.BadZipFile:
    # A corrupt archive is reported but does not stop the notebook here.
    print("Error: The downloaded file is not a valid ZIP file.")
else:
    print("ZIP file extracted successfully.")

In [None]:
!rm "/content/data.zip"

In [None]:
df = pd.read_csv("/content/extracted_data/data/train-metadata.csv")

In [None]:
df

In [None]:
df.columns

In [None]:
# Columns removed from the metadata table before modelling (see the drop call
# in the next cell).
features_to_drop = [
    # Tentative removals: the original author was unsure about these and noted
    # that they do not, on their own, push accuracy to 1.
    'tbp_lv_Lext',
    'tbp_lv_deltaLB',
    'tbp_lv_location_simple',
    'tbp_lv_norm_color',
    'tbp_lv_perimeterMM',
    'tbp_lv_radial_color_std_max',

    # Clinical measurements
    'mel_thick_mm',
    'mel_mitotic_index',

    # Model confidence scores
    'tbp_lv_dnn_lesion_confidence',
    'tbp_lv_nevi_confidence',

    # IDs and metadata (not relevant for prediction)
    'iddx_full',
    'iddx_1',
    'iddx_2',
    'iddx_3',
    'iddx_4',
    'iddx_5',
    'isic_id',
    'patient_id',
    'lesion_id',
    'attribution',
    'copyright_license'
]

In [None]:
# Remove the listed columns from df in place (df itself is modified).
df.drop(columns=features_to_drop,inplace=True)

In [None]:
df.head()

In [None]:
# Percentage of missing values per column.
missing_percentages = df.isna().sum().div(len(df)).mul(100)
# Retain only columns that are less than half empty.
columns_to_keep = missing_percentages.loc[missing_percentages < 50].index
skin_cancer_df = df.loc[:, columns_to_keep]
# Fill remaining gaps with each column's most frequent value (first mode row).
skin_cancer_df = skin_cancer_df.fillna(value=skin_cancer_df.mode().iloc[0])
skin_cancer_df.shape

In [None]:
skin_cancer_df.columns

In [None]:

skin_cancer_df.head()

In [None]:
# Keep only the numeric columns; non-numeric columns are left out of this frame.
numerical_df = skin_cancer_df.select_dtypes(include=["number"])


In [None]:
numerical_df.head()

In [None]:
numerical_df.shape

In [None]:
# Separate the label column from the predictor columns.
y = numerical_df['target']
X = numerical_df.drop(columns='target')

In [None]:
# Hold out 5% of the rows for testing. Stratifying on the label keeps the class
# ratio the same in both parts, so a small hold-out cannot end up with few or
# no positive samples (the training data is rebalanced with SMOTE further down,
# which suggests the classes are imbalanced).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.05, random_state=42, stratify=y
)

In [None]:
# Standardise the features: the scaler is fitted on the training split only and
# the same fitted transform is then applied to the test split.
scaler = StandardScaler()
# NOTE(review): fit_transform/transform presumably return NumPy arrays here, so
# X_train and X_test no longer carry column labels after this cell — confirm.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# SMOTE

In [None]:
# Oversample the training split with SMOTE (fixed seed). Only X_train/y_train
# are passed in; the test split is not resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [None]:
# Show the label counts of the training split before and after oversampling.
print("Before SMOTE:")
print(y_train.value_counts())
print("After SMOTE:")
print(pd.Series(y_train_resampled).value_counts())

In [None]:
# Rebuild a labelled table from the SMOTE output, then down-sample it to a
# fixed number of rows per class for KNN training.
column_names = [f'feature_{i}' for i in range(X_train_resampled.shape[1])]  # Generate generic column names

# Convert X_train_resampled and y_train_resampled to DataFrame and Series
oversampled_data = pd.concat([
    pd.DataFrame(X_train_resampled, columns=column_names),  # Use generated column names
    pd.Series(y_train_resampled, name='target'),            # Assign 'target' name to y
], axis=1)

# Sample 10,000 rows per class and combine. GroupBy.sample keeps the 'target'
# column in its result, whereas groupby(...).apply(...) relied on the grouping
# column being passed through, which newer pandas releases deprecate.
final_data = (oversampled_data.groupby('target')
                              .sample(n=10000, random_state=42)
                              .reset_index(drop=True)
                              .sample(frac=1, random_state=42))  # Shuffle data

# Split into features and target
X_final = final_data.drop(columns='target')
y_final = final_data['target']

# Verify the distribution
print("Final class distribution:")
print(y_final.value_counts())

In [None]:
# Visualizing feature distributions (histograms)
import matplotlib.pyplot as plt

# X_train / X_test were overwritten with the scaler's output, which has no
# `.columns` attribute. Re-attach the predictor names from X so each histogram
# can be selected and titled by feature.
train_features = pd.DataFrame(X_train, columns=X.columns)
test_features = pd.DataFrame(X_test, columns=X.columns)

for column in train_features.columns:
    plt.figure(figsize=(6, 4))
    plt.hist(train_features[column], bins=30, alpha=0.7, label='Training Data', color='blue', edgecolor='black')
    plt.hist(test_features[column], bins=30, alpha=0.7, label='Testing Data', color='orange', edgecolor='black')
    plt.title(f'Distribution of {column}', fontsize=14)
    plt.xlabel(column, fontsize=12)
    plt.ylabel('Frequency', fontsize=12)
    plt.legend()
    plt.show()


# KNN

In [None]:
# Accumulators filled by the KNN sweep in the next cell:
# results holds (k, accuracy, auc) tuples, roc_data holds (k, fpr, tpr, auc).
results = []
roc_data = []

In [None]:
# Evaluate KNN for k = 2..10, trained on the balanced sample and scored on the
# held-out test split.
for k in range(2, 11):
    knn = KNeighborsClassifier(n_neighbors=k)
    # Fit on plain arrays: X_test is an unlabelled array, so fitting on the
    # labelled frame would make every predict call warn about feature names.
    knn.fit(X_final.to_numpy(), y_final)
    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    # roc_curve sweeps its own thresholds over continuous scores. Passing
    # already-thresholded booleans collapses the curve to a single operating
    # point and distorts the AUC, so the positive-class probability is used.
    y_proba = knn.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_proba)  # ROC components
    roc_auc = auc(fpr, tpr)

    results.append((k, accuracy, roc_auc))
    roc_data.append((k, fpr, tpr, roc_auc))  # Store ROC data for later

In [None]:
results

In [None]:

# Print one summary line (accuracy and AUC, four decimals) per evaluated k.
for k, accuracy, roc_auc in results:
    print(f"k={k}, Accuracy={ accuracy:.4f}, AUC={roc_auc:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Draw every stored ROC curve on one shared axes so the k values can be
# compared directly.
fig, ax = plt.subplots(figsize=(10, 10))

for k, fpr, tpr, roc_auc in roc_data:
    ax.plot(fpr, tpr, label=f"k={k} (AUC = {roc_auc:.2f})")

# Diagonal reference: the curve a random classifier would produce.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guess")

# Titles, axis labels and legend.
ax.set_title("Combined ROC Curves", fontsize=16)
ax.set_xlabel("False Positive Rate", fontsize=12)
ax.set_ylabel("True Positive Rate", fontsize=12)
ax.legend(loc="lower right", fontsize=10)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:

from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Fit one KNN model with a fixed neighbour count and inspect its errors on the
# test split.
n_neighbors = 5  # You can set this to your preferred value
knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Counts of true label (rows) versus predicted label (columns).
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Cancer', 'Cancer'],
    yticklabels=['No Cancer', 'Cancer'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix (n_neighbors={n_neighbors})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Per-class precision / recall / F1 summary.
print(f"Classification Report for n_neighbors={n_neighbors}:\n")
print(classification_report(y_test, y_pred))



In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Sweep n_neighbors and record the accuracy on both splits for each setting.
n_neighbors_range = range(1, 11)  # Adjust range as needed
train_accuracies = []
test_accuracies = []

for n in n_neighbors_range:
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

    # score() is the mean accuracy of the model's own predictions.
    train_accuracies.append(knn.score(X_train, y_train))
    test_accuracies.append(knn.score(X_test, y_test))

# Training versus testing accuracy as a function of n_neighbors.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(n_neighbors_range, train_accuracies, marker='o', linestyle='-', label='Training Accuracy', color='blue')
ax.plot(n_neighbors_range, test_accuracies, marker='o', linestyle='--', label='Testing Accuracy', color='orange')
ax.set_title('Accuracy vs. Number of Neighbors (n_neighbors)', fontsize=16)
ax.set_xlabel('Number of Neighbors (n_neighbors)', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.legend()
ax.grid(True)
plt.show()

# First setting that reaches the highest testing accuracy.
best_n = n_neighbors_range[
    max(range(len(test_accuracies)), key=test_accuracies.__getitem__)
]
print(f"Best n_neighbors based on testing accuracy: {best_n}")

# Refit with that setting and show its confusion matrix.
best_knn = KNeighborsClassifier(n_neighbors=best_n).fit(X_train, y_train)
y_pred = best_knn.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Cancer', 'Cancer'],
    yticklabels=['No Cancer', 'Cancer'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix (Best n_neighbors={best_n})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()



#----------------END WITH CSV-----------------------





#----------------START WITH IMAGES-----------------

In [None]:
# Colab-only: opens a file-upload prompt.
# NOTE(review): presumably used to upload kaggle.json, which the next cell
# moves into /root/.kaggle — confirm.
from google.colab import files
files.upload()


In [None]:
import os

# Create the .kaggle directory
os.makedirs('/root/.kaggle', exist_ok=True)

# Move kaggle.json to the correct directory
!mv kaggle.json /root/.kaggle/

# Set the correct permissions
!chmod 600 /root/.kaggle/kaggle.json


In [None]:
%cd '/content/extracted_data/data'

In [None]:
!kaggle competitions download -c isic-2024-challenge

In [None]:
!unzip "/content/extracted_data/data/isic-2024-challenge.zip"

In [None]:
y!rm "/content/extracted_data/data/isic-2024-challenge.zip"

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from PIL import Image
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.decomposition import PCA
from imblearn.over_sampling import SMOTE
import os
from tqdm import tqdm

# Load the metadata table and shuffle its rows reproducibly.
df = pd.read_csv("/content/extracted_data/data/train-metadata.csv", low_memory=False)
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# head must be called: printing the bare attribute shows the bound method's
# repr instead of the first rows.
print(df.head())
benign = df[df['target'] == 0]
malignant = df[df['target'] == 1]

# Under-sample the benign class to 3000 rows and keep every malignant row.
benign_sampled = benign.sample(n=3000, random_state=42)
subset_df = pd.concat([benign_sampled, malignant])

print(f"Class 0 samples: {len(benign_sampled)}")
print(f"Class 1 samples: {len(malignant)}")
print(f"Total samples: {len(subset_df)}")

# Discard columns that are at least half empty within the sampled subset.
missing_percentages = subset_df.isna().sum().div(len(subset_df)).mul(100)
columns_to_keep = missing_percentages.loc[missing_percentages < 50].index
skin_cancer_df = subset_df[columns_to_keep]

# Diagnosis, identifier and licensing columns that must not be used as features.
features_to_drop = [
    'iddx_full',
    'iddx_1',
    'iddx_2',
    'iddx_3',
    'iddx_4',
    'iddx_5',
    'patient_id',
    'lesion_id',
    'attribution',
    'copyright_license'
]

# The missing-value filter above may already have removed some of these (which
# ones depends on the seed and shuffle), so only drop columns still present.
for col in features_to_drop:
    if col not in skin_cancer_df.columns:
        continue
    print(f"Dropping column: {col}")
    skin_cancer_df = skin_cancer_df.drop(columns=col)

numeric_columns = skin_cancer_df.select_dtypes(include=['int64', 'float64']).columns
categorical_columns = skin_cancer_df.select_dtypes(include=['object', 'category']).columns

# Impute: median for numeric columns, most frequent value for categorical ones.
skin_cancer_df[numeric_columns] = skin_cancer_df[numeric_columns].fillna(
    skin_cancer_df[numeric_columns].median()
)
skin_cancer_df[categorical_columns] = skin_cancer_df[categorical_columns].fillna(
    skin_cancer_df[categorical_columns].mode().iloc[0]
)

# Integer-encode every categorical column except the image identifier, which is
# still needed as text to locate the image files.
for col in categorical_columns.drop('isic_id', errors='ignore'):
    le = LabelEncoder()
    skin_cancer_df[col] = le.fit_transform(skin_cancer_df[col])

# Load the image for every metadata row, resize it to 128x128 RGB and store it
# as one flattened row of image_features.
expected_samples = len(skin_cancer_df)
image_features = np.zeros((expected_samples, 128 * 128 * 3))
# Row positions whose image was found and decoded. Rows without an image are
# dropped from image_features below, so downstream code must select the same
# positions from the metadata to stay aligned.
valid_indices = []

for idx, isic_id in tqdm(enumerate(skin_cancer_df['isic_id']), total=expected_samples):
    file_path = os.path.join("/content/extracted_data/data/train-image/image", f"{isic_id}.jpg")
    if not os.path.exists(file_path):
        continue
    try:
        # Force three channels so grayscale or RGBA files still flatten to
        # exactly 128 * 128 * 3 values.
        with Image.open(file_path) as img:
            img_resized = np.array(img.convert("RGB").resize((128, 128)))
        image_features[idx] = img_resized.reshape(-1)
        valid_indices.append(idx)
    except (OSError, ValueError) as e:
        # Best effort: report unreadable or malformed images and move on.
        print(f"Error processing {isic_id}: {str(e)}")
        continue

if not valid_indices:
    raise ValueError("No valid images were processed")

# Keep only the rows that actually received pixel data.
image_features = image_features[valid_indices]

# Rescale pixel values by 1/255 (assumes 8-bit pixel intensities — TODO confirm)
# and compress each flattened image to n_components principal components.
image_features_scaled = image_features / 255.0
n_components = 150
pca = PCA(n_components=n_components, random_state=42)
image_features_pca = pca.fit_transform(image_features_scaled)

# One column per principal component: pca_0 ... pca_{n_components-1}.
pca_columns = [f'pca_{i}' for i in range(n_components)]
image_df = pd.DataFrame(image_features_pca, columns=pca_columns)

# Cumulative explained-variance ratio; the last entry is the total retained.
explained_variance = np.cumsum(pca.explained_variance_ratio_)
print(f"\nVariance explained by {n_components} components: {explained_variance[-1]}")

# image_df only has rows for the images that loaded (valid_indices), so select
# the same metadata rows first. Without this the two tables misalign whenever
# an image was skipped, and the concat pads the gap with NaN.
metadata_df = skin_cancer_df.iloc[valid_indices].reset_index(drop=True)

# Final feature table: metadata columns (minus id and label) + PCA components.
X = pd.concat([
    metadata_df.drop(['isic_id', 'target'], axis=1),
    image_df
], axis=1)
y = metadata_df['target']

# Stratified 80/20 split so both parts keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [None]:
# Mean accuracy on the train and test splits for each neighbour count 2..9,
# keyed by the neighbour count.
train_score = {}
test_score = {}
n_neighbors = np.arange(2, 10, 1)
for neighbor in n_neighbors:
    knn = KNeighborsClassifier(n_neighbors=neighbor).fit(X_train, y_train)
    train_score[neighbor] = knn.score(X_train, y_train)
    test_score[neighbor] = knn.score(X_test, y_test)

In [None]:
# Plot the train and test accuracies collected in train_score / test_score
# against the neighbour counts in n_neighbors.
plt.plot(n_neighbors, train_score.values(), label="Train Accuracy")
plt.plot(n_neighbors, test_score.values(), label="Test Accuracy")
plt.xlabel("Number Of Neighbors")
plt.ylabel("Accuracy")
plt.title("KNN: Varying number of Neighbors")
plt.legend()
# NOTE(review): both axis limits are hard-coded. The x-range 0-33 is much wider
# than the neighbour counts built in the sweep cell, and accuracies outside
# 0.60-0.90 would be cut off — confirm these limits are intended.
plt.xlim(0, 33)
plt.ylim(0.60, 0.90)
plt.grid()
plt.show()

In [None]:
# Visualizing feature distributions (histograms)
import matplotlib.pyplot as plt

# One figure per feature, overlaying the training and testing splits.
for column in X_train.columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    ax.hist(X_train[column], bins=30, alpha=0.7, label='Training Data', color='blue', edgecolor='black')
    ax.hist(X_test[column], bins=30, alpha=0.7, label='Testing Data', color='orange', edgecolor='black')
    ax.set_title(f'Distribution of {column}', fontsize=14)
    ax.set_xlabel(column, fontsize=12)
    ax.set_ylabel('Frequency', fontsize=12)
    ax.legend()
    plt.show()


In [None]:
# Evaluate KNN for k = 2..10: accuracy from hard predictions, ROC/AUC from the
# predicted probability of class 1.
results, roc_data = [], []
for k in range(2, 11):
    knn = KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    y_proba = knn.predict_proba(X_test)[:, 1]
    fpr, tpr, thresholds = roc_curve(y_test, y_proba)  # ROC components
    roc_auc = auc(fpr, tpr)

    results.append((k, accuracy, roc_auc))
    roc_data.append((k, fpr, tpr, roc_auc))

In [None]:
# Print one summary line (AUC and accuracy, four decimals) per evaluated k.
for k, accuracy, roc_auc in results:
    print(f"k={k}, AUC={roc_auc:.4f}, Accuracy={accuracy:.4f}")

In [None]:
import matplotlib.pyplot as plt

# Overlay the ROC curve of every evaluated k on a single axes.
fig, ax = plt.subplots(figsize=(10, 10))

for k, fpr, tpr, roc_auc in roc_data:
    ax.plot(fpr, tpr, label=f"k={k} (AUC = {roc_auc:.2f})")

# Diagonal reference: the curve a random classifier would produce.
ax.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Guess")

# Titles, axis labels and legend.
ax.set_title("Combined ROC Curves", fontsize=16)
ax.set_xlabel("False Positive Rate", fontsize=12)
ax.set_ylabel("True Positive Rate", fontsize=12)
ax.legend(loc="lower right", fontsize=10)
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
import seaborn as sns

# Fit one KNN model with a fixed neighbour count and inspect its errors on the
# test split.
n_neighbors = 5  # You can set this to your preferred value
knn = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X_train, y_train)
y_pred = knn.predict(X_test)

# Counts of true label (rows) versus predicted label (columns).
cm = confusion_matrix(y_test, y_pred)

# Render the matrix as an annotated heatmap.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Cancer', 'Cancer'],
    yticklabels=['No Cancer', 'Cancer'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix (n_neighbors={n_neighbors})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

# Per-class precision / recall / F1 summary.
print(f"Classification Report for n_neighbors={n_neighbors}:\n")
print(classification_report(y_test, y_pred))


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns

# Sweep n_neighbors and record the accuracy on both splits for each setting.
n_neighbors_range = range(1, 11)  # Adjust range as needed
train_accuracies = []
test_accuracies = []

for n in n_neighbors_range:
    knn = KNeighborsClassifier(n_neighbors=n).fit(X_train, y_train)

    # score() is the mean accuracy of the model's own predictions.
    train_accuracies.append(knn.score(X_train, y_train))
    test_accuracies.append(knn.score(X_test, y_test))

# Training versus testing accuracy as a function of n_neighbors.
fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(n_neighbors_range, train_accuracies, marker='o', linestyle='-', label='Training Accuracy', color='blue')
ax.plot(n_neighbors_range, test_accuracies, marker='o', linestyle='--', label='Testing Accuracy', color='orange')
ax.set_title('Accuracy vs. Number of Neighbors (n_neighbors)', fontsize=16)
ax.set_xlabel('Number of Neighbors (n_neighbors)', fontsize=12)
ax.set_ylabel('Accuracy', fontsize=12)
ax.legend()
ax.grid(True)
plt.show()

# First setting that reaches the highest testing accuracy.
best_n = n_neighbors_range[
    max(range(len(test_accuracies)), key=test_accuracies.__getitem__)
]
print(f"Best n_neighbors based on testing accuracy: {best_n}")

# Refit with that setting and show its confusion matrix.
best_knn = KNeighborsClassifier(n_neighbors=best_n).fit(X_train, y_train)
y_pred = best_knn.predict(X_test)

cm = confusion_matrix(y_test, y_pred)

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['No Cancer', 'Cancer'],
    yticklabels=['No Cancer', 'Cancer'],
    ax=ax,
)
ax.set_title(f'Confusion Matrix (Best n_neighbors={best_n})')
ax.set_ylabel('True Label')
ax.set_xlabel('Predicted Label')
plt.show()

