In [59]:
import os
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt

In [60]:
# Load your dataset
data = Path("../Resources/cleaned_skin_metadata.csv")
df = pd.read_csv(data)
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis,result,result_label
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,Benign keratosis-like lesions,0,Benign


In [61]:
# Drop columns that are not needed for modeling
df = df.drop(['result_label','lesion_id','image_id'], axis=1)

In [62]:
unique_sex_values = df['sex'].unique()
print(unique_sex_values)

['male' 'female' 'unknown']


In [63]:
unique_loc_values = df['localization'].unique()
print(unique_loc_values)

['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'unknown' 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']


In [64]:
# Encode categorical variables using LabelEncoder and One-Hot Encoding
label_encoder = LabelEncoder()

# Label encode ordinal categorical columns
df['dx'] = label_encoder.fit_transform(df['dx'])
df['dx_type'] = label_encoder.fit_transform(df['dx_type'])
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

In [65]:
# One-hot encode nominal categorical columns
df = pd.get_dummies(df, columns=['sex', 'localization'])

In [66]:
df.head()

Unnamed: 0,dx,dx_type,age,diagnosis,result,sex_female,sex_male,sex_unknown,localization_abdomen,localization_acral,...,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,2,3,80.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,2,3,80.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,2,3,80.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,2,3,80.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,2,3,75.0,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 23 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   dx                            10015 non-null  int32  
 1   dx_type                       10015 non-null  int32  
 2   age                           10015 non-null  float64
 3   diagnosis                     10015 non-null  int32  
 4   result                        10015 non-null  int64  
 5   sex_female                    10015 non-null  uint8  
 6   sex_male                      10015 non-null  uint8  
 7   sex_unknown                   10015 non-null  uint8  
 8   localization_abdomen          10015 non-null  uint8  
 9   localization_acral            10015 non-null  uint8  
 10  localization_back             10015 non-null  uint8  
 11  localization_chest            10015 non-null  uint8  
 12  localization_ear              10015 non-null  uint8  
 13  l

In [68]:
# Standardize all columns using StandardScaler
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled.head()

Unnamed: 0,dx,dx_type,age,diagnosis,result,sex_female,sex_male,sex_unknown,localization_abdomen,localization_acral,...,localization_face,localization_foot,localization_genital,localization_hand,localization_lower extremity,localization_neck,localization_scalp,localization_trunk,localization_unknown,localization_upper extremity
0,-1.496108,0.835507,1.662953,-1.343453,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
1,-1.496108,0.835507,1.662953,-1.343453,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
2,-1.496108,0.835507,1.662953,-1.343453,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
3,-1.496108,0.835507,1.662953,-1.343453,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
4,-1.496108,0.835507,1.367434,-1.343453,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,-0.113782,-0.403791,-0.154674,-0.354486


In [69]:
# Assuming 'result' is the target variable
X = df_scaled.drop(['result', 'dx'], axis=1)
y = df['result']

In [70]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [71]:
# Create an SVM model
svm_model = SVC()
# svm_model = SVC(kernel='linear', class_weight='balanced', random_state=1)

# Train the model using the training data
svm_model.fit(X_train, y_train)

In [72]:
# Score the model using the test data
print(f"Training Data Score: {svm_model.score(X_train, y_train)}")
print(f"Testing Data Score: {svm_model.score(X_test, y_test)}")

Training Data Score: 0.9933430967913727
Testing Data Score: 0.990814696485623


In [73]:
# Make predictions
predictions = svm_model.predict(X_test)

In [74]:
# Calculate the Accuracy Score
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.990814696485623


In [75]:
# # confusion matrix
# conf_matrix = confusion_matrix(y_test, predictions)

# # Display the confusion matrix using seaborn
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Predicted Benign', 'Predicted Malignant'], yticklabels=['Actual Benign', 'Actual Malignant'])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

In [76]:
# confusion matrix
conf_matrix = confusion_matrix(y_test, predictions)

# hvplot for visualization
conf_matrix_df = pd.DataFrame(conf_matrix, index=['Actual Benign', 'Actual Malignant'], columns=['Predicted Benign', 'Predicted Malignant'])
conf_matrix_df.hvplot.heatmap(cmap="Blues", width=600, height=300, xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')

In [77]:
# Print the DataFrame
print("Confusion Matrix:")
print(conf_matrix_df)

Confusion Matrix:
                  Predicted Benign  Predicted Malignant
Actual Benign                 1980                    0
Actual Malignant                23                  501


In [78]:
# Display the classification report
classification_report_result = classification_report(y_test, predictions)
print('Classification Report:')
print(classification_report_result)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1980
           1       1.00      0.96      0.98       524

    accuracy                           0.99      2504
   macro avg       0.99      0.98      0.99      2504
weighted avg       0.99      0.99      0.99      2504

