In [30]:
import os
from pathlib import Path
import pandas as pd
from sklearn.preprocessing import LabelEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder, StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
import hvplot.pandas

In [2]:
# Load your dataset
# Replace 'your_dataset.csv' with the actual file path or URL
data = Path("../Resources/cleaned_skin_metadata.csv")
df = pd.read_csv(data)
df.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis,result,result_label
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,Benign keratosis-like lesions,0,Benign


In [3]:
# Drop columns that are not needed for modeling
df = df.drop(['result_label'], axis=1)

In [4]:
# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
df['lesion_id'] = label_encoder.fit_transform(df['lesion_id'])
df['image_id'] = label_encoder.fit_transform(df['image_id'])
df['dx'] = label_encoder.fit_transform(df['dx'])
df['dx_type'] = label_encoder.fit_transform(df['dx_type'])
df['diagnosis'] = label_encoder.fit_transform(df['diagnosis'])

df['sex'] = label_encoder.fit_transform(df['sex'])
df['localization'] = label_encoder.fit_transform(df['localization'])

In [5]:
df.head(20)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis,result
0,118,3113,2,3,80.0,1,11,2,0
1,118,724,2,3,80.0,1,11,2,0
2,2710,2463,2,3,80.0,1,11,2,0
3,2710,1355,2,3,80.0,1,11,2,0
4,1460,7327,2,3,75.0,1,4,2,0
5,1460,3544,2,3,75.0,1,4,2,0
6,2741,4870,2,3,60.0,1,5,2,0
7,2741,4762,2,3,60.0,1,5,2,0
8,5064,1531,2,3,70.0,0,2,2,0
9,5064,903,2,3,70.0,0,2,2,0


In [6]:
# Standardize all columns using StandardScaler
scaler = StandardScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)
df_scaled.head()

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis,result
0,-1.680017,-0.655118,-1.496108,0.835507,1.662953,0.882321,0.847871,-1.343453,-0.51447
1,-1.680017,-1.481453,-1.496108,0.835507,1.662953,0.882321,0.847871,-1.343453,-0.51447
2,-0.474447,-0.879948,-1.496108,0.835507,1.662953,0.882321,0.847871,-1.343453,-0.51447
3,-0.474447,-1.263195,-1.496108,0.835507,1.662953,0.882321,0.847871,-1.343453,-0.51447
4,-1.055837,0.802468,-1.496108,0.835507,1.367434,0.882321,-0.626664,-1.343453,-0.51447


In [20]:
# Assuming 'result_label' is the target variable
X = df_scaled.drop('result', axis=1)
X = df_scaled.drop('result', axis=1)
y = df['result']

In [21]:
# Split our data into training and testing
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [22]:
# Create an SVM model
svm_model = SVC(kernel='linear',  random_state=1)

In [23]:
# Train the model using the training data
svm_model.fit(X_train, y_train)

In [24]:
# Score the model using the test data
print(f"Training Data Score: {svm_model.score(X_train, y_train)}")
print(f"Testing Data Score: {svm_model.score(X_test, y_test)}")

Training Data Score: 0.982958327785914
Testing Data Score: 0.9884185303514377


In [25]:
# Make predictions
predictions = svm_model.predict(X_test)

In [26]:
# Calculate the Accuracy Score
accuracy = accuracy_score(y_test, predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9884185303514377


In [35]:
# # Compute confusion matrix
# conf_matrix = confusion_matrix(y_test, predictions)

# # Display the confusion matrix using seaborn
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Predicted Benign', 'Predicted Malignant'], yticklabels=['Actual Benign', 'Actual Malignant'])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

In [34]:
# Compute confusion matrix
conf_matrix_df = pd.DataFrame(conf_matrix, index=['Actual Benign', 'Actual Malignant'], columns=['Predicted Benign', 'Predicted Malignant'])

# Use hvplot for visualization
conf_matrix_df.hvplot.heatmap(cmap="Blues", width=600, height=300, xlabel='Predicted', ylabel='Actual', title='Confusion Matrix')

In [28]:
# Display the classification report
classification_report_result = classification_report(y_test, predictions)
print('Classification Report:')
print(classification_report_result)

Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99      1980
           1       1.00      0.94      0.97       524

    accuracy                           0.99      2504
   macro avg       0.99      0.97      0.98      2504
weighted avg       0.99      0.99      0.99      2504

