In [1]:
import os
from pathlib import Path

import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from imblearn.under_sampling import RandomUnderSampler
from joblib import dump, load

import hvplot.pandas
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the cleaned skin-lesion metadata CSV into a DataFrame and preview it.
data = Path("../Resources/cleaned_skin_metadata.csv")
df = pd.read_csv(filepath_or_buffer=data)
df.head(5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis,result,result_label
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp,Benign keratosis-like lesions,0,Benign
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear,Benign keratosis-like lesions,0,Benign


In [3]:
# Build the modelling frame: remove the identifier columns and the
# diagnosis/label text columns, keeping the features and the 'result' target.
columns_to_drop = ['result_label', 'lesion_id', 'image_id', 'dx', 'diagnosis']
df1 = df.drop(columns=columns_to_drop)
df1.head()

Unnamed: 0,dx_type,age,sex,localization,result
0,histo,80.0,male,scalp,0
1,histo,80.0,male,scalp,0
2,histo,80.0,male,scalp,0
3,histo,80.0,male,scalp,0
4,histo,75.0,male,ear,0


In [4]:
# Text columns whose values are normalised to sentence case
# (first character upper-case, the rest lower-case).
columns_to_convert = ['dx_type', 'sex', 'localization']

# Apply the conversion to all listed columns in one vectorised step
df1[columns_to_convert] = df1[columns_to_convert].apply(
    lambda series: series.str.capitalize()
)

df1.head()

Unnamed: 0,dx_type,age,sex,localization,result
0,Histo,80.0,Male,Scalp,0
1,Histo,80.0,Male,Scalp,0
2,Histo,80.0,Male,Scalp,0
3,Histo,80.0,Male,Scalp,0
4,Histo,75.0,Male,Ear,0


In [5]:
# Distinct 'sex' values as they appear in the originally loaded frame `df`
# (i.e. before the sentence-case conversion applied to `df1`).
unique_sex_values = pd.unique(df['sex'])
print(unique_sex_values)

['male' 'female' 'unknown']


In [6]:
# Distinct 'localization' values as they appear in the originally loaded frame `df`
# (i.e. before the sentence-case conversion applied to `df1`).
unique_loc_values = pd.unique(df['localization'])
print(unique_loc_values)

['scalp' 'ear' 'face' 'back' 'trunk' 'chest' 'upper extremity' 'abdomen'
 'unknown' 'lower extremity' 'genital' 'neck' 'hand' 'foot' 'acral']


In [7]:
# Label-encode 'dx_type' against a fixed, explicit vocabulary so the saved
# encoder always maps the same category to the same integer code.
# (The one-hot encoding of 'sex' and 'localization' happens in a later cell.)
dx_types = ['Histo', 'Follow_up', 'Consensus', 'Confocal']
label_encoder = LabelEncoder()
label_encoder.fit(dx_types)

# Replace the text categories with their integer codes. transform() raises a
# ValueError for any value that is not in `dx_types`.
df1['dx_type'] = label_encoder.transform(df1['dx_type'])

# Save the fitted LabelEncoder
dump(label_encoder, 'dx_type_label_encoder.joblib')

['dx_type_label_encoder.joblib']

In [8]:
# diagnosis_label_encoder = LabelEncoder()
# # Assuming diagnosis_types contains all categories for 'diagnosis'
# diagnosis_types = ['Actinic k','Benign ke','Actinic keratoses','Benign keratosis-like lesions ',  'Basal cell carcinoma', 'Dermatofibroma', 'Melanocytic nevi', 'Melanoma', 'Vascular lesions']
# diagnosis_label_encoder.fit(diagnosis_types)

# df1['diagnosis'] = diagnosis_label_encoder.transform(df1['diagnosis'])

# # Save the fitted LabelEncoder
# dump(diagnosis_label_encoder, 'diagnosis_label_encoder.joblib')

In [9]:
# # Label encode ordinal categorical columns
# df1['dx_type'] = label_encoder.fit_transform(df1['dx_type'])
# df1['diagnosis'] = label_encoder.fit_transform(df1['diagnosis'])

In [10]:
# Expand each nominal column into one indicator column per category
nominal_columns = ['sex', 'localization']
df1 = pd.get_dummies(data=df1, columns=nominal_columns)

In [11]:
# Preview the frame after one-hot encoding of 'sex' and 'localization'
df1.head()

Unnamed: 0,dx_type,age,result,sex_Female,sex_Male,sex_Unknown,localization_Abdomen,localization_Acral,localization_Back,localization_Chest,...,localization_Face,localization_Foot,localization_Genital,localization_Hand,localization_Lower extremity,localization_Neck,localization_Scalp,localization_Trunk,localization_Unknown,localization_Upper extremity
0,3,80.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,3,80.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2,3,80.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,3,80.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
4,3,75.0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# df1.rename(columns={'sex_female': 'female', 'sex_male': 'male', 'sex_unknown': 'unknown', 'localization_abdomen': 'abdomen', 'localization_acral': 'acral', 'localization_back': 'back', 'localization_chest': 'chest', 'localization_ear': 'ear', 'localization_face': 'face', 'localization_foot': 'foot', 'localization_genital': 'genital', 'localization_hand': 'hand', 'localization_lower extremity': 'lower extremity', 'localization_neck': 'neck', 'localization_scalp': 'scalp', 'localization_trunk': 'trunk', 'localization_unknown': 'unknown', 'localization_upper extremity': 'upper extremity'}, inplace=True)
# df1.head()

In [13]:
# Write the encoded frame to CSV; the row index is not written out.
csv_file_path = '../Resources//df1.csv'
df1.to_csv(path_or_buf=csv_file_path, index=False)

In [14]:
# Report, for every column, how many distinct values it holds and what they are.
for column, values in df1.items():
    unique_classes = values.unique()
    print(f'Column "{column}": {len(unique_classes)} unique classes - {unique_classes}')

Column "dx_type": 4 unique classes - [3 1 0 2]
Column "age": 19 unique classes - [80.         75.         60.         70.         55.         85.
 65.         40.         50.         45.         35.          0.
 30.         51.86382808  5.         25.         20.         10.
 15.        ]
Column "result": 2 unique classes - [0 1]
Column "sex_Female": 2 unique classes - [0 1]
Column "sex_Male": 2 unique classes - [1 0]
Column "sex_Unknown": 2 unique classes - [0 1]
Column "localization_Abdomen": 2 unique classes - [0 1]
Column "localization_Acral": 2 unique classes - [0 1]
Column "localization_Back": 2 unique classes - [0 1]
Column "localization_Chest": 2 unique classes - [0 1]
Column "localization_Ear": 2 unique classes - [0 1]
Column "localization_Face": 2 unique classes - [0 1]
Column "localization_Foot": 2 unique classes - [0 1]
Column "localization_Genital": 2 unique classes - [0 1]
Column "localization_Hand": 2 unique classes - [0 1]
Column "localization_Lower extremity": 2 unique

In [15]:
# Summarise column dtypes and non-null counts of the encoded frame
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10015 entries, 0 to 10014
Data columns (total 21 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   dx_type                       10015 non-null  int32  
 1   age                           10015 non-null  float64
 2   result                        10015 non-null  int64  
 3   sex_Female                    10015 non-null  uint8  
 4   sex_Male                      10015 non-null  uint8  
 5   sex_Unknown                   10015 non-null  uint8  
 6   localization_Abdomen          10015 non-null  uint8  
 7   localization_Acral            10015 non-null  uint8  
 8   localization_Back             10015 non-null  uint8  
 9   localization_Chest            10015 non-null  uint8  
 10  localization_Ear              10015 non-null  uint8  
 11  localization_Face             10015 non-null  uint8  
 12  localization_Foot             10015 non-null  uint8  
 13  l

In [16]:
# Initialize the StandardScaler used for the 'age' feature
scaler = StandardScaler()

# Scale only the 'age' column (the other feature columns are integer codes
# or 0/1 indicators and are left untouched).
# NOTE(review): the scaler is fitted on every row, i.e. before the train/test
# split performed later in the notebook, so test-set statistics influence the
# scaling. Consider fitting on the training split only.
df1['age'] = scaler.fit_transform(df1[['age']])

# Save the fitted StandardScaler
dump(scaler, 'age_scaler.joblib')

# Display the first few rows of the updated DataFrame. This must be the last
# expression in the cell; placed mid-cell it is evaluated but never shown.
df1.head()

['age_scaler.joblib']

In [17]:
# Standardize all columns of df1 with a *separate* StandardScaler.
# A distinct variable name is used on purpose: rebinding `scaler` here would
# replace the age-only scaler fitted above, and the object later saved as
# 'svm_scaler.joblib' would then be one fitted on every column (including the
# target) rather than the scaler actually used to prepare the model's features.
# Note: df1 still contains the target column 'result', so it is standardized
# in df_scaled as well; df_scaled is for inspection only.
all_columns_scaler = StandardScaler()
df_scaled = pd.DataFrame(all_columns_scaler.fit_transform(df1), columns=df1.columns)
df_scaled.head()

Unnamed: 0,dx_type,age,result,sex_Female,sex_Male,sex_Unknown,localization_Abdomen,localization_Acral,localization_Back,localization_Chest,...,localization_Face,localization_Foot,localization_Genital,localization_Hand,localization_Lower extremity,localization_Neck,localization_Scalp,localization_Trunk,localization_Unknown,localization_Upper extremity
0,0.835507,1.662953,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,-0.529339,-0.205817,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
1,0.835507,1.662953,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,-0.529339,-0.205817,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
2,0.835507,1.662953,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,-0.529339,-0.205817,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
3,0.835507,1.662953,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,-0.529339,-0.205817,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,8.788753,-0.403791,-0.154674,-0.354486
4,0.835507,1.367434,-0.51447,-0.912821,0.923348,-0.075657,-0.337111,-0.026447,-0.529339,-0.205817,...,-0.28349,-0.181384,-0.069397,-0.095226,-0.51152,-0.130618,-0.113782,-0.403791,-0.154674,-0.354486


In [18]:
# 'result' is the target variable; every other column of df1 is a feature.
# Both X and y are taken from df1 so that features and target are guaranteed
# to come from the same frame (same rows, same index).
X = df1.drop(columns=['result'])
y = df1['result']

In [19]:
# # Assuming 'result' is the target variable
# X = df_scaled.drop(['result', 'dx', 'dx_type', 'diagnosis'], axis=1)
# y = df['result']

In [20]:
# # Apply random undersampling before splitting into training and testing sets
# undersampler = RandomUnderSampler(sampling_strategy='majority', random_state=1)
# X_resampled, y_resampled = undersampler.fit_resample(X, y)

In [22]:
# Partition features and target into train and test subsets.
# `stratify=y` keeps the class proportions of `y` in both subsets;
# `random_state=1` makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=1,
)


In [23]:
# Randomly undersample the majority class, on the training split only;
# the test split is left as it is.
undersampler = RandomUnderSampler(random_state=1, sampling_strategy='majority')
X_train_resampled, y_train_resampled = undersampler.fit_resample(X_train, y_train)

In [24]:
# Instantiate a support-vector classifier with scikit-learn's default settings.
# Alternative configuration kept for reference:
# svm_model = SVC(kernel='linear', class_weight='balanced', random_state=1)
svm_model = SVC()

# Fit the classifier on the undersampled training split
svm_model.fit(X=X_train_resampled, y=y_train_resampled)

In [25]:
# Mean accuracy on the (undersampled) training split and on the untouched test split
train_score = svm_model.score(X_train_resampled, y_train_resampled)
test_score = svm_model.score(X_test, y_test)
print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.8085241730279898
Testing Data Score: 0.7376198083067093


In [26]:
# Predict the class of every row in the test split
predictions = svm_model.predict(X=X_test)

In [27]:
# Fraction of test rows whose predicted class matches the true class
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7376198083067093


In [28]:
# # confusion matrix
# conf_matrix = confusion_matrix(y_test, predictions)

# # Display the confusion matrix using seaborn
# plt.figure(figsize=(8, 6))
# sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=['Predicted Benign', 'Predicted Malignant'], yticklabels=['Actual Benign', 'Actual Malignant'])
# plt.xlabel('Predicted')
# plt.ylabel('Actual')
# plt.title('Confusion Matrix')
# plt.show()

In [29]:
# Confusion matrix of true vs. predicted classes on the test split
conf_matrix = confusion_matrix(y_test, predictions)

# Wrap it in a labelled DataFrame (rows = actual class, columns = predicted class)
actual_labels = ['Actual Benign', 'Actual Malignant']
predicted_labels = ['Predicted Benign', 'Predicted Malignant']
conf_matrix_df = pd.DataFrame(conf_matrix, index=actual_labels, columns=predicted_labels)

# Render the matrix as an hvplot heatmap
conf_matrix_df.hvplot.heatmap(
    cmap="Blues",
    width=600,
    height=300,
    xlabel='Predicted',
    ylabel='Actual',
    title='Confusion Matrix',
)

In [30]:
# Show the labelled confusion matrix as plain text under a heading
print("Confusion Matrix:", conf_matrix_df, sep="\n")

Confusion Matrix:
                  Predicted Benign  Predicted Malignant
Actual Benign                 1365                  615
Actual Malignant                42                  482


In [39]:
# Per-class precision / recall / F1 on the test split, with readable class names
target_names = ["Benign", "Malignant"]
classification_report_result = classification_report(
    y_test,
    predictions,
    target_names=target_names,
)
print('Classification Report:', classification_report_result, sep='\n')

Classification Report:
              precision    recall  f1-score   support

      Benign       0.97      0.69      0.81      1980
   Malignant       0.44      0.92      0.59       524

    accuracy                           0.74      2504
   macro avg       0.70      0.80      0.70      2504
weighted avg       0.86      0.74      0.76      2504



In [32]:
# Save the trained SVM model to disk.
# Uses `dump`, already imported from joblib in the notebook's import cell,
# instead of importing joblib again in the middle of the notebook.
dump(svm_model, 'svm_model.joblib')

['svm_model.joblib']

In [33]:
# Read the persisted SVM back from disk (round-trip check of the saved file)
loaded_model = load('svm_model.joblib')

In [34]:
# Number of rows per target class in the full (un-resampled) frame
counts = df1.loc[:, 'result'].value_counts()

# Display the counts
print(counts)

0    7919
1    2096
Name: result, dtype: int64


In [35]:
# Persist the StandardScaler currently bound to `scaler`.
# NOTE(review): `scaler` is whichever StandardScaler was assigned last in the
# notebook - confirm it is the one fitted on 'age' only before using this file.
dump(scaler, 'svm_scaler.joblib')

['svm_scaler.joblib']

In [36]:
# Persist the fitted dx_type LabelEncoder under the SVM-specific file name
dump(label_encoder, 'svm_label_encoder.joblib')

['svm_label_encoder.joblib']

In [37]:
# Write the fully processed frame (encoded columns, standardized 'age') to CSV
# in the working directory; the row index is not written out.
df1.to_csv(path_or_buf="processed_data.csv", index=False)

In [38]:
# Persist the same fitted dx_type LabelEncoder under a second file name
dump(label_encoder, 'label_encoder_dx_type.joblib')

['label_encoder_dx_type.joblib']