In [1]:
# Import modules
from warnings import simplefilter
# Installed before the remaining imports so the filter is already active
# while they load.
simplefilter(action='ignore', category=FutureWarning)

from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_curve
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the lesion metadata CSV into a pandas DataFrame
metadata_csv = Path("Resources/HAM10000_metadata.csv")
skin_metadata_df = pd.read_csv(metadata_csv)

# Show the first five rows as a quick sanity check
skin_metadata_df.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
# Human-readable name for each short `dx` diagnosis code.
# Labels carry no leading/trailing whitespace so they compare and group cleanly.
skin_metadata_df_dict = {
    'nv': 'Melanocytic nevi',
    'mel': 'Melanoma',
    'bkl': 'Benign keratosis-like lesions',
    'bcc': 'Basal cell carcinoma',
    'akiec': 'Actinic keratoses',
    'vasc': 'Vascular lesions',
    'df': 'Dermatofibroma'
}

# Translate every `dx` code into its label and store it in a new `diagnosis`
# column. A code that is missing from the mapping becomes NaN.
skin_metadata_df['diagnosis'] = skin_metadata_df['dx'].map(skin_metadata_df_dict)

In [4]:
# Tally how many rows carry each human-readable diagnosis label
diagnosis_labels = skin_metadata_df['diagnosis']
diagnosis_labels.value_counts()

diagnosis
Melanocytic nevi                  6705
Melanoma                          1113
Benign keratosis-like lesions     1099
Basal cell carcinoma               514
Actinic keratoses                  327
Vascular lesions                   142
Dermatofibroma                     115
Name: count, dtype: int64

## Step 1: Preparing the Data

In [5]:
# One LabelEncoder instance shared by the encoding cells that follow. Each of
# those cells calls fit_transform on it, which refits it to that cell's column.
label_encoder = preprocessing.LabelEncoder()

In [6]:
# Work on an independent copy so the encoding steps leave skin_metadata_df intact
df1 = skin_metadata_df.copy(deep=True)

In [7]:
# Integer-encode `lesion_id` and wrap the codes in a one-column DataFrame
lesion_id_cat = pd.DataFrame(
    {'lesion_id_cat': label_encoder.fit_transform(df1['lesion_id'])}
)

In [8]:
# Integer-encode `image_id` and wrap the codes in a one-column DataFrame
image_id_cat = pd.DataFrame(
    {'image_id_cat': label_encoder.fit_transform(df1['image_id'])}
)

In [9]:
# Integer-encode `dx` and wrap the codes in a one-column DataFrame
dx_cat = pd.DataFrame(
    {'dx_cat': label_encoder.fit_transform(df1['dx'])}
)

In [10]:
# Integer-encode `dx_type` and wrap the codes in a one-column DataFrame
dx_type_cat = pd.DataFrame(
    {'dx_type_cat': label_encoder.fit_transform(df1['dx_type'])}
)

In [11]:
# Integer-encode `sex` and wrap the codes in a one-column DataFrame
sex_cat = pd.DataFrame(
    {'sex_cat': label_encoder.fit_transform(df1['sex'])}
)

In [12]:
# Integer-encode `localization` and wrap the codes in a one-column DataFrame
localization_cat = pd.DataFrame(
    {'localization_cat': label_encoder.fit_transform(df1['localization'])}
)

In [13]:
# Integer-encode `diagnosis` and wrap the codes in a one-column DataFrame
diagnosis_cat = pd.DataFrame(
    {'diagnosis_cat': label_encoder.fit_transform(df1['diagnosis'])}
)

In [14]:
# Replace each original column of df1 with its integer-encoded counterpart.
# Keys are the df1 column names; values are the one-column frames built above.
encoded_frames = {
    'lesion_id': lesion_id_cat,
    'image_id': image_id_cat,
    'dx': dx_cat,
    'dx_type': dx_type_cat,
    'sex': sex_cat,
    'localization': localization_cat,
    'diagnosis': diagnosis_cat,
}
for column_name, encoded_frame in encoded_frames.items():
    df1[column_name] = encoded_frame

In [15]:
# Inspect the first five rows of the encoded frame
df1.head(n=5)

Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization,diagnosis
0,118,3113,2,3,80.0,1,11,2
1,118,724,2,3,80.0,1,11,2
2,2710,2463,2,3,80.0,1,11,2
3,2710,1355,2,3,80.0,1,11,2
4,1460,7327,2,3,75.0,1,4,2


## Create a column for diagnosis where 1 = cancer and 0 = benign.

In [None]:
# `df` is the working frame for the binary model. It is a copy of the raw
# metadata because the steps below need the original string values of `dx`,
# `sex` and `localization`; the copy also keeps the text `diagnosis` labels in
# skin_metadata_df untouched when the column is overwritten here.
df = skin_metadata_df.copy()

# `dx` codes labelled 0 (benign); every other code is labelled 1.
# NOTE(review): 'vasc' (vascular lesions) is not in this list, so it is
# labelled 1 — confirm that is the intended grouping.
BENIGN_DX_CODES = ['nv', 'bkl', 'df']
df['diagnosis'] = np.where(df['dx'].isin(BENIGN_DX_CODES), 0, 1)

# Display the first and last five rows with the new column
pd.concat([df.head(), df.tail()])

### 2. Separate the features `X` from the target `y`

In [None]:
# Columns used as model inputs
FEATURE_COLUMNS = ['age', 'sex', 'localization']

# NOTE(review): the preview of the metadata shows `age` stored as float
# (80.0, 75.0), which is how pandas holds a numeric column containing NaN —
# presumably some ages are missing; confirm with df['age'].isna().sum().
# LogisticRegression does not accept NaN, so rows lacking any feature value
# are dropped before the features and target are taken. This is a no-op when
# nothing is missing, and it keeps X and y aligned row for row.
model_rows = df.dropna(subset=FEATURE_COLUMNS)

# Features (X): 'age', 'sex', and 'localization'
X = model_rows[FEATURE_COLUMNS]

# Target variable (y): binary 'diagnosis'
y = model_rows['diagnosis']

In [None]:
# Peek at the first five rows of the feature frame
X.head(n=5)

In [None]:
# Peek at the first five target values
y.head(5)

### 3. Encode the categorical variables from the features data using `get_dummies`.

In [None]:
# One-hot encode the categorical feature columns; drop_first=True omits the
# first level of each so the remaining indicator columns are not redundant.
# Note: X is reassigned here, so the cell expects the un-encoded X as input.
categorical_features = ['sex', 'localization']
X = pd.get_dummies(X, columns=categorical_features, drop_first=True)

In [None]:
# Inspect the first five rows of the one-hot encoded features
X.head(n=5)

### 4. Separate the data into training and testing subsets.

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts


### 5. Scale the data using `StandardScaler`

In [None]:
# Create the scaler and learn its statistics from the training rows only
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the fitted scaler to the training subset, then to the testing subset
X_train_scaled, X_test_scaled = (
    X_scaler.transform(subset) for subset in (X_train, X_test)
)

### 6. Building and Training the Logistic Regression Model

In [None]:
# Build the model as a pipeline: standardise the features, then fit a
# logistic regression. Keeping the scaler inside the pipeline means the
# scaling learned from the training rows is re-applied automatically on every
# later `logreg.predict(...)` call, so callers pass unscaled feature frames
# (X_test, new data) and training and prediction can never disagree on scaling.
# max_iter=1000 is the same iteration cap used by the classifier in the final
# cell of this notebook.
logreg = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

# Fit the scaler and the classifier on the training data
logreg.fit(X_train, y_train)


### 7. Evaluate the model on the testing data.

In [None]:
# Predict a label for every row of the test set
y_pred = logreg.predict(X_test)

# Fraction of test rows whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

# Per-class precision, recall and F1 on the test set
print(classification_report(y_test, y_pred))

### 8. Make predictions on new data.

In [None]:
# `new_data` must have exactly the same columns, in the same order, as the
# feature frame the model was fitted on. No unseen records are loaded in this
# notebook, so a few held-out test rows stand in as an example.
# TODO: replace with real new records preprocessed the same way as X.
new_data = X_test.head()

# Predict on new data
new_data_predictions = logreg.predict(new_data)
new_data_predictions


### 9. Generate the classification report for the test data.

In [None]:
# Build the report comparing the true test labels with the model's predictions,
# then print it
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
# Initiate the classifier model
classifier = LogisticRegression(max_iter=1000)

# Split the one-hot encoded features `X` and binary target `y` built earlier
# in this notebook into training and testing sets (80% / 20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# Learn the scaling from the training rows only, then apply it to both subsets
X_scaler = StandardScaler().fit(X_train)

X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# TODO: persist X_scaler (e.g. with joblib) if the fitted model is to be reused
# outside this notebook.

# Fit, train and test the model
model = classifier.fit(X_train_scaled, y_train)
print('Healthy vs the rest')
print(f"Training Data Score: {classifier.score(X_train_scaled, y_train)}")
print(f"Testing Data Score: {classifier.score(X_test_scaled, y_test)}")

# Make predictions on the test data
y_pred = model.predict(X_test_scaled)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: {:.2f}%'.format(accuracy * 100))

# Print classification report; names are listed in label order (0, then 1)
target_names = ['Healthy', 'Cancer_Pre-cancer']
print(classification_report(y_test, y_pred, target_names=target_names))

# ROC curve from the predicted probability of the positive class (label 1)
y_pred_prob = model.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

fig, ax = plt.subplots()
ax.plot(fpr, tpr, color='darkorange')
# Diagonal reference line: a classifier that guesses at random
ax.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve - Logistic Regression Model')
plt.show()