#1. Load the dataset

In [1]:
import pandas as pd
import numpy as np

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
filepath = '/content/drive/MyDrive/Resources/buX/2022 - Semester 8 - Summer/CSE422/Labs/Lab Assignment 07/Melanoma TFRecords 256x256.csv'
melanoma = pd.read_csv(filepath)
melanoma.head(3)

Unnamed: 0,image_name,patient_id,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target,tfrecord,width,height,patient_code
0,ISIC_2637011,IP_7279968,male,45.0,head/neck,unknown,benign,0,0,6000,4000,0
1,ISIC_0015719,IP_3075186,female,45.0,upper extremity,unknown,benign,0,0,6000,4000,1
2,ISIC_0052212,IP_2842074,female,50.0,lower extremity,nevus,benign,0,6,1872,1053,2


In [4]:
melanoma = melanoma.drop(labels=['image_name', 'patient_id', 'patient_code', 'tfrecord', 'width', 'height'], axis=1)

In [5]:
melanoma.shape

(33126, 6)

#2. Handle missing values if needed

In [6]:
melanoma.isnull().sum()

sex                               65
age_approx                        68
anatom_site_general_challenge    527
diagnosis                          0
benign_malignant                   0
target                             0
dtype: int64

In [7]:
melanoma = melanoma.dropna(subset=['anatom_site_general_challenge', 'sex', 'age_approx'], axis='index')
melanoma.isnull().sum()

sex                              0
age_approx                       0
anatom_site_general_challenge    0
diagnosis                        0
benign_malignant                 0
target                           0
dtype: int64

In [8]:
melanoma.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 32531 entries, 0 to 33125
Data columns (total 6 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   sex                            32531 non-null  object 
 1   age_approx                     32531 non-null  float64
 2   anatom_site_general_challenge  32531 non-null  object 
 3   diagnosis                      32531 non-null  object 
 4   benign_malignant               32531 non-null  object 
 5   target                         32531 non-null  int64  
dtypes: float64(1), int64(1), object(4)
memory usage: 1.7+ MB


#3. Encode categorical features if needed

In [9]:
labels = ['sex', 'anatom_site_general_challenge', 'diagnosis', 'benign_malignant']

In [10]:
for label in labels:
    print(label, melanoma[label].unique())

sex ['male' 'female']
anatom_site_general_challenge ['head/neck' 'upper extremity' 'lower extremity' 'torso' 'palms/soles'
 'oral/genital']
diagnosis ['unknown' 'nevus' 'melanoma' 'seborrheic keratosis' 'lentigo NOS'
 'lichenoid keratosis' 'solar lentigo' 'cafe-au-lait macule'
 'atypical melanocytic proliferation']
benign_malignant ['benign' 'malignant']


In [12]:
from sklearn.preprocessing import LabelEncoder

enc = LabelEncoder()

for label in labels:
    melanoma[label] = enc.fit_transform(melanoma[label])

In [13]:
melanoma.head()

Unnamed: 0,sex,age_approx,anatom_site_general_challenge,diagnosis,benign_malignant,target
0,1,45.0,0,8,0,0
1,0,45.0,5,8,0,0
2,0,50.0,1,5,0,0
3,0,45.0,0,8,0,0
4,0,55.0,5,8,0,0


#4. Scale the values if needed.

In [14]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

scaler.fit(melanoma)

MinMaxScaler()

In [15]:
melanoma_scaled = scaler.transform(melanoma)

In [16]:
print("per-feature minimum after scaling:\n {}".format(
    melanoma_scaled.min(axis=0)))
print("per-feature maximum after scaling:\n {}".format(
    melanoma_scaled.max(axis=0)))

per-feature minimum after scaling:
 [0. 0. 0. 0. 0. 0.]
per-feature maximum after scaling:
 [1. 1. 1. 1. 1. 1.]


#5. Split the dataset into features and labels. Determine which column indicates the labels.

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(melanoma_scaled, melanoma[['target']],
                                                    random_state=1, train_size=0.8)
print(X_train.shape)
print(X_test.shape)

(26024, 6)
(6507, 6)


#6. After performing the necessary pre-processing, perform classification using a Decision Tree Classifier.

In [18]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
clf = DecisionTreeClassifier(criterion='entropy',random_state=1)
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
score=accuracy_score(y_pred,y_test)
print(score)

1.0


#7. Compare the accuracy with a logistic regression model and plot them using a bar chart.

In [None]:
from sklearn import tree
import matplotlib.pyplot as plt
fig, axes = plt.subplots(nrows = 1,ncols = 1,figsize = (4,4), dpi=300)
tree.plot_tree(clf,
               feature_names = melanoma.columns, 
               class_names=['1','2','3'],
               filled = True);