In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Step 1: Clean data and analyze dataset

The disorders in this group are psoriasis (1), seborrheic dermatitis (2), lichen planus (3), pityriasis rosea (4), chronic dermatitis (5), and pityriasis rubra pilaris (6).

Usually, a biopsy is necessary for the diagnosis, but unfortunately, these diseases share many histopathological features as well (columns 12-33).

Link to more information on the data: https://www.kaggle.com/datasets/olcaybolat1/dermatology-dataset-classification/data

In [4]:
df = pd.read_csv("/kaggle/input/dermatology-dataset-classification/dermatology_database_1.csv")
df

Unnamed: 0,erythema,scaling,definite_borders,itching,koebner_phenomenon,polygonal_papules,follicular_papules,oral_mucosal_involvement,knee_and_elbow_involvement,scalp_involvement,...,disappearance_granular_layer,vacuolisation_damage_basal_layer,spongiosis,saw_tooth_appearance_retes,follicular_horn_plug,perifollicular_parakeratosis,inflammatory_mononuclear_infiltrate,band_like_infiltrate,age,class
0,2,2,0,3,0,0,0,0,1,0,...,0,0,3,0,0,0,1,0,55,2
1,3,3,3,2,1,0,0,0,1,1,...,0,0,0,0,0,0,1,0,8,1
2,2,1,2,3,1,3,0,3,0,0,...,0,2,3,2,0,0,2,3,26,3
3,2,2,2,0,0,0,0,0,3,2,...,3,0,0,0,0,0,3,0,40,1
4,2,3,2,2,2,2,0,2,0,0,...,2,3,2,3,0,0,2,3,45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
361,2,1,1,0,1,0,0,0,0,0,...,0,0,1,0,0,0,2,0,25,4
362,3,2,1,0,1,0,0,0,0,0,...,1,0,1,0,0,0,2,0,36,4
363,3,2,2,2,3,2,0,2,0,0,...,0,3,0,3,0,0,2,3,28,3
364,2,1,3,1,2,3,0,2,0,0,...,0,2,0,1,0,0,2,3,50,3


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 366 entries, 0 to 365
Data columns (total 35 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   erythema                             366 non-null    int64 
 1   scaling                              366 non-null    int64 
 2   definite_borders                     366 non-null    int64 
 3   itching                              366 non-null    int64 
 4   koebner_phenomenon                   366 non-null    int64 
 5   polygonal_papules                    366 non-null    int64 
 6   follicular_papules                   366 non-null    int64 
 7   oral_mucosal_involvement             366 non-null    int64 
 8   knee_and_elbow_involvement           366 non-null    int64 
 9   scalp_involvement                    366 non-null    int64 
 10  family_history                       366 non-null    int64 
 11  melanin_incontinence                 366 non-

In [6]:
# Invalid ages were marked with a "?" rather than a null value
invalid_ages = df.loc[df["age"] == "?"].index
invalid_ages

Index([33, 34, 35, 36, 262, 263, 264, 265], dtype='int64')

In [7]:
# Since there are only 8 unknown ages, we'll just remove the columns
df.drop(index=invalid_ages, inplace=True)
df.shape

(358, 35)

In [8]:
df.reset_index()

Unnamed: 0,index,erythema,scaling,definite_borders,itching,koebner_phenomenon,polygonal_papules,follicular_papules,oral_mucosal_involvement,knee_and_elbow_involvement,...,disappearance_granular_layer,vacuolisation_damage_basal_layer,spongiosis,saw_tooth_appearance_retes,follicular_horn_plug,perifollicular_parakeratosis,inflammatory_mononuclear_infiltrate,band_like_infiltrate,age,class
0,0,2,2,0,3,0,0,0,0,1,...,0,0,3,0,0,0,1,0,55,2
1,1,3,3,3,2,1,0,0,0,1,...,0,0,0,0,0,0,1,0,8,1
2,2,2,1,2,3,1,3,0,3,0,...,0,2,3,2,0,0,2,3,26,3
3,3,2,2,2,0,0,0,0,0,3,...,3,0,0,0,0,0,3,0,40,1
4,4,2,3,2,2,2,2,0,2,0,...,2,3,2,3,0,0,2,3,45,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
353,361,2,1,1,0,1,0,0,0,0,...,0,0,1,0,0,0,2,0,25,4
354,362,3,2,1,0,1,0,0,0,0,...,1,0,1,0,0,0,2,0,36,4
355,363,3,2,2,2,3,2,0,2,0,...,0,3,0,3,0,0,2,3,28,3
356,364,2,1,3,1,2,3,0,2,0,...,0,2,0,1,0,0,2,3,50,3


# Part 2: EDA

In [9]:
df["age"].dtypes

dtype('O')

In [10]:
df["age"] = df["age"].astype("int64")

In [11]:
df["age"]

0      55
1       8
2      26
3      40
4      45
       ..
361    25
362    36
363    28
364    50
365    35
Name: age, Length: 358, dtype: int64

In [12]:
df["age"].max()

75

In [13]:
'''Ignore this part for now
Ill put in some graphs later'''
plt.figure(figsize = (10, 6))


<Figure size 1000x600 with 0 Axes>

<Figure size 1000x600 with 0 Axes>

# Part 3: Separating data, train_test_split

In [17]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [15]:
y = df["class"]
X = df.drop(columns="class")
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=1)

In [21]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
score = accuracy_score(y_test, y_pred)
score

0.9583333333333334

Final accuracy with decision tree: 95.83%