In [39]:
import shutil
from PIL import Image
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

In [40]:
# Load the ISIC 2019 training metadata; the path is relative to the
# directory the notebook is run from.
metadata_csv = Path('..') / 'data' / 'archive' / 'ISIC_2019_Training_Metadata.csv'
df = pd.read_csv(metadata_csv)

In [41]:
# Discard rows in which age, anatomical site and sex are all missing at once,
# then remove the lesion_id column.
df = (
    df
    .dropna(axis=0, how='all', subset=['age_approx', 'anatom_site_general', 'sex'])
    .drop(columns=['lesion_id'])
)

In [42]:
# Label missing sex values with the placeholder category "Delete".
# The result is assigned back to the column on purpose: an inplace method
# called on `df.sex` acts on an intermediate Series, which pandas warns about
# (chained assignment) and which leaves `df` unchanged under Copy-on-Write.
df['sex'] = df['sex'].fillna("Delete")

In [43]:
# Label missing anatomical sites with the placeholder category "Delete1".
# The result is assigned back to the column on purpose: an inplace method
# called on `df.anatom_site_general` acts on an intermediate Series, which
# pandas warns about (chained assignment) and which leaves `df` unchanged
# under Copy-on-Write.
df['anatom_site_general'] = df['anatom_site_general'].fillna("Delete1")

In [44]:
# Replace missing ages with the mean of the observed ages.
# fit_transform learns the mean and applies it in a single call; `imputer`
# stays bound to the fitted object.
imputer = SimpleImputer(strategy="mean")
df['age_approx'] = imputer.fit_transform(df[['age_approx']])

In [45]:
# One-hot encode the sex column.
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and the old name was later removed, so neither spelling
# works on every version.
ohe = OneHotEncoder(handle_unknown='ignore')
sex_encoded = ohe.fit_transform(df[['sex']]).toarray()
# Add one indicator column per category, named after the category value.
df[ohe.categories_[0]] = sex_encoded

In [47]:
# One-hot encode the anatomical-site column.
# The encoder keeps its default sparse output and the result is densified with
# .toarray(): the `sparse=` keyword was renamed to `sparse_output=` in
# scikit-learn 1.2 and the old name was later removed, so neither spelling
# works on every version.
ohe2 = OneHotEncoder(handle_unknown='ignore')
anatom_site_general_encoded = ohe2.fit_transform(df[['anatom_site_general']]).toarray()
# Add one indicator column per category, named after the category value.
df[ohe2.categories_[0]] = anatom_site_general_encoded

In [48]:
# Remove the raw categorical columns now that their indicator columns exist,
# plus the indicator columns of the "Delete"/"Delete1" placeholders.
# The placeholder columns are only created when the data actually had missing
# values, so they are dropped leniently; the source columns must be present.
df = (
    df
    .drop(columns=['anatom_site_general', 'sex'])
    .drop(columns=['Delete', 'Delete1'], errors='ignore')
)
df

Unnamed: 0,image,age_approx,female,male,anterior torso,head/neck,lateral torso,lower extremity,oral/genital,palms/soles,posterior torso,upper extremity
0,ISIC_0000000,55.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,30.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,60.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,ISIC_0000003,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,ISIC_0000004,80.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,85.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25327,ISIC_0073248,65.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25328,ISIC_0073249,70.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25329,ISIC_0073251,55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [31]:
# Load the ground-truth table; the path is relative to the directory the
# notebook is run from.
ground_truth_csv = Path('..') / 'data' / 'archive' / 'ISIC_2019_Training_GroundTruth.csv'
y_df = pd.read_csv(ground_truth_csv)
y_df

Unnamed: 0,image,MEL,NV,BCC,AK,BKL,DF,VASC,SCC,UNK
0,ISIC_0000000,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ISIC_0000001,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ISIC_0000002,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ISIC_0000003,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ISIC_0000004,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
25326,ISIC_0073247,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
25327,ISIC_0073248,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25328,ISIC_0073249,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25329,ISIC_0073251,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# Single-column frame listing the diagnosis class names, in the same order as
# the indicator columns of the ground-truth table.
target = pd.Series(
    ["MEL", "NV", "BCC", "AK", "BKL", "DF", "VASC", "SCC", "UNK"],
    name="target",
).to_frame()
target

Unnamed: 0,target
0,MEL
1,NV
2,BCC
3,AK
4,BKL
5,DF
6,VASC
7,SCC
8,UNK


In [33]:
# Index by image id, then collapse every row to the name of the column that
# holds its largest value; the result is a Series keyed by image.
y_df = y_df.set_index('image').idxmax(axis=1)


In [34]:
# Move the image index back into an ordinary column (kept, not discarded).
y_df = y_df.reset_index(drop=False)

In [35]:
# Name the two columns: the image id and its class label.
y_df = y_df.set_axis(['image', 'target'], axis=1)

In [36]:
# Display the reshaped labels: one row per image with its target class.
y_df

Unnamed: 0,image,target
0,ISIC_0000000,NV
1,ISIC_0000001,NV
2,ISIC_0000002,MEL
3,ISIC_0000003,NV
4,ISIC_0000004,MEL
...,...,...
25326,ISIC_0073247,BCC
25327,ISIC_0073248,BKL
25328,ISIC_0073249,MEL
25329,ISIC_0073251,NV


In [37]:
# Attach the class label to every metadata row, matching on the image id.
# A left join keeps all metadata rows; images absent from y_df get NaN.
# NOTE(review): the recorded execution counts show this cell ran (In [37])
# before df was rebuilt (In [40]-[48]), and the final set_index output has no
# target column. Re-run the notebook top to bottom to confirm the result.
df = pd.merge(df, y_df, how='left', on='image')
df

Unnamed: 0,image,age_approx,female,male,anterior torso,head/neck,lateral torso,lower extremity,oral/genital,palms/soles,posterior torso,upper extremity,target
0,ISIC_0000000,55.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NV
1,ISIC_0000001,30.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,NV
2,ISIC_0000002,60.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,MEL
3,ISIC_0000003,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,NV
4,ISIC_0000004,80.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,MEL
...,...,...,...,...,...,...,...,...,...,...,...,...,...
25111,ISIC_0073247,85.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,BCC
25112,ISIC_0073248,65.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,BKL
25113,ISIC_0073249,70.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,MEL
25114,ISIC_0073251,55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,NV


In [49]:
# Use the image id as the row index.
df = df.set_index('image')
df

Unnamed: 0_level_0,age_approx,female,male,anterior torso,head/neck,lateral torso,lower extremity,oral/genital,palms/soles,posterior torso,upper extremity
image,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
ISIC_0000000,55.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000001,30.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISIC_0000002,60.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ISIC_0000003,30.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
ISIC_0000004,80.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
ISIC_0073247,85.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
ISIC_0073248,65.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ISIC_0073249,70.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
ISIC_0073251,55.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
