In [64]:
import pandas as pd

In [65]:
df = pd.read_csv('~/ufes/datasets/ISIC2024/train-metadata.csv')

  df = pd.read_csv('~/ufes/datasets/ISIC2024/train-metadata.csv')


In [66]:
non_features = ['isic_id', 'target', 'patient_id',
    'image_type', 'tbp_tile_type', 'attribution', 'copyright_license', 
    'lesion_id', 'iddx_full', 'iddx_1', 'iddx_2', 'iddx_3', 
    'iddx_4', 'iddx_5', 'mel_mitotic_index', 'mel_thick_mm']

In [67]:
features = list(set(df.columns) - set(non_features))
print(f'Number of features: {len(features)}')

Number of features: 39


In [68]:
df['target'] = df['target'].astype(int)
df['diagnosis'] = df['target'].map({0:'benign', 1:'malignant'}) 
df['diagnosis'] = df['diagnosis'].astype('category')
df['diagnosis'].value_counts()

diagnosis
benign       400666
malignant       393
Name: count, dtype: int64

## Missing data analysis

### Columns with missing or unknown values

In [69]:
is_na = df.isna() | df.isin(['Unknown'])
count_nan = is_na.sum()
features_with_nan_values = list(count_nan[features][count_nan > 0].index)
count_nan[features_with_nan_values]

age_approx                 2798
tbp_lv_location_simple     5756
sex                       11517
tbp_lv_location            5756
anatom_site_general        5756
dtype: int64

### Percentage of missing or unknown values

In [70]:
count_nan[features_with_nan_values] / len(df) * 100

age_approx                0.697653
tbp_lv_location_simple    1.435200
sex                       2.871647
tbp_lv_location           1.435200
anatom_site_general       1.435200
dtype: float64

### Columns with values equals to zero

In [71]:
is_zero = df.isin([0])
count_zero = is_zero.sum()
features_with_zeroed_values = list(count_zero[features][count_zero > 0].index)
count_zero[features_with_zeroed_values]

tbp_lv_nevi_confidence             6
tbp_lv_radial_color_std_max    29733
tbp_lv_symm_2axis_angle        18562
tbp_lv_color_std_mean          26187
tbp_lv_norm_color              26182
dtype: int64

### Percentage of values equals to zero

In [72]:
count_zero[features_with_zeroed_values] / len(df) * 100

tbp_lv_nevi_confidence         0.001496
tbp_lv_radial_color_std_max    7.413622
tbp_lv_symm_2axis_angle        4.628247
tbp_lv_color_std_mean          6.529463
tbp_lv_norm_color              6.528217
dtype: float64

## Per-class analysis

### Percentage of missing values per class

In [73]:
is_na_per_diagnosis = df[features_with_nan_values + ['diagnosis']].groupby(['diagnosis']).apply(lambda x : x.isna(), include_groups=False)
for diagnosis in df['diagnosis'].unique():
    print('-' * 50)
    print(f'Percentage of missing values for {diagnosis.upper()} class:')
    print(is_na_per_diagnosis.loc[diagnosis].sum() / df['diagnosis'].value_counts()[diagnosis] * 100,'\n')

  is_na_per_diagnosis = df[features_with_nan_values + ['diagnosis']].groupby(['diagnosis']).apply(lambda x : x.isna(), include_groups=False)


--------------------------------------------------
Percentage of missing values for BENIGN class:
age_approx                0.697589
tbp_lv_location_simple    0.000000
sex                       2.871968
tbp_lv_location           0.000000
anatom_site_general       1.436608
dtype: float64 

--------------------------------------------------
Percentage of missing values for MALIGNANT class:
age_approx                0.763359
tbp_lv_location_simple    0.000000
sex                       2.544529
tbp_lv_location           0.000000
anatom_site_general       0.000000
dtype: float64 



### Percentage of values equals to zero per class

In [74]:
is_na_per_diagnosis = df[features_with_zeroed_values + ['diagnosis']].groupby(['diagnosis']).apply(lambda x : x.isin([0]), include_groups=False)
for diagnosis in df['diagnosis'].unique():
    print('-' * 50)
    print(f'Percentage of missing values for {diagnosis.upper()} class:')
    print(is_na_per_diagnosis.loc[diagnosis].sum() / df['diagnosis'].value_counts()[diagnosis] * 100,'\n')

  is_na_per_diagnosis = df[features_with_zeroed_values + ['diagnosis']].groupby(['diagnosis']).apply(lambda x : x.isin([0]), include_groups=False)


--------------------------------------------------
Percentage of missing values for BENIGN class:
tbp_lv_nevi_confidence         0.001498
tbp_lv_radial_color_std_max    7.398182
tbp_lv_symm_2axis_angle        4.628044
tbp_lv_color_std_mean          6.513405
tbp_lv_norm_color              6.512157
dtype: float64 

--------------------------------------------------
Percentage of missing values for MALIGNANT class:
tbp_lv_nevi_confidence          0.000000
tbp_lv_radial_color_std_max    23.155216
tbp_lv_symm_2axis_angle         4.834606
tbp_lv_color_std_mean          22.900763
tbp_lv_norm_color              22.900763
dtype: float64 

