In [9]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

In [10]:
# Read the training label table; the first CSV row supplies the column names.
train_label_csv = 'dataset/train_label.csv'
df_train = pd.read_csv(train_label_csv, header=0)
df_train.head()

Unnamed: 0,FileName,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia
0,00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0
1,00000001_001.png,0,1,0,0,0,0,0,0,0,0,1,0,0,0
2,00000001_002.png,0,1,1,0,0,0,0,0,0,0,0,0,0,0
3,00000002_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,00000004_000.png,0,0,0,0,1,1,0,0,0,0,0,0,0,0


In [11]:
# Keep only the label columns. FileName is excluded by name rather than by
# position, and the "Full"/"Amount" summary columns that a later cell adds to
# df_train are excluded too, so re-running this cell after that one still
# yields just the disease labels.
non_label_columns = ("FileName", "Full", "Amount")
disease_names = [col for col in df_train.columns if col not in non_label_columns]

In [12]:
# Show the label column names taken from df_train as a quick sanity check.
disease_names

['Atelectasis',
 'Cardiomegaly',
 'Effusion',
 'Infiltration',
 'Mass',
 'Nodule',
 'Pneumonia',
 'Pneumothorax',
 'Consolidation',
 'Edema',
 'Emphysema',
 'Fibrosis',
 'Pleural_Thickening',
 'Hernia']

In [13]:
# Table dimensions: rows are counted as images; the disease count comes from
# the label list rather than from the column count.
n_images = df_train.shape[0]
n_cols = df_train.shape[1]
print("Amount of images: ", n_images)
print("Amount of diseases: ", len(disease_names))

Amount of images:  75714
Amount of diseases:  14


## Observations per disease

### Including rows with more than one disease

In [14]:
# Per-disease totals: add up each label column, then list the largest first.
# A row with several labels contributes to each of its diseases.
per_disease_totals = df_train[disease_names].sum()
per_disease_totals.sort_values(ascending=False)

Infiltration          12027
Effusion               7637
Atelectasis            7306
Nodule                 4131
Mass                   3579
Consolidation          2505
Pneumothorax           2355
Pleural_Thickening     1987
Cardiomegaly           1490
Emphysema              1306
Edema                  1218
Fibrosis               1107
Pneumonia               762
Hernia                  126
dtype: int64

### Consider multiple diseases

In [15]:
def merge_diseases_names(row, names=None):
    """Summarise the disease flags of one row into "Full" and "Amount".

    Parameters
    ----------
    row : pandas.Series or other mapping
        One row of the label table, indexable by disease column name and
        supporting item assignment.
    names : iterable of str, optional
        Disease columns to inspect, in the order they should appear in the
        merged label. When omitted, the notebook-level ``disease_names`` list
        is used, so ``df.apply(merge_diseases_names, axis=1)`` keeps working.

    Returns
    -------
    The same ``row`` object, modified in place:
    ``row["Full"]`` holds the names of all truthy disease columns joined with
    "|" (or "Nothing" when none is set) and ``row["Amount"]`` holds how many
    columns were set.
    """
    if names is None:
        # Fall back to the notebook global for backward compatibility.
        names = disease_names

    # Collect the diseases flagged on this row, preserving column order.
    present = [name for name in names if row[name]]

    row["Full"] = "|".join(present) if present else "Nothing"
    row["Amount"] = len(present)

    return row

# Seed placeholder "Full"/"Amount" columns, then let merge_diseases_names
# fill them in for every row (axis=1 passes one row at a time).
df_train = (
    df_train
    .assign(Full="", Amount=-1)
    .apply(merge_diseases_names, axis=1)
)
df_train.head()

Unnamed: 0,FileName,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia,Full,Amount
0,00000001_000.png,0,1,0,0,0,0,0,0,0,0,0,0,0,0,Cardiomegaly,1
1,00000001_001.png,0,1,0,0,0,0,0,0,0,0,1,0,0,0,Cardiomegaly|Emphysema,2
2,00000001_002.png,0,1,1,0,0,0,0,0,0,0,0,0,0,0,Cardiomegaly|Effusion,2
3,00000002_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,0,Nothing,0
4,00000004_000.png,0,0,0,0,1,1,0,0,0,0,0,0,0,0,Mass|Nodule,2


---

See also Figure 2 in the paper, which shows the proportions for each disease.

In [16]:
# Number of rows per combined label ("Full"), most frequent first
full_label = df_train["Full"]
counts = full_label.value_counts()
counts

Nothing                                                             44129
Infiltration                                                         6328
Atelectasis                                                          3002
Effusion                                                             2437
Nodule                                                               1957
Mass                                                                 1506
Pneumothorax                                                         1096
Effusion|Infiltration                                                 864
Atelectasis|Infiltration                                              838
Atelectasis|Effusion                                                  717
Pleural_Thickening                                                    717
Consolidation                                                         713
Cardiomegaly                                                          668
Infiltration|Nodule                   

In [17]:
# Only single diseases: rows whose combined label is exactly one disease name.
# reindex (rather than .loc) reports 0 for a disease that never occurs on its
# own instead of raising KeyError for the missing label.
counts.reindex(disease_names, fill_value=0).sort_values(ascending=False)

Infiltration          6328
Atelectasis           3002
Effusion              2437
Nodule                1957
Mass                  1506
Pneumothorax          1096
Pleural_Thickening     717
Consolidation          713
Cardiomegaly           668
Emphysema              526
Fibrosis               485
Edema                  347
Pneumonia              204
Hernia                  55
Name: Full, dtype: int64

In [21]:
# Split the rows into those labelled "Nothing" and those with at least one
# disease; n_images is the row count computed in an earlier cell.
amount_nothing = counts["Nothing"]
amount_something = n_images - amount_nothing

print("Normal images: ", amount_nothing)
print("Images with disease: ", amount_something)

Normal images:  44129
Images with disease:  31585
