In [1]:
# Mount Google Drive at /content/drive so that the /content/drive/MyDrive/...
# paths read and written by the later cells resolve.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd 
import numpy as np
import os 
import sys

In [3]:
# Load the array of label names from the saved .npy file and display it.
columns_file = "/content/drive/MyDrive/Classification2D/chest-X-rays/output/columns_14.npy"
classes = np.load(columns_file)
classes

array(['Atelectasis', 'Cardiomegaly', 'Consolidation', 'Edema',
       'Effusion', 'Emphysema', 'Fibrosis', 'Hernia', 'Infiltration',
       'Mass', 'Nodule', 'Pleural_Thickening', 'Pneumonia',
       'Pneumothorax'], dtype='<U18')

In [4]:
# Read the train/val metadata table, using the CSV's first column as the
# DataFrame index, then display it.
train_val_csv = '/content/drive/MyDrive/Classification2D/NIH-Chest-X-ray-Dataset/train_val_data.csv'
data = pd.read_csv(train_val_csv, index_col=0)
data

Unnamed: 0,image_id,class_name,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,...,Effusion,Emphysema,Fibrosis,Hernia,Infiltration,Mass,Nodule,Pleural_Thickening,Pneumonia,Pneumothorax
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
112111,00030798_000.png,No Finding,0,30798,30,M,PA,2500,2048,0.171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112112,00030799_000.png,No Finding,0,30799,32,M,PA,2048,2500,0.171,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112113,00030800_000.png,No Finding,0,30800,34,F,PA,2048,2500,0.168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
112114,00030801_000.png,No Finding,0,30801,39,M,PA,2500,2048,0.168,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Per-class tally of label values: one row per entry in `classes`, one column
# per distinct label value found in that class's column of `data`.
#
# The frame is keyed by class name explicitly rather than relying on the
# `.name` of each value_counts() Series: from pandas 2.0 onward that name is
# "count", which would label every row "count" and lose the class names
# (and they would then be missing from anything saved downstream).
counts = pd.DataFrame({c: data[c].value_counts() for c in classes}).T
counts

Unnamed: 0,0.0,1.0
Atelectasis,100557,11559
Cardiomegaly,109340,2776
Consolidation,107449,4667
Edema,109813,2303
Effusion,98799,13317
Emphysema,109600,2516
Fibrosis,110430,1686
Hernia,111889,227
Infiltration,92222,19894
Mass,106334,5782


In [6]:
# N is the sum of the two count columns for the first class row.
first_class_counts = counts.iloc[0]
N = first_class_counts.iloc[0] + first_class_counts.iloc[1]

# beta sits just below 1; the weight for a count n is
# (1 - beta) / (1 - beta ** n), applied element-wise to every cell of `counts`.
beta = (N - 1) / N
beta_pow_counts = beta ** counts
weights = (1 - beta) / (1 - beta_pow_counts)
weights

Unnamed: 0,0.0,1.0
Atelectasis,1.5e-05,9.1e-05
Cardiomegaly,1.4e-05,0.000365
Consolidation,1.4e-05,0.000219
Edema,1.4e-05,0.000439
Effusion,1.5e-05,8e-05
Emphysema,1.4e-05,0.000402
Fibrosis,1.4e-05,0.000598
Hernia,1.4e-05,0.00441
Infiltration,1.6e-05,5.5e-05
Mass,1.5e-05,0.000177


In [7]:
# Rescale each row of `weights` so that its entries sum to 1.
# keepdims=True leaves the row totals as an (n_rows, 1) column so the division
# broadcasts across the columns of each row.
row_totals = weights.values.sum(axis=1, keepdims=True)
normalized_weight = weights / row_totals
normalized_weight

Unnamed: 0,0.0,1.0
Atelectasis,0.141947,0.858053
Cardiomegaly,0.037779,0.962221
Consolidation,0.062034,0.937966
Edema,0.031531,0.968469
Effusion,0.160517,0.839483
Emphysema,0.034354,0.965646
Fibrosis,0.023268,0.976732
Hernia,0.003193,0.996807
Infiltration,0.224794,0.775206
Mass,0.075823,0.924177


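To balance the loss, we introduce a weighting factor $\alpha_i$ that is inversely proportional to the effective number of samples for class $i$: $\alpha_i \propto 1/E_{n_i}$.
Here $E_{n_i} = (1 - \beta^{n_i}) / (1 - \beta)$, so the unnormalized weight computed above is $(1 - \beta) / (1 - \beta^{n_i})$.
To make the total loss roughly in the same scale when applying $\alpha_i$, we normalize $\alpha_i$ so that $\sum_{i=1}^{C} \alpha_i = C$.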

In [8]:
# Total of the unnormalized weights down each column (one total per column).
weights.sum(axis="index")

0.0    0.000204
1.0    0.008193
dtype: float64

In [9]:
# Scratch check that two hand-copied totals add up to 14, the number of
# entries in `classes`.
# NOTE(review): the operands are presumably the per-column sums of
# `normalized_weight` — confirm, and consider computing them instead of
# hard-coding.
12.98682 + 1.01318

14.0

In [10]:
import torch

# Scratch check of broadcasting: dividing a (14, 2) tensor by a (14, 1) tensor
# yields a (14, 2) result, i.e. each row is divided by its own scalar.
x = torch.rand(14, 2)
y = torch.rand(14, 1)
ratio = x / y
ratio.shape

torch.Size([14, 2])

In [11]:
# Print the shapes of: the full array, its first column as 1-D, and the same
# column with a trailing axis restored by np.newaxis.
weight_array = weights.values
for view in (weight_array, weight_array[:, 0], weight_array[:, 0, np.newaxis]):
    print(view.shape)

(14, 2)
(14,)
(14, 1)


In [12]:
# Divide every row by its first-column entry, so the first column becomes 1
# and the remaining column holds the weight relative to it.
# The [:, :1] slice keeps the divisor two-dimensional so it broadcasts per row.
first_column = normalized_weight.values[:, :1]
normalized_weight = normalized_weight / first_column
normalized_weight

Unnamed: 0,0.0,1.0
Atelectasis,1.0,6.044875
Cardiomegaly,1.0,25.470055
Consolidation,1.0,15.120232
Edema,1.0,30.714838
Effusion,1.0,5.229863
Emphysema,1.0,28.10902
Fibrosis,1.0,41.978216
Hernia,1.0,312.153256
Infiltration,1.0,3.448514
Mass,1.0,12.188555


In [13]:
from pathlib import Path 

# Write the current `normalized_weight` table to Drive as CSV.
weights_csv = "/content/drive/MyDrive/Classification2D/chest-X-rays/output/weights_14.csv"
normalized_weight.to_csv(weights_csv)