In [95]:
import pandas as pd


# Read train.csv into a DataFrame; later cells pull individual columns out of it.
df = pd.read_csv(filepath_or_buffer="/data/rsna-breast-cancer-detection/train.csv")

In [96]:
# Pull the four columns used below out of the frame as plain Python lists.
# Indexing with df[...] (rather than df.get(...)) makes a missing column fail
# immediately with a KeyError naming the column, instead of returning None and
# surfacing later as "'NoneType' object is not iterable".
patient_ids = df['patient_id'].tolist()
lateralities = df['laterality'].tolist()
cancers = df['cancer'].tolist()
image_ids = df['image_id'].tolist()


In [97]:
# Group the rows by "{patient_id}_{laterality}": each key maps to the list of
# (image_id, cancer) pairs that share that patient and laterality.
dataset = {}
for pid, side, label, img in zip(patient_ids, lateralities, cancers, image_ids):
    dataset.setdefault(f"{pid}_{side}", []).append((img, label))



In [98]:

# Sanity check: print every {patient_id}_{laterality} key that holds fewer than
# 2 images. No output means every group has at least two.
groups_under_two = [key for key, records in dataset.items() if len(records) < 2]
for key in groups_under_two:
    print(key)

In [99]:
# Smallest and largest number of images per {patient_id}_{laterality} group,
# restricted to laterality L.
#
# Keys are built as f"{patient_id}_{laterality}", so the side is matched on the
# key suffix rather than with a substring test.
# The extremes are read off one sorted list, so no variable has to be named
# after (and thereby hide) the builtins min/max, and no sentinel start value
# such as 100 can distort the result.
left_image_counts = sorted(
    len(records) for key, records in dataset.items() if key.endswith("_L")
)

if left_image_counts:
    print(left_image_counts[0], left_image_counts[-1])
else:
    print("No groups with laterality L found")

2 7


In [100]:
# Find out whether, for a given patient and laterality, the mammographies carry different classes
import numpy as np

# Quick demonstration of how .any() / .all() behave on all-ones and all-zeros label vectors.
print(f"Test {[1, 1, 1, 1]}: all negatives? {not np.array([1, 1, 1, 1]).any()}")
print(f"Test {[0, 0, 0, 0]}: all positives: {np.array([0, 0, 0, 0]).all()}")

found = False
# The loop variables deliberately avoid the names `cancers` and `image_ids`:
# those hold the full column lists that the dataset-building cell zips over,
# and rebinding them here would corrupt a later re-run of that cell.
for group_key, records in dataset.items():
    labels = np.array([record[1] for record in records])
    # A group is mixed when it has at least one positive label (any) but is
    # not entirely positive (not all), i.e. it is neither all zeros nor all ones.
    if labels.any() and not labels.all():
        print(labels)
        found = True
        break
if not found:
    print("Every patient_id has the same class among all mammographies!")
        

Test [1, 1, 1, 1]: all negatives? False
Test [0, 0, 0, 0]: all positives: False
Every patient_id has the same class among all mammographies!


In [106]:
# Count positive and negative labels
import random
random.seed(42)

# Split the {patient_id}_{laterality} keys by label, keeping dataset order.
# The loop variables avoid the name `cancers`, which holds the full label
# column that the dataset-building cell reads.
positives = []
negatives = []
for group_key, records in dataset.items():
    # A group counts as positive when any of its images has a truthy cancer label.
    if any(label for _image_id, label in records):
        positives.append(group_key)
    else:
        negatives.append(group_key)

print(f"Positive patient_laterality: {len(positives)}")
print(f"Negative patient_laterality: {len(negatives)}")

# Calculate how many mammographies there are among the positive groups with
# laterality R: fewest per group, most per group, and the mean per group.
#
# Accumulators use descriptive names instead of max/min/sum so the builtins stay
# usable, and the minimum starts as None rather than a sentinel value.
positive_keys = set(positives)  # set lookup instead of scanning the list for every group
fewest_images = None
most_images = 0
image_total = 0
group_count = 0
for group_key, records in dataset.items():
    # Keys are f"{patient_id}_{laterality}", so the side is the key suffix.
    if group_key not in positive_keys or not group_key.endswith("_R"):
        continue
    n_images = len(records)
    if fewest_images is None or n_images < fewest_images:
        fewest_images = n_images
    if n_images > most_images:
        most_images = n_images
    group_count += 1
    image_total += n_images

if group_count:
    print(fewest_images, most_images, image_total / group_count)
else:
    # Avoid a ZeroDivisionError when nothing matches.
    print("No positive groups with laterality R found")



Positive patient_laterality: 492
Negative patient_laterality: 23334
2 8 2.302596623532436


In [107]:
# Create val dataset based on the same statistical distribution.
# Take 20% of the patient_laterality groups for validation, keeping the
# positive/negative proportions of the full set.
val_ratio = 0.2
total = len(positives) + len(negatives)
if total == 0:
    raise ValueError("No groups available: positives and negatives are both empty")
pos_ratio = len(positives) / total
neg_ratio = len(negatives) / total
val_len = round(total * val_ratio)

pos_len = round(val_len * pos_ratio)
# Take the negatives as the remainder instead of rounding a second time:
# two independent round() calls can make pos_len + neg_len differ from val_len
# (e.g. both products ending in .5 round to even).
neg_len = val_len - pos_len

print(f"Positive validation count: {pos_len}")
print(f"Negative validation count: {neg_len}")

Positive validation count: 98
Negative validation count: 4667
