In [1]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pydicom
from PIL import Image, ImageEnhance


## CT Abnormality Dataset

In [2]:
df_ab = pd.read_csv('data/nlst_567/nlst_567_ct_ab_20191108.csv')
df_ab.head()

Unnamed: 0,sct_ab_desc,sct_ab_num,sct_epi_loc,sct_long_dia,sct_margins,sct_perp_dia,sct_pre_att,study_yr,sct_slice_num,sct_found_after_comp,pid,dataset_version,scr_group,ct_selected,sct_ab_preexist,sct_ab_attn,sct_ab_gwth,visible_days,sct_ab_has_comparison
0,51,1,1.0,8.0,2.0,5.0,1.0,0,38.0,0.0,100012,2011.02.03/08.20.19,1,1,,,,,0
1,64,2,,,,,,0,,0.0,100012,2011.02.03/08.20.19,1,1,,,,,0
2,51,1,1.0,15.0,1.0,10.0,2.0,1,39.0,0.0,100012,2011.02.03/08.20.19,1,1,2.0,2.0,2.0,16.0,1
3,64,2,,,,,,1,,0.0,100012,2011.02.03/08.20.19,1,1,,,,,0
4,65,1,,,,,,0,,0.0,100147,2011.02.03/08.20.19,1,1,,,,,0


In [3]:
# drop entries w/o corresponding slice number
df_ab = df_ab.dropna(subset=['sct_slice_num'])

# extract only pid, study year and slice number
df_slices = df_ab[['pid', 'study_yr', 'sct_ab_num', 'sct_slice_num']]
print(df_slices.shape)
df_slices.head()

# convert years to match CT image file structure 
year_conversion = {0: 'T0', 1: 'T1', 2: 'T2'}
df_slices['study_yr'] = df_slices.apply(lambda row: year_conversion[row['study_yr']], axis=1)
df_slices.head()

(1706, 4)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # This is added back by InteractiveShellApp.init_path()


Unnamed: 0,pid,study_yr,sct_ab_num,sct_slice_num
0,100012,T0,1,38.0
2,100012,T1,1,39.0
7,100147,T0,4,88.0
8,100147,T1,1,92.0
15,100158,T2,1,57.0


# Treatment dataset

In [4]:
df_treat = pd.read_csv('data/nlst_567/treatment.data.d100517.csv')
df_treat.head()

Unnamed: 0,dataset_version,pid,disease_post_surg,rad_stop_days,treat,treat_days,treat_year,treatnum
0,2011.02.03/10.05.17,100012,0.0,,2,483.0,1,203
1,2011.02.03/10.05.17,100012,0.0,,2,483.0,1,208
2,2011.02.03/10.05.17,100049,,2499.0,1,2471.0,6,101
3,2011.02.03/10.05.17,100055,2.0,,2,148.0,0,201
4,2011.02.03/10.05.17,100055,2.0,,2,148.0,0,208


In [5]:
# keep only patients with no residual disease left after surgery 
df_treat = df_treat[df_treat.disease_post_surg == 0]

# keep only pids 
df_treat = df_treat[['pid', 'treat_year', 'treat_days', 'disease_post_surg']]
df_treat['pid'] = df_treat['pid'].drop_duplicates()
df_treat = df_treat.dropna()
df_treat.head()

Unnamed: 0,pid,treat_year,treat_days,disease_post_surg
0,100012.0,1,483.0,0.0
6,100147.0,1,491.0,0.0
8,100158.0,2,795.0,0.0
11,100242.0,0,79.0,0.0
15,100280.0,1,456.0,0.0


# Person dataset

In [6]:
df_person = pd.read_csv('data/nlst_567/nlst_567_prsn_20191108.csv')

# only keep patients that remained in contact
df_person = df_person[np.logical_or(df_person['contactstatus'] == 1, df_person['contactstatus'] == 2)]

# combine death to lung cancer columns 
df_person['finaldeathlc'] =  df_person['finaldeathlc'].fillna(0)
df_person['dcfdeathlc'] =  df_person['dcfdeathlc'].fillna(0)
df_person['finaldeathlc'] = np.logical_or(df_person['finaldeathlc'], df_person['dcfdeathlc'])

# keep only pid, progression status and death columns
df_person = df_person[['pid', 'prog_days_1st', 'progressed_ever', 'finaldeathlc']]

  interactivity=interactivity, compiler=compiler, result=result)


# Join datasets

In [7]:
df = df_slices.join(df_treat.set_index('pid'), on='pid', how='inner')
df = df.join(df_person.set_index('pid'), on='pid', how='inner')
df = df[np.logical_or(df.progressed_ever == 0, df.progressed_ever == 1)]
df.head()

Unnamed: 0,pid,study_yr,sct_ab_num,sct_slice_num,treat_year,treat_days,disease_post_surg,prog_days_1st,progressed_ever,finaldeathlc
0,100012,T0,1,38.0,1,483.0,0.0,1498.0,1.0,False
2,100012,T1,1,39.0,1,483.0,0.0,1498.0,1.0,False
15,100158,T2,1,57.0,2,795.0,0.0,,0.0,True
17,100242,T0,1,19.0,0,79.0,0.0,2408.0,1.0,False
18,100242,T0,2,37.0,0,79.0,0.0,2408.0,1.0,False


# Clean

In [8]:
# drop those who had no progression but died of lung cancer (likely died in surgery?)
df = df.drop(df[np.logical_and(df.progressed_ever == 0, df.finaldeathlc == 1)].index)

In [9]:
# make recurrence column
df_slices_pos = df[np.logical_and(df.progressed_ever == 1, df.prog_days_1st >  df.treat_days)]
df_slices_neg = df[np.logical_or(df.progressed_ever == 0, df.prog_days_1st <  df.treat_days)]

recurrence = []
for idx in df.index:
    if idx in df_slices_pos.index: 
        recurrence.append(1)
    elif idx in df_slices_neg.index:
        recurrence.append(0)
    else:
        recurrence.append(None)
df['recurrence'] = recurrence

In [10]:
# drop those who had no recurrence but had adjuvant chemo
df_treat_2 = pd.read_csv('data/nlst_567/treatment.data.d100517.csv')
df_chemo = df_treat_2[df_treat_2.treatnum==300]

ChemoDays = {row.pid: row.treat_days for _, row in df_chemo.iterrows()}

for idx, row in df.iterrows():
    if row.pid in ChemoDays:
        if row.recurrence == 0:
            if row.treat_days < ChemoDays[row.pid]:
                df = df.drop(idx)

In [11]:
df_slices_pos = df[df.recurrence == 1]
df_slices_neg = df[df.recurrence == 0]

print(f"Number of 'recurrent' nodules: {len(df_slices_pos)}")
print(f"Number of 'recurrent' patients: {len(df_slices_pos.pid.unique())}\n")

print(f"Number of 'non-recurrent' nodules: {len(df_slices_neg)}")
print(f"Number of 'non-recurrent' patients: {len(df_slices_neg.pid.unique())}")

Number of 'recurrent' nodules: 231
Number of 'recurrent' patients: 98

Number of 'non-recurrent' nodules: 717
Number of 'non-recurrent' patients: 253


In [12]:
df[:50]

Unnamed: 0,pid,study_yr,sct_ab_num,sct_slice_num,treat_year,treat_days,disease_post_surg,prog_days_1st,progressed_ever,finaldeathlc,recurrence
0,100012,T0,1,38.0,1,483.0,0.0,1498.0,1.0,False,1
2,100012,T1,1,39.0,1,483.0,0.0,1498.0,1.0,False,1
17,100242,T0,1,19.0,0,79.0,0.0,2408.0,1.0,False,1
18,100242,T0,2,37.0,0,79.0,0.0,2408.0,1.0,False,1
19,100242,T0,3,62.0,0,79.0,0.0,2408.0,1.0,False,1
21,100280,T1,1,75.0,1,456.0,0.0,,0.0,False,0
26,100570,T0,1,76.0,0,69.0,0.0,,0.0,False,0
27,100570,T0,2,119.0,0,69.0,0.0,,0.0,False,0
29,100658,T0,1,66.0,2,1064.0,0.0,,0.0,False,0
30,100658,T0,2,60.0,2,1064.0,0.0,,0.0,False,0


In [13]:
df.to_csv('data/nlst_table.csv', index=None)