In [1]:
import os
from os.path import dirname, join

import gcsfs
import numpy as np
import pandas as pd

In [4]:
# Google Cloud Storage filesystem handle, bound to the 'profile-notes' project.
fs = gcsfs.GCSFileSystem(project='profile-notes')

# Path prefix that every input CSV below is joined onto before being opened
# through `fs` (presumably a GCS bucket name -- TODO confirm).
input_dir = 'radiology-impressions-derived-data'

In [10]:
# Read the table of training-split MRNs from cloud storage.
filename = join(input_dir, 'training_mrns.csv')
with fs.open(filename) as handle:
    training_mrns = pd.read_csv(handle)


In [13]:
# Read the validation and held-out test MRN tables the same way as the
# training one: open the CSV through the GCS handle and parse it with pandas.
mrn_tables = {}
for split_name, csv_name in (('validation', 'validation_mrns.csv'),
                             ('test', 'truetest_mrns.csv')):
    filename = join(input_dir, csv_name)
    with fs.open(filename) as handle:
        mrn_tables[split_name] = pd.read_csv(handle)

validation_mrns = mrn_tables['validation']
test_mrns = mrn_tables['test']

In [5]:
# Read the manual-label analysis table (one row per imaging report, as shown
# by the preview in the next cell) from cloud storage.
filename = join(input_dir, 'manual_label_analysis.csv')
with fs.open(filename) as handle:
    label_analysis = pd.read_csv(handle)

In [6]:
# Preview the first rows of the freshly loaded table as a sanity check.
label_analysis.head()

Unnamed: 0.1,Unnamed: 0,DFCI_MRN,ehr_scan_date,PROC_DESCR,NARR_TXT,IMPRESS_TXT,os_date,died,scan_to_os_date,survived_6_months,...,adrenal,bone,brain,liver,lung,nodes,peritoneum,pleura,response,progression
0,0,375195,2014-10-30,ct chest with contrast,exam number: a13738082 ...,none,2017-05-30,N,943.0,1,...,0,0,0,0,1,0,0,0,0,1
1,1,375195,2013-11-07,ct chest with contrast,exam number: a12449475 ...,none,2017-05-30,N,1300.0,1,...,0,0,0,0,1,0,0,0,0,1
2,2,375195,2012-11-08,ct chest with contrast,exam number: a12348771 ...,none,2017-05-30,N,1664.0,1,...,0,0,0,0,1,0,0,0,0,0
3,3,375195,2010-04-15,ct chest with contrast,exam number: a10234376 ...,none,2017-05-30,N,2602.0,1,...,0,0,0,0,1,0,0,0,0,1
4,4,375195,2011-11-10,ct chest with contrast,exam number: a11272310 ...,none,2017-05-30,N,2028.0,1,...,0,0,0,0,1,0,0,0,0,0


In [7]:
# Column dtypes and non-null counts of the raw table, before any cleaning.
label_analysis.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17958 entries, 0 to 17957
Data columns (total 24 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         17958 non-null  int64  
 1   DFCI_MRN           17958 non-null  int64  
 2   ehr_scan_date      17958 non-null  object 
 3   PROC_DESCR         17958 non-null  object 
 4   NARR_TXT           17958 non-null  object 
 5   IMPRESS_TXT        17958 non-null  object 
 6   os_date            17710 non-null  object 
 7   died               17958 non-null  object 
 8   scan_to_os_date    17710 non-null  float64
 9   survived_6_months  17958 non-null  int64  
 10  scan_type          17958 non-null  float64
 11  auto_type          17958 non-null  float64
 12  any_cancer         17958 non-null  int64  
 13  redcap_resp_prog   11650 non-null  float64
 14  adrenal            17958 non-null  int64  
 15  bone               17958 non-null  int64  
 16  brain              179

In [8]:
# Build one free-text field per report: narrative text followed by the
# impression text, separated by a single space.
combined_text = label_analysis.NARR_TXT + ' ' + label_analysis.IMPRESS_TXT
label_analysis = label_analysis.assign(imaging_text=combined_text)

# Replace CR/LF line breaks with spaces. `regex=True` is passed explicitly
# because the default of Series.str.replace changed from True to False in
# pandas 2.0; without it the pattern would be matched as the literal
# characters backslash-r-backslash-n and the line breaks would be left in.
label_analysis['imaging_text'] = label_analysis.imaging_text.str.replace(
    r'\r\n', ' ', regex=True)

# Keep only the first report for each distinct text.
label_analysis = label_analysis.drop_duplicates(subset='imaging_text')

# Drop reports whose text contains the phrase "it has been imported".
# na=True makes a missing text count as a match, so such rows are dropped,
# exactly as the previous `== False` comparison did.
is_imported = label_analysis['imaging_text'].str.contains(
    "it has been imported", regex=False, na=True)
label_analysis = label_analysis[~is_imported]

# info() writes its report to stdout and returns None, so it is called
# directly instead of being wrapped in print() (which also printed "None").
label_analysis.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14230 entries, 0 to 17957
Data columns (total 25 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         14230 non-null  int64  
 1   DFCI_MRN           14230 non-null  int64  
 2   ehr_scan_date      14230 non-null  object 
 3   PROC_DESCR         14230 non-null  object 
 4   NARR_TXT           14230 non-null  object 
 5   IMPRESS_TXT        14230 non-null  object 
 6   os_date            14056 non-null  object 
 7   died               14230 non-null  object 
 8   scan_to_os_date    14056 non-null  float64
 9   survived_6_months  14230 non-null  int64  
 10  scan_type          14230 non-null  float64
 11  auto_type          14230 non-null  float64
 12  any_cancer         14230 non-null  int64  
 13  redcap_resp_prog   8773 non-null   float64
 14  adrenal            14230 non-null  int64  
 15  bone               14230 non-null  int64  
 16  brain              142

In [14]:
# Partition the cleaned reports into training / validation / test frames
# according to which MRN table each report's DFCI_MRN appears in.
report_mrn = label_analysis['DFCI_MRN']
training_data = label_analysis.loc[report_mrn.isin(training_mrns['DFCI_MRN'])]
validation_data = label_analysis.loc[report_mrn.isin(validation_mrns['DFCI_MRN'])]
test_data = label_analysis.loc[report_mrn.isin(test_mrns['DFCI_MRN'])]

# Report the (rows, columns) shape of each split.
for split_frame in (training_data, validation_data, test_data):
    print(split_frame.shape)

# 20 geometrically spaced cohort sizes, from 10 up to the number of rows in
# the training MRN table, truncated to whole numbers.
number_patients = list(map(int, np.geomspace(10, len(training_mrns), 20)))

(11182, 25)
(1545, 25)
(1503, 25)


In [16]:
# Write a series of progressively smaller training sets, one CSV per cohort
# size in `number_patients` (largest first). File 0 uses every training MRN;
# each later file is drawn from the MRNs kept for the previous one, so the
# cohorts are nested.

# to_csv does not create missing directories, so make sure the output folder
# exists before the first write.
os.makedirs('processed', exist_ok=True)

samples = training_mrns.copy()

patients = []
for i, n in enumerate(number_patients[::-1]):
    if i > 0:
        # Sample from the previous (larger) selection, not from the full
        # table, so that each cohort is a subset of the one before it.
        samples = samples[['DFCI_MRN']].sample(n=n, random_state=2008)
    training_data_subsampled = training_data[training_data['DFCI_MRN'].isin(samples.DFCI_MRN)].copy()
    filename = 'training_mrns_{}.csv'.format(i)
    training_data_subsampled.to_csv(join('processed', filename))

    # Distinct patients that actually have reports in this subset; computed
    # once and reused for both the running list and the printed count.
    selected_patients = training_data_subsampled.DFCI_MRN.unique()
    patients.append(selected_patients)
    number_of_selected_reports = training_data_subsampled.shape[0]
    number_of_selected_patients = len(selected_patients)

    print('filename {} # patients {} , # of reports {} '.format(filename, number_of_selected_patients,
                                                                number_of_selected_reports))

filename training_mrns_0.csv # patients 884 , # of reports 11182 
filename training_mrns_1.csv # patients 592 , # of reports 7382 
filename training_mrns_2.csv # patients 425 , # of reports 5564 
filename training_mrns_3.csv # patients 296 , # of reports 4103 
filename training_mrns_4.csv # patients 214 , # of reports 2897 
filename training_mrns_5.csv # patients 149 , # of reports 1849 
filename training_mrns_6.csv # patients 103 , # of reports 1214 
filename training_mrns_7.csv # patients 68 , # of reports 865 
filename training_mrns_8.csv # patients 43 , # of reports 680 
filename training_mrns_9.csv # patients 35 , # of reports 453 
filename training_mrns_10.csv # patients 27 , # of reports 363 
filename training_mrns_11.csv # patients 20 , # of reports 225 
filename training_mrns_12.csv # patients 15 , # of reports 178 
filename training_mrns_13.csv # patients 11 , # of reports 143 
filename training_mrns_14.csv # patients 8 , # of reports 56 
filename training_mrns_15.csv # patie

In [20]:
# Reload subset 0 from disk (presumably the file written by the subsampling
# loop above, i.e. the reports of all training patients -- TODO confirm the
# working directory matches).
# NOTE(review): this rebinds `training_mrns`, which earlier held the table of
# training MRNs; re-running the split/subsampling cells after this one will
# operate on a different frame.
# NOTE(review): the absolute path is machine-specific; consider deriving it
# from a configurable data directory.
FILENAME = '/home/jupyter/clinicalNLP2/data/manual_labels/processed/training_mrns_0.csv'
training_mrns = pd.read_csv(FILENAME)

/home/jupyter/clinicalNLP2/data/manual_labels
