# Init 

In [None]:
import numpy as np
import pandas as pd

In [None]:
def fullDisplay(df, max_rows=None, max_col=None, width=None):
    """Display ``df`` with temporarily overridden pandas display limits.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to show.
    max_rows, max_col
        Values applied to ``display.max_rows`` and ``display.max_columns``
        only for the duration of this call.
    width
        When not None, the frame is shown through ``df.style`` with a
        ``width`` property of ``f'{width}px'``; otherwise a plain copy of
        ``df`` is shown.
    """
    if width is None:
        shown = df.copy()
    else:
        shown = df.style.set_properties(width=f'{width}px')

    display_limits = ('display.max_rows', max_rows, 'display.max_columns', max_col)
    with pd.option_context(*display_limits):
        display(shown)

In [None]:
# Import from the public IPython.display module; importing `display` from
# IPython.core.display is deprecated.
from IPython.display import HTML, display

# Inject a CSS rule that sets elements of class "container" to 80% width.
display(HTML("<style>.container { width:80% !important; }</style>"))

In [None]:
def ndig_id(id):
    """Zero-pad every ``_``-separated numeric field of ``id`` to four digits.

    Example: ``'12_3'`` -> ``'0012_0003'``.

    Returns the string ``'ERROR'`` when ``id`` cannot be split (e.g. it is
    None or NaN rather than a string) or when a field cannot be converted
    with ``int``.
    """
    try:
        return '_'.join(f'{int(part):04d}' for part in id.split('_'))
    except (AttributeError, TypeError, ValueError):
        # Only conversion failures map to the sentinel. A bare `except:` would
        # also swallow KeyboardInterrupt / SystemExit.
        return 'ERROR'
    

# Create data-frame

In [None]:
fname = './AMISH_12192019.csv'#'./AMISH_reduced.csv'  #'./AMISH_OCT.csv'

# Read only the top of the file: the header line plus two rows. Row 1 holds the
# keys used below; the header text and row 0 are used as the per-column
# description and value description respectively.
amish_head = pd.read_csv(fname,nrows=2)

col_desc = {amish_head.iloc[1,i]:col.replace('\n',' ') for i, col in enumerate(amish_head.columns)}
col_val_desc = {amish_head.iloc[1,i]:str(vals).replace('\n',' ') for i, vals in enumerate(amish_head.iloc[0,:])}

# skiprows=2 skips the first two lines of the file; skipfooter=1 drops the last one.
# skipfooter is not supported by the C parser, so ask for the python engine
# explicitly instead of relying on the ParserWarning fallback.
amish_df = pd.read_csv(fname,skiprows=2,parse_dates=['EXAM_DATE'],dtype={'INDIVIDUAL':str},skipfooter=1,engine='python')

# PAT_ID: GROUP and INDIVIDUAL joined by '_' and normalised by ndig_id.
amish_df['PAT_ID'] = (amish_df.GROUP+'_'+amish_df.INDIVIDUAL).apply(ndig_id)
# CASE_ID: PAT_ID followed by the exam date formatted as MMDDYYYY.
amish_df['CASE_ID'] = amish_df.PAT_ID +'_'+amish_df.EXAM_DATE.apply(lambda d: d.strftime("%m%d%Y"))

print(amish_df.shape)
amish_df.head()

In [None]:
# The original `amish_df[]` is a SyntaxError (empty subscript) and stops Run All.
# Show the two derived ID columns instead.
amish_df[['PAT_ID','CASE_ID']].head()

In [None]:
# Print every column name as a plain Python list.
print(list(amish_df.columns))

## Add drusen volume as a binary feature

In [None]:
# Recode 'NE' entries to -3 so the column can be cast to a float dtype
# (binarizeNumerical maps negative source values to NaN in the flag column).
# The dtype is given by name so this cell does not need a numpy import.
amish_df['CO_RPE_V3MM_OS'] = amish_df['CO_RPE_V3MM_OS'].replace('NE',-3).astype('float32')

In [None]:
def binarizeNumerical(p,th,df):
    """Add thresholded flag columns for the OD and OS versions of feature ``p``.

    For each suffix (``OD``, ``OS``) a column ``f'{p}_L{th}_{suffix}'`` is
    written into ``df`` in place: True where ``df[f'{p}_{suffix}'] >= th``,
    False otherwise, and NaN where the source value is negative.

    Parameters
    ----------
    p : str
        Column-name prefix; ``f'{p}_OD'`` and ``f'{p}_OS'`` must exist in ``df``.
    th : number
        Threshold; it is also embedded in the new column names.
    df : pandas.DataFrame
        Modified in place. Nothing is returned.
    """
    for eye in ('OD', 'OS'):
        source = df[f'{p}_{eye}']
        # Build the flags as a standalone object array so NaN can sit next to
        # True/False, then assign the finished column once. The previous
        # chained assignment (df[col][mask] = nan) triggers
        # SettingWithCopyWarning and is not guaranteed to write back into df.
        flags = (source >= th).to_numpy().astype(object)
        flags[(source < 0).to_numpy()] = np.nan
        df[f'{p}_L{th}_{eye}'] = flags
    


In [None]:
# Threshold: values >= th are flagged True by binarizeNumerical.
# NOTE(review): the original note read "<= 0.03mm^2" — confirm the intended
# comparison direction and the unit.
th = 0.03
p = 'CO_RPE_V3MM' # Drusen Volume

# Pass the named threshold instead of repeating the literal, so editing `th`
# actually changes the generated columns.
binarizeNumerical(p,th,amish_df)

In [None]:
# Load AMISH_hypoDrusen.csv (INDIVIDUAL kept as object, EXAM_DATE parsed as dates)
# and drop the STUDY, SUBSTUDY and EXAMINER columns.
hypo_df = (
    pd.read_csv('AMISH_hypoDrusen.csv',dtype={'INDIVIDUAL':object},parse_dates=['EXAM_DATE'])
    .drop(columns=['STUDY','SUBSTUDY','EXAMINER'])
)
hypo_df

In [None]:
# Show the current main frame (the next cell merges hypo_df into it).
amish_df

In [None]:
# Left-join the hypo-drusen columns onto the main frame by exam identity, then
# fill the two drusen-core columns with 'N' where no matching row was found.
drusen_core_cols = ['CO_Drusen_Core_OD','CO_Drusen_Core_OS']
amish_df_tmp = amish_df.merge(
    hypo_df,
    how='left',
    on=['CENTER','GROUP','INDIVIDUAL','EXAM_DATE'],
)
amish_df_tmp[drusen_core_cols] = amish_df_tmp[drusen_core_cols].fillna('N')

In [None]:
# Rebind amish_df to the merged frame; the later cells work on this version.
# NOTE(review): after this rebinding, re-running the merge cell would merge
# hypo_df into an already-merged frame.
amish_df = amish_df_tmp

## External data frames

In [None]:
# Preview the first five rows of the merged frame (head() defaults to 5).
amish_df.head()

In [None]:
from pathlib import Path
ext_path = Path('/opt/data/Jupyter_Notebook/NadavRakocz/Doheny/reproduce/')
# Keep only regular files whose suffix is .csv (the previous substring test also
# matched any entry whose name merely contains "csv"). Sort the result because
# iterdir() yields entries in arbitrary order and later cells use the frame of
# whichever file was loaded last.
ext_files = sorted(d for d in ext_path.iterdir() if d.is_file() and d.suffix == '.csv')
ext_files

In [None]:
# `f` is only bound by the loading loop in a later cell, so a bare `f.stem`
# raises NameError on a fresh kernel. List the stems of all external files.
[f.stem for f in ext_files]

In [None]:
# Load every external CSV and derive PAT_ID / CASE_ID.
# Note that ext_amish_df still refers to the last file's frame after the loop,
# and later cells use it.
ext_dfs = []
for f in ext_files:
    # skipfooter is not supported by the C parser; request the python engine
    # explicitly instead of relying on the ParserWarning fallback.
    ext_amish_df = pd.read_csv(f,skiprows=2,parse_dates=['EXAM_DATE'],dtype={'INDIVIDUAL':str},skipfooter=1,engine='python')
    # Here the patient id comes from the ID column, normalised by ndig_id.
    ext_amish_df['PAT_ID'] = ext_amish_df.ID.apply(ndig_id)
    ext_amish_df['CASE_ID'] = ext_amish_df.PAT_ID +'_'+ext_amish_df.EXAM_DATE.apply(lambda d: d.strftime("%m%d%Y"))
    # binarizeNumerical('CO_RPE_V3MM',0.03,ext_amish_df)
    ext_dfs.append(ext_amish_df)
    

In [None]:
# Rows of the last-loaded external frame whose CENTER is 'UPEN', shown sorted
# by exam date.
a = ext_amish_df.loc[ext_amish_df['CENTER'].eq('UPEN')]
a.sort_values('EXAM_DATE')

## Exclude bad images 

In [None]:
# SO_QIMG codes used for filtering: `exclude` is `exclude_ish` plus 'P'.
exclude_ish = ['CG','NA','NE']
exclude = ['P', *exclude_ish]

In [None]:
# Keep a row only when neither eye's SO_QIMG code is in the exclusion list.
# amish_df_qc uses the full list (`exclude`); amish_df_q uses `exclude_ish`.
amish_df_qc = amish_df.loc[~amish_df.SO_QIMG_OD.isin(exclude) & ~amish_df.SO_QIMG_OS.isin(exclude)]
amish_df_q = amish_df.loc[~amish_df.SO_QIMG_OD.isin(exclude_ish) & ~amish_df.SO_QIMG_OS.isin(exclude_ish)]

In [None]:
# Same two-eye filter for the last-loaded external frame.
# NOTE(review): this uses `exclude_ish` (as amish_df_q does), not `exclude`,
# despite the _qc suffix — confirm that is intended.
ext_amish_df_qc = ext_amish_df.loc[
    ~ext_amish_df.SO_QIMG_OD.isin(exclude_ish) & ~ext_amish_df.SO_QIMG_OS.isin(exclude_ish)
]
ext_amish_df_qc

## Save to disk

In [None]:
# Column names of the frame about to be saved, as a plain list.
list(amish_df.columns)

In [None]:
AMISH_FNAME = 'AMISH_12192019_wHypoDrusen'

# Write both filtered variants without the index column: the frame filtered
# with `exclude` under the base name, the one filtered with `exclude_ish`
# under the _wQ suffix.
amish_df_qc.to_csv(path_or_buf=f'{AMISH_FNAME}.csv', index=False)
amish_df_q.to_csv(path_or_buf=f'{AMISH_FNAME}_wQ.csv', index=False)


### External data frames

In [None]:
# to_csv returns None when given a path, so its result must not be assigned
# back — doing so replaced ext_amish_df with None.
# NOTE(review): the file name says QC but the unfiltered frame is written;
# confirm whether ext_amish_df_qc was intended.
ext_amish_df.to_csv('EXT_AMISH_SO_QC.csv',index=False)

In [None]:
# Write one CSV per external input file, named after the input's stem.
# NOTE(review): the frames in ext_dfs are not quality-filtered even though the
# output name ends in _ext_qc — confirm.
for f,ext_df in zip(ext_files,ext_dfs):
    o_name = f'{f.stem}_ext_qc.csv'
    # index=False matches the other exports in this notebook and avoids
    # writing an extra unnamed index column.
    ext_df.to_csv(o_name,index=False)

# Review features 

In [None]:
# Split each mapping into parallel tuples of keys and values.
col, desc = zip(*col_desc.items())
col, vals = zip(*col_val_desc.items())

# One lookup table per mapping, joined on the column key so each row carries
# both the description and the value description.
a = pd.DataFrame({'COL': col, 'DESC': desc})
b = pd.DataFrame({'COL': col, 'VALS': vals})
cols_df = a.merge(b, how='outer', on='COL')

fullDisplay(cols_df, width=600)

In [None]:
# Column-wise maximum of the two selected volume columns.
# NOTE(review): this pairs V5MM for OD with V3MM for OS — confirm that is intended.
amish_df.loc[:, ['CO_RPE_V5MM_OD','CO_RPE_V3MM_OS']].max()

# SO

In [None]:
# Every column whose name contains the substring 'SO', previewed at 10 rows.
so_col = [name for name in amish_df.columns if 'SO' in name]
fullDisplay(amish_df[so_col], max_rows=10)

In [None]:
# Print two positional slices of the column-key list: rows 68-83 and rows 86 onward.
print(cols_df['COL'].iloc[68:84])
print(cols_df['COL'].iloc[86:])



In [None]:
# Column keys whose value description mentions the 'No (N)' option.
[name for name, values in col_val_desc.items() if 'No (N)' in values]

In [None]:
# Literal list of SO_* column names in OD/OS pairs. It is a bare expression, so
# the cell only displays it; nothing is assigned.
# NOTE(review): presumably pasted from the output of the 'No (N)' filter cell
# above — confirm, and bind it to a name if later code needs it.
['SO_SUBRETINAL_OD',
 'SO_SUBRETINAL_OS',
 'SO_SRTSRHRM_OD',
 'SO_SRTSRHRM_OS',
 'SO_INTRA_RCS_OD',
 'SO_INTRA_RCS_OS',
 'SO_OUTER_RT_OD',
 'SO_OUTER_RT_OS',
 'SO_SR_DRUSEN_OD',
 'SO_SR_DRUSEN_OS',
 'SO_HRF_IRHRFOND_OD',
 'SO_HRF_IRHRFOND_OS',
 'SO_HRF_HRFOD_OD',
 'SO_HRF_HRFOD_OS',
 'SO_PED_DPED_OD',
 'SO_PED_DPED_OS',
 'SO_PED_HPED_OD',
 'SO_PED_HPED_OS',
 'SO_PED_SEROUS_OD',
 'SO_PED_SEROUS_OS']


In [None]:
# Half the number of SO columns (presumably one OD/OS pair per feature — TODO confirm).
len(so_col)//2