In [11]:
import numpy as np
from dataframe_utils import data_get, vars_filter, vars_merge, vars_fill_nan
from workspace import workspace, paths_get, paths_join

## 01 Data Cleaning

Clean the whole dataset first, then split it into separate NAION and ODD subsets.

In [17]:
# Load the raw spreadsheet and tidy it into a frame with one harmonised
# 'group' label per row.
workspace('ocumet')
# NOTE(review): lowercase=True presumably lower-cases the column names — confirm in dataframe_utils.
data = data_get('1209-raw-mmy.xlsx', lowercase=True)

print('Raw dataframe size: (%s, %s)' % data.shape)

# Rows with no subject id cannot be attributed to a participant, so drop them.
data = data.dropna(subset=['subject_id'], axis='rows')
# NOTE(review): presumably removes the 'rimprof' columns — confirm in dataframe_utils.
data = vars_filter(data, ['rimprof'])

# Spell out the laterality flag; any value other than 'Y'/'N' becomes ''.
data['bilateral_str'] = (
    data['bilateral']
    .map({'Y': 'Bilateral', 'N': 'Unilateral'})
    .fillna('')
)
data['subject_id'] = data['subject_id'].astype(int)

# NOTE(review): vars_merge presumably joins the listed columns with '_' into the
# new 'id' and 'group' columns — confirm in dataframe_utils.
data = vars_merge(data, {'id': ['subject_id', 'eye'], 'group': ['diagnosis', 'bilateral_str']})
# Turn the underscores into spaces and trim what is left over when
# 'bilateral_str' was empty.
data['group'] = data['group'].str.replace('_', ' ', regex=False).str.strip()

# Collapse every label that contains a key (case-insensitively) to its canonical name.
diagnosis_map = {
    'not control': 'Not control',
    'ODD-AION': 'ODD-NAION',
}
for pattern, canonical in diagnosis_map.items():
    # regex=False: the keys are literal substrings, not patterns.
    # na=False: a missing group simply does not match instead of putting NaN
    # into the boolean mask, which .loc cannot index with.
    matches = data['group'].str.contains(pattern, case=False, regex=False, na=False)
    data.loc[matches, 'group'] = canonical

# Preview: first five rows, first ten columns.
display(data.iloc[:5, :10])

print('Clean dataframe size: (%s, %s)' % data.shape)
print(', '.join(sorted(data.group.unique().tolist())))

paths order: b.d.o.t.s
you're in /Users/miaomiaoyu/workspace/ocumet
Raw dataframe size: (192, 87)


Unnamed: 0,visit_no,date,subject_id,time_since_onset/diagnosis_(months),date_of_image,months,diagnosis,eye,odd-aion_,bilateral
0,5.0,2022-05-06,9,2020-03-01,2022-05-06,26.0,NAION,OD,,Y
1,5.0,2022-05-06,9,2022-03-25,2022-05-06,1.0,NAION,OS,,Y
2,1.0,2021-06-09,40,2020-01-09,2021-06-09,17.0,NAION,OS,,N
3,1.0,2021-10-06,102,2020-09-18,2021-10-06,12.0,NAION,OD,,Y
4,1.0,2022-05-13,137,2016-04-08,2022-05-13,73.0,NAION,OD,,Y


Clean dataframe size: (183, 66)
Control, NAION Bilateral, NAION Unilateral, Not control, ODD, ODD-NAION


In [26]:
# Create new columns for whole-eye FPF values.
# Whole-disc FPF is the row-wise mean of the four 'fpf_rnfl_*' columns.
# DataFrame.mean skips NaN by default, so a row with some missing values
# still gets the mean of the ones that are present.
data['fpf_disc'] = data[
    ['fpf_rnfl_t', 'fpf_rnfl_s', 'fpf_rnfl_n', 'fpf_rnfl_i']
].mean(axis=1)
# Macular FPF is a straight copy of 'mac_ret'; select it as a Series rather
# than assigning a one-column DataFrame to a single column.
data['fpf_mac'] = data['mac_ret']

In [27]:
# Column roles: the grouping factor, the identifying columns, and the outcome
# measures.
independent_vars = 'group'
identification_vars = ['id', 'months', 'visit_no']

# Outcome measures are picked out by the prefix of their column name.
# str.startswith accepts a tuple and matches if any prefix fits.
dependent_vars = [
    column
    for column in data.columns
    if column.startswith(('mean', 'oct', 'fpf', 'vf', 'hvf'))
]

# Shorten the 'cq_mac_' / 'gcc_mac_' fragments so the names are tidier for
# str.split later.
new_dependent_vars = [
    column.replace('cq_mac_', 'cq_').replace('gcc_mac_', 'gcc_')
    for column in dependent_vars
]
data = data.rename(
    columns={old: new for old, new in zip(dependent_vars, new_dependent_vars)}
)

In [28]:
# Write one CSV per condition, each holding that condition's rows plus the
# controls.
# Only the second of the five returned paths is used here.
# NOTE(review): presumably the data directory — confirm in workspace.paths_get.
_, d, _, _, _ = paths_get('ocumet')

# The set of group labels does not change inside the loop, so compute it once.
group_labels = data['group'].unique().tolist()

for x in ['NAION', 'ODD']:
    this_group = [y for y in group_labels if x in y]

    keep = data['group'].isin(this_group + ['Control'])
    if x == 'NAION':
        # 'ODD-NAION' contains the substring 'NAION' but is only wanted in
        # the ODD export.
        keep &= data['group'] != 'ODD-NAION'
    # Take an explicit copy so later writes go to an independent frame rather
    # than a slice of `data` (avoids SettingWithCopyWarning).
    group_data = data[keep].copy()

    group_data = vars_fill_nan(group_data, d_vars=new_dependent_vars, i_vars=independent_vars)
    # Two-level label: 'Control' versus the condition being exported.
    group_data = group_data.assign(
        group_binary=np.where(group_data['group'] == 'Control', 'Control', x)
    )

    # NOTE(review): 'good' is not created in the cells visible here; it is
    # assumed to come from the raw spreadsheet — confirm.
    data_variables = [independent_vars] + identification_vars + new_dependent_vars + ['group_binary', 'good']
    group_data[data_variables].to_csv(paths_join(d, '1209-%s.csv' % x.lower()), index=False)
    print("%s data saved out as '1209-%s.csv'" % (x, x.lower()))

NAION data saved out as '1209-naion.csv'
ODD data saved out as '1209-odd.csv'
