In [None]:
# Domain names used to build the input and feature file names.
# Later cells index this list by position (0..3), so the order matters.
domains = [
    'measurement',
    'drug',
    'procedure',
    'condition',
]

In [None]:
# set Relative Path
import os
import pathlib
# Resolve every path relative to the directory the notebook is run from.
current_dir = pathlib.Path.cwd()
print(current_dir)
parent_dir = current_dir.parent

# input file path
importsql_output_dir = parent_dir / '0_importsql' / 'output'

# output file path (created if missing)
output_dir = current_dir / 'output'
output_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

# output file path (features, created if missing)
output_features_dir = current_dir / 'output_features'
output_features_dir.mkdir(mode=0o777, parents=True, exist_ok=True)

# read hoi list: one outcome name per line
# NOTE(review): the list is read from 'ImportSQL' while the data above comes
# from '0_importsql' -- confirm both directories are intended to exist.
with open(parent_dir / 'ImportSQL' / 'abnormal_list.txt', 'r') as f:
    data = f.read()
# Strip surrounding whitespace and skip blank lines so that an empty or padded
# line does not turn into an outcome name that can never match an input file.
outcome_list = [line.strip() for line in data.splitlines() if line.strip()]

In [None]:
import pandas as pd
from tqdm import tqdm
from preprocessing_all_domain import *
def extractSelectedConceptID(domain_df, concept_id_list):
    """Return the rows of ``domain_df`` whose ``concept_id`` is in ``concept_id_list``.

    Also prints the number of requested concept ids, the number of input rows
    and the number of rows that were kept.
    """
    extract_domain_df = domain_df[domain_df['concept_id'].isin(concept_id_list)]
    print(len(concept_id_list), len(domain_df), len(extract_domain_df))
    return extract_domain_df


# Columns kept for every domain frame after renaming (loop-invariant).
common_cols = ['subject_id', 'age', 'sex', 'label',
               'cohort_start_date', 'first_abnormal_date', 'concept_date',
               'concept_id', 'concept_name', 'concept_value', 'concept_domain']

# Names of the outcomes whose processing raised an exception.
error = []

# For every outcome: load the four domain files, select variables per domain,
# build the rolled/normalized table and append one summary line to output.txt.
# The processing helpers called below are not defined in this notebook; they
# are presumably provided by the star import of preprocessing_all_domain.
for outcome_name in tqdm(outcome_list):
    try:
        # @load data
        meas_df = pd.read_csv('{}/{}_{}.txt'.format(importsql_output_dir, outcome_name, domains[0]), low_memory=False)
        drug_df = pd.read_csv('{}/{}_{}.txt'.format(importsql_output_dir, outcome_name, domains[1]), low_memory=False)
        proc_df = pd.read_csv('{}/{}_{}.txt'.format(importsql_output_dir, outcome_name, domains[2]), low_memory=False)
        cond_df = pd.read_csv('{}/{}_{}.txt'.format(importsql_output_dir, outcome_name, domains[3]), low_memory=False)

        # @create column for concat ; concept_domain
        meas_df['concept_domain'] = 'meas'
        drug_df['concept_domain'] = 'drug'
        proc_df['concept_domain'] = 'proc'
        cond_df['concept_domain'] = 'cond'

        # @use common terminology.
        meas_df = meas_df.rename({'measurement_date': 'concept_date', 'value_as_number': 'concept_value'}, axis=1)
        drug_df = drug_df.rename({'drug_exposure_start_date': 'concept_date', 'quantity': 'concept_value'}, axis=1)
        proc_df = proc_df.rename({'procedure_date': 'concept_date'}, axis=1)
        cond_df = cond_df.rename({'condition_start_date': 'concept_date'}, axis=1)

        # @fill concept_value
        # NOTE: for drugs this overwrites the 'quantity' values renamed above
        # (marked as temp code by the original author).
        drug_df['concept_value'] = 1  # temp code
        proc_df['concept_value'] = 1
        cond_df['concept_value'] = 1

        # @use only necessary columns
        meas_df = meas_df[common_cols]
        drug_df = drug_df[common_cols]
        proc_df = proc_df[common_cols]
        cond_df = cond_df[common_cols]

        print(len(meas_df), len(drug_df), len(proc_df), len(cond_df), (len(meas_df) + len(drug_df) + len(proc_df) + len(cond_df)))

        # @valid data processing for cohorts.
        meas_df = cohortConditionSetting(meas_df)
        drug_df = cohortConditionSetting(drug_df)
        proc_df = cohortConditionSetting(proc_df)
        cond_df = cohortConditionSetting(cond_df)

        # @variable selection
        meas_vars_df = variant_selection_paired_t_test(meas_df)
        drug_vars_df = variant_selection_mcnemar(drug_df)
        proc_vars_df = variant_selection_mcnemar(proc_df)
        cond_vars_df = variant_selection_mcnemar(cond_df)

        # @variable selection (Top 30 based on p Value)
        meas_vars_df = meas_vars_df.sort_values(by='pvalue', ascending=True).reset_index(drop=True).head(30)
        drug_vars_df = drug_vars_df.sort_values(by='pvalue', ascending=True).reset_index(drop=True).head(30)
        cond_vars_df = cond_vars_df.sort_values(by='pvalue', ascending=True).reset_index(drop=True).head(30)
        proc_vars_df = proc_vars_df.sort_values(by='pvalue', ascending=True).reset_index(drop=True).head(30)

        print(len(meas_vars_df), len(drug_vars_df), len(proc_vars_df), len(cond_vars_df))

        # @variable selection (save)
        meas_vars_df.to_csv('{}/{}_{}_feature.csv'.format(output_features_dir, outcome_name, domains[0]), header=True, index=True)
        drug_vars_df.to_csv('{}/{}_{}_feature.csv'.format(output_features_dir, outcome_name, domains[1]), header=True, index=True)
        proc_vars_df.to_csv('{}/{}_{}_feature.csv'.format(output_features_dir, outcome_name, domains[2]), header=True, index=True)
        cond_vars_df.to_csv('{}/{}_{}_feature.csv'.format(output_features_dir, outcome_name, domains[3]), header=True, index=True)

        # @Extract only selected concepts from data frame
        meas_df2 = extractSelectedConceptID(meas_df, meas_vars_df.concept_id.unique())
        drug_df2 = extractSelectedConceptID(drug_df, drug_vars_df.concept_id.unique())
        proc_df2 = extractSelectedConceptID(proc_df, proc_vars_df.concept_id.unique())
        cond_df2 = extractSelectedConceptID(cond_df, cond_vars_df.concept_id.unique())

        all_domain_df = pd.concat([meas_df2, drug_df2, proc_df2, cond_df2], axis=0, ignore_index=True)

        # @test :
        averageDurationOfAE = average_duration_of_adverse_events(all_domain_df)
        print(averageDurationOfAE)

        pivot_data = pivotting(all_domain_df)

        # Selected concept ids per domain, passed to the interpolation step.
        domain_ids = {}
        domain_ids['meas'] = meas_df2.concept_id.unique()
        domain_ids['drug'] = drug_df2.concept_id.unique()
        domain_ids['proc'] = proc_df2.concept_id.unique()
        domain_ids['cond'] = cond_df2.concept_id.unique()

        interpolate_df = day_sequencing_interpolate(pivot_data, domain_ids)

        label_1 = interpolate_df[interpolate_df['label'] == 1]
        label_0 = interpolate_df[interpolate_df['label'] == 0]

        # Label-0 unique ids start right after the largest label-1 unique id.
        rolled_label1_d = shift_rolling_window(label_1, OBP=28, nShift=7, uid_index=1)
        rolled_label0_d = label_0_fitting(label_0, OBP=28, nShift=14, uid_index=(rolled_label1_d.unique_id.max() + 1))

        # label 0 + label 1
        concat_df = pd.concat([rolled_label1_d, rolled_label0_d], sort=False)
        concat_df = concat_df.sort_values(['unique_id', 'concept_date'])

        # Normalization (Min/Max Scalar)
        concat_df = normalization(concat_df)

        # columns name : concept_id > concept_name
        concept_dict = dict(zip(all_domain_df.concept_id, all_domain_df.concept_name))
        concat_df = concat_df.rename(concept_dict, axis='columns')

        # Save File
        concat_df.to_csv('{}/{}.txt'.format(output_dir, outcome_name), index=False, float_format='%g')

        # Per-outcome summary: concept counts before/after selection and
        # the number of distinct subjects per label.
        output = {}
        output['meas_whole_var'] = len(meas_df.concept_id.unique())
        output['drug_whole_var'] = len(drug_df.concept_id.unique())
        output['proc_whole_var'] = len(proc_df.concept_id.unique())
        output['cond_whole_var'] = len(cond_df.concept_id.unique())
        output['meas_selected_var'] = len(meas_df2.concept_id.unique())
        output['drug_selected_var'] = len(drug_df2.concept_id.unique())
        output['proc_selected_var'] = len(proc_df2.concept_id.unique())
        output['cond_selected_var'] = len(cond_df2.concept_id.unique())
        output['nPatient_label1'] = len(rolled_label1_d.subject_id.unique())
        output['nPatient_label0'] = len(rolled_label0_d.subject_id.unique())

        # print
        print(output['meas_whole_var'], output['meas_selected_var'])
        print(output['drug_whole_var'], output['drug_selected_var'])
        print(output['proc_whole_var'], output['proc_selected_var'])
        print(output['cond_whole_var'], output['cond_selected_var'])

        # Append one '///'-separated summary line per outcome. The context
        # manager closes the file even if the write fails.
        summary_fields = [
            outcome_name,
            output['meas_whole_var'], output['meas_selected_var'],
            output['drug_whole_var'], output['drug_selected_var'],
            output['proc_whole_var'], output['proc_selected_var'],
            output['cond_whole_var'], output['cond_selected_var'],
            output['nPatient_label1'], output['nPatient_label0'],
        ]
        with open('{}/output.txt'.format(output_dir), 'a') as out:
            out.write('///'.join(str(field) for field in summary_fields) + '\n')

    except Exception as exc:
        # Best-effort batch: continue with the next outcome, but report why
        # this one failed. Catching Exception (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop the run.
        print('except:', outcome_name)
        print('  reason: {!r}'.format(exc))
        error.append(outcome_name)
print(error)