In [1]:
import pandas as pd
import numpy as np
import pickle
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression, Ridge, RidgeClassifier
from sklearn.model_selection import train_test_split
from argparse import ArgumentParser
import math

In [2]:
ARG_PARSER = ArgumentParser()

ARG_PARSER.add_argument('--air', default=True)
ARG_PARSER.add_argument('--mimic', default=False)
ARG_PARSER.add_argument('--ehr', default=False)

ARG_PARSER.add_argument('--pred', default=True)
ARG_PARSER.add_argument('--imp', default=False)

ARG_PARSER.add_argument('--pred_len', default=8, type=int)
ARG_PARSER.add_argument('--missingRate', default=10, type=int)

_StoreAction(option_strings=['--missingRate'], dest='missingRate', nargs=None, const=None, default=10, type=<class 'int'>, choices=None, help=None, metavar=None)

In [3]:
args = ARG_PARSER.parse_args(args=[])

In [4]:
class MiceImputer(object):

    def __init__(self, seed_values = True, seed_strategy="mean", copy=False):
        self.strategy = seed_strategy # seed_strategy in ['mean','median','most_frequent', 'constant']
        self.seed_values = seed_values # seed_values = False initializes missing_values using not_null columns
        self.copy = copy
        self.imp = SimpleImputer(strategy=self.strategy, copy=self.copy)

    def fit_transform(self, X, method = 'Linear', iter = 2, verbose = True):
        
        # Why use Pandas?
        # http://gouthamanbalaraman.com/blog/numpy-vs-pandas-comparison.html
        # Pandas < Numpy if X.shape[0] < 50K
        # Pandas > Numpy if X.shape[0] > 500K
        
        # Data necessary for masking missing-values after imputation
        null_cols = X.columns[X.isna().any()].tolist()
        null_X = X.isna()[null_cols]
      
        ### Initialize missing_values
        
        if self.seed_values:
            
            # Impute all missing values using SimpleImputer 
            if verbose:
                print('Initilization of missing-values using SimpleImputer')
            new_X = pd.DataFrame(self.imp.fit_transform(X))
            print("new_X",new_X.shape)
            print("X",X.shape) 
            if(new_X.shape[1]<X.shape[1]):
                new_X=X.fillna(X.mean())
                return new_X
            new_X.columns = X.columns
            new_X.index = X.index
            
        else:
   
            # Initialize a copy based on value of self.copy
            if self.copy:
                new_X = X.copy()
            else:
                new_X = X
#             print("New_X: ",new_X.shape)   
#             print(new_X.iloc[0,:])
            not_null_cols = X.columns[X.notna().any()].tolist()
#             print("not_null_cols: ",not_null_cols)
            
            if verbose:
                print('Initilization of missing-values using regression on non-null columns')
               
            for column in null_cols:
                
                null_rows = null_X[column]
                train_x = new_X.loc[~null_rows, not_null_cols]
                test_x = new_X.loc[null_rows, not_null_cols]
                train_y = new_X.loc[~null_rows, column]
#                 print(train_x.shape)
#                 print(test_x.shape)
#                 print(train_y.shape)
#                 train_x[np.isnan(train_x) == True] = 0
#                 test_x[np.isnan(test_x) == True] = 0
#                 print(np.any(np.isnan(train_x)))
#                 print(np.all(np.isfinite(train_x)))
                
#                 print(np.any(np.isnan(test_x)))
#                 print(np.all(np.isfinite(test_x)))
                
#                 print(np.any(np.isnan(train_y)))
#                 print(np.all(np.isfinite(train_y)))
                
                if X[column].nunique() > 2:
                    m = LinearRegression(n_jobs = -1)
                    m.fit(train_x, train_y)
                    new_X.loc[null_rows,column] = pd.Series(m.predict(test_x))
                    not_null_cols.append(column)
                    
#                 elif X[column].nunique() == 2:
#                     m = LogisticRegression(n_jobs = -1, solver = 'lbfgs')
#                     m.fit(train_x, train_y)
#                     new_X.loc[null_rows,column] = pd.Series(m.predict(test_x))
#                     not_null_cols.append(column)
        
        ### Begin iterations of MICE
        
        model_score = {}
        
        for i in range(iter):
            if verbose:
                print('Beginning iteration ' + str(i) + ':')
                
            model_score[i] = []
            
            for column in null_cols:
                
                null_rows = null_X[column]                
                not_null_y = new_X.loc[~null_rows, column]
                not_null_X = new_X[~null_rows].drop(column, axis = 1)
                
                train_x, val_x, train_y, val_y = train_test_split(not_null_X, not_null_y, test_size=0.33, random_state=42)  
                test_x = new_X.drop(column, axis = 1)
                  
                if new_X[column].nunique() > 2:
                    if method == 'Linear':
                        m = LinearRegression(n_jobs = -1)
                    elif method == 'Ridge':
                        m = Ridge()
                    
                    train_x[np.isnan(train_x) == True] = 0
                    test_x[np.isnan(test_x) == True] = 0    
                    m.fit(train_x, train_y)
                    model_score[i].append(m.score(val_x, val_y))
                    new_X.loc[null_rows,column] = pd.Series(m.predict(test_x))
                    if verbose:
                        print('Model score for ' + str(column) + ': ' + str(m.score(val_x, val_y)))
                    
                elif new_X[column].nunique() == 2:
                    if method == 'Linear':
                        m = LogisticRegression(n_jobs = -1, solver = 'lbfgs')
                    elif method == 'Ridge':
                        m = RidgeClassifier()
                        
                    m.fit(train_x, train_y)
                    model_score[i].append(m.score(val_x, val_y))
                    new_X.loc[null_rows,column] = pd.Series(m.predict(test_x))
                    if verbose:
                        print('Model score for ' + str(column) + ': ' + str(m.score(val_x, val_y)))
                
            if model_score[i] == []:
                model_score[i] = 0
            else:
                model_score[i] = sum(model_score[i])/len(model_score[i])

        print(new_X.shape)        
        return new_X


In [5]:
def dataPrep(data,flag):
    data=data.sort_values(by=['person_id','Age'])
    del data['person_id']
    del data['visit_start_datetime']
    del data['visit_end_datetime']
    del data['val']
    del data['Interval']
    #print(data['Age'].head())
    #print(data.columns)
    if flag==1:
        del data['interval']
        #del data['intervalReverse']
    data.rename({'MEAS_3038553': 'BMI'}, axis=1, inplace=True)
    #print(data.shape)
    data = data.to_numpy()
    data = np.reshape(data, (int(data.shape[0]/20), 20, data.shape[1]))
    #data=data[2500:3000,:,:]
    #print(data.shape)
    return data

In [6]:
def pred(data,mask,predWin):
    data = data.to_numpy()
    data = np.reshape(data, (int(data.shape[0]/20), 20, data.shape[1]))

    mask = mask.to_numpy()
    mask = np.reshape(mask, (int(mask.shape[0]/20), 20, mask.shape[1]))

    mask=mask[:,:,0]
    y=data[:,:,2].copy()
    testMask=mask.copy()
    
    for i in range(data.shape[0]):
        #if(data[i,])
        j=20
        if(predWin==8):
                k=16
        elif(predWin==7):
            k=14
        elif(predWin==6):
            k=12
        elif(predWin==5):
            k=10

        data[i,j-k:j-1,:]=0
        mask[i,j-k:j]=0
        y[i,0:j-k]=0
        testMask[i,0:j-k]=0

    data=data[:,:,2]
    #     print(data.shape)
    #     print(mask.shape)
    #     print(y.shape)
    #     print(testMask.shape)
    #     data=data.transpose()
    #     mask=mask.transpose()
    #     y=y.transpose()
    #     testMask=testMask.transpose()
#     print(data[:,0])
#     print(y[:,0])
#     print(testMask[:,0])
    data[data==0]=np.nan
    data=pd.DataFrame(data)
    mice = MiceImputer()
    #print(data.shape)
    data_complete=mice.fit_transform(data)
    data_complete=data_complete.fillna(0)

    data_complete = data_complete.to_numpy()

    data_complete=np.multiply(data_complete,testMask)

    mae = np.abs(data_complete - y).sum() / (1e-5 + np.sum(testMask))
    mre = np.abs(data_complete - y).sum() / (1e-5 + np.sum(np.abs(y)))

    outBmi=data_complete[np.nonzero(testMask)]
    inBmi=y[np.nonzero(testMask)]

    # with open('results/mice/'+str(m) +'/outBmiPred2', 'wb') as fp:
    #     pickle.dump(outBmi, fp)
    # with open('results/mice/'+str(m) +'/inBmiPred2', 'wb') as fp:
    #     pickle.dump(inBmi, fp)

    #plotBmi(inBmi,outBmi,len(inBmi))
    loss = (outBmi - inBmi)
    loss=np.asarray([abs(number) for number in loss])
    variance = sum([((x - mae) ** 2) for x in loss]) / len(loss) 
    res = variance ** 0.5
    ci=1.96*(res/(math.sqrt(len(loss))))
    print("CI",ci)

    print("Mean MAE: ",mae)
    print("Mean MRE: ",mre)

In [7]:
def imp(data,mask,missingRate):
    data = data.to_numpy()
    data = np.reshape(data, (int(data.shape[0]/20), 20, data.shape[1]))

    mask = mask.to_numpy()
    mask = np.reshape(mask, (int(mask.shape[0]/20), 20, mask.shape[1]))

    mask=mask[:,:,0]
    #print(mask)
    y=data[:,:,2].copy()
    testMask=mask.copy()

    samples=0
    pids=0

    for i in range(data.shape[0]):
        idxs = np.nonzero(mask[i,:] == 1)
        idxs=(np.array(idxs)).reshape(-1)
        #print(idxs)
        #idxs=np.random.choice(idxs,3)
        #print(len(idxs))

        samples=samples+len(idxs)
        if(missingRate==50):
            if(len(idxs)>4):
                idxs=np.random.choice(idxs,5)
                data[i,idxs[0],2]=0
                data[i,idxs[1],2]=0
                data[i,idxs[2],2]=0
                data[i,idxs[3],2]=0
                data[i,idxs[4],2]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                mask[i,idxs[3]]=0
                mask[i,idxs[4]]=0
                pids=pids + 5
                break;
        if(missingRate>=40):
            if(len(idxs)>3):
                idxs=np.random.choice(idxs,4)
                data[i,idxs[0],2]=0
                data[i,idxs[1],2]=0
                data[i,idxs[2],2]=0
                data[i,idxs[3],2]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                mask[i,idxs[3]]=0
                pids=pids + 4
                break;
        if(missingRate>=30):
            if(len(idxs)>2):
                idxs=np.random.choice(idxs,3)
                data[i,idxs[0],2]=0
                data[i,idxs[1],2]=0
                data[i,idxs[2],2]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                pids=pids + 3
                break;
        if(missingRate>=20):
            if(len(idxs)>1):
                idxs=np.random.choice(idxs,2)
                data[i,idxs[0],2]=0
                data[i,idxs[1],2]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                pids=pids + 2
                break;
        if(missingRate>=10):
            if(len(idxs)>0):
                idxs=np.random.choice(idxs,1)
                data[i,idxs,2]=0
                mask[i,idxs]=0
                pids=pids + 1


        testMask[i,:]=testMask[i,:]-mask[i,:]
        y[i,:]=y[i,:]*testMask[i,:]

    data=data[:,:,2]
    #data=data.transpose()
    #mask=mask.transpose()
    #y=y.transpose()
    #testMask=testMask.transpose()
    mice = MiceImputer()
    #     print(data[0,:])
    #     print(y[0,:])
    #     print(testMask[0,:])

    data[data==0]=np.nan
    data=pd.DataFrame(data)

    data_complete=mice.fit_transform(data)
    data_complete = data_complete.to_numpy()

    data_complete=np.multiply(data_complete,testMask)

    #     print(data_complete[0,:])
    #     print(y[0,:])
    #     print(testMask[0,:])
    mae = np.abs(data_complete - y).sum() / (1e-5 + np.sum(testMask))
    mre = np.abs(data_complete - y).sum() / (1e-5 + np.sum(np.abs(y)))

    outBmi=data_complete[np.nonzero(testMask)]
    inBmi=y[np.nonzero(testMask)]

    # with open('results/mice/'+str(m) +'/outBmiImp', 'wb') as fp:
    #     pickle.dump(outBmi, fp)
    # with open('results/mice/'+str(m) +'/inBmiImp', 'wb') as fp:
    #     pickle.dump(inBmi, fp)

    #print(len(inBmi))
    #print(len(outBmi))
    #plotBmi(inBmi,outBmi,len(inBmi))

    loss = (outBmi - inBmi)
    loss=np.asarray([abs(number) for number in loss])
    variance = sum([((x - mae) ** 2) for x in loss]) / len(loss) 
    res = variance ** 0.5
    ci=1.96*(res/(math.sqrt(len(loss))))
    print("CI",ci)

    print("Mean MAE: ",mae)
    #print("Mean MRE: ",mre)

In [8]:
def ehrImpute(missingRate):
    data=pd.read_csv('.../aaai/data/ehr/preprocess/test/finalTest102.csv',header=0)
    mask=pd.read_csv('.../aaai/data/ehr/preprocess/test/mask/maskTest102.csv',header=0)

    data=dataPrep(data,0)
    mask=dataPrep(mask,1)
    mask=mask[:,:,316]
    y=data[:,:,316].copy()
    testMask=mask.copy()

    samples=0
    pids=0

    for i in range(data.shape[0]):
        idxs = np.nonzero(mask[i,:] == 1)
        idxs=(np.array(idxs)).reshape(-1)
        #print(idxs)
        #idxs=np.random.choice(idxs,3)
        #print(len(idxs))

        samples=samples+len(idxs)
        if(missingRate==50):
            if(len(idxs)>4):
                idxs=np.random.choice(idxs,5)
                data[i,idxs[0],316]=0
                data[i,idxs[1],316]=0
                data[i,idxs[2],316]=0
                data[i,idxs[3],316]=0
                data[i,idxs[4],316]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                mask[i,idxs[3]]=0
                mask[i,idxs[4]]=0
                pids=pids + 5
                break;
        if(missingRate>=40):
            if(len(idxs)>3):
                idxs=np.random.choice(idxs,4)
                data[i,idxs[0],316]=0
                data[i,idxs[1],316]=0
                data[i,idxs[2],316]=0
                data[i,idxs[3],316]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                mask[i,idxs[3]]=0
                pids=pids + 4
                break;
        if(missingRate>=30):
            if(len(idxs)>2):
                idxs=np.random.choice(idxs,3)
                data[i,idxs[0],316]=0
                data[i,idxs[1],316]=0
                data[i,idxs[2],316]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                mask[i,idxs[2]]=0
                pids=pids + 3
                break;
        if(missingRate>=20):
            if(len(idxs)>1):
                idxs=np.random.choice(idxs,2)
                data[i,idxs[0],316]=0
                data[i,idxs[1],316]=0
                #print(mask[i,:])
                mask[i,idxs[0]]=0
                mask[i,idxs[1]]=0
                pids=pids + 2
                break;
        if(missingRate>=10):
            if(len(idxs)>0):
                idxs=np.random.choice(idxs,1)
                data[i,idxs,316]=0
                mask[i,idxs]=0
                pids=pids + 1

        testMask[i,:]=testMask[i,:]-mask[i,:]
        y[i,:]=y[i,:]*testMask[i,:]

    data=data[:,:,316]
    #data=data.transpose()
    #mask=mask.transpose()
    #y=y.transpose()
    #testMask=testMask.transpose()
    mice = MiceImputer()
    #     print(data[0,:])
    #     print(y[0,:])
    #     print(testMask[0,:])

    data[data==0]=np.nan
    data=pd.DataFrame(data)

    data_complete=mice.fit_transform(data)
    data_complete = data_complete.to_numpy()

    data_complete=np.multiply(data_complete,testMask)

    #     print(data_complete[0,:])
    #     print(y[0,:])
    #     print(testMask[0,:])
    mae = np.abs(data_complete - y).sum() / (1e-5 + np.sum(testMask))
    mre = np.abs(data_complete - y).sum() / (1e-5 + np.sum(np.abs(y)))

    outBmi=data_complete[np.nonzero(testMask)]
    inBmi=y[np.nonzero(testMask)]

    # with open('results/mice/'+str(m) +'/outBmiImp', 'wb') as fp:
    #     pickle.dump(outBmi, fp)
    # with open('results/mice/'+str(m) +'/inBmiImp', 'wb') as fp:
    #     pickle.dump(inBmi, fp)

    #print(len(inBmi))
    #print(len(outBmi))
    #plotBmi(inBmi,outBmi,len(inBmi))

    loss = (outBmi - inBmi)
    loss=np.asarray([abs(number) for number in loss])
    variance = sum([((x - mae) ** 2) for x in loss]) / len(loss) 
    res = variance ** 0.5
    ci=1.96*(res/(math.sqrt(len(loss))))
    print("CI",ci)

    print("Mean MAE: ",mae)
    #print("Mean MRE: ",mre)

In [9]:
def ehrPred(predWin):
    data=pd.read_csv('.../aaai/data/ehr/preprocess/test/finalTest102.csv',header=0)
    mask=pd.read_csv('.../aaai/data/ehr/preprocess/test/mask/maskTest102.csv',header=0)

    data=dataPrep(data,0)
    mask=dataPrep(mask,1)
    mask=mask[:,:,316]
    y=data[:,:,316].copy()
    testMask=mask.copy()

    for i in range(data.shape[0]):
        #if(data[i,])
        j=20
        if(predWin==8):
                k=16
        elif(predWin==7):
            k=14
        elif(predWin==6):
            k=12
        elif(predWin==5):
            k=10

        data[i,j-k:j,:]=0
        mask[i,j-k:j]=0
        y[i,0:j-k]=0
        testMask[i,0:j-k]=0

    data=data[:,:,316]
    #     print(data.shape)
    #     print(mask.shape)
    #     print(y.shape)
    #     print(testMask.shape)
#     data=data.transpose()
#     mask=mask.transpose()
#     y=y.transpose()
#     testMask=testMask.transpose()
#         print(data[0,:])
#         print(y[0,:])
#         print(testMask[0,:])
    data[data==0]=np.nan
    data=pd.DataFrame(data)
    mice = MiceImputer()
    print(data.shape)
    data_complete=mice.fit_transform(data)
    data_complete=data_complete.fillna(0)

    data_complete = data_complete.to_numpy()

    data_complete=np.multiply(data_complete,testMask)

    mae = np.abs(data_complete - y).sum() / (1e-5 + np.sum(testMask))
    mre = np.abs(data_complete - y).sum() / (1e-5 + np.sum(np.abs(y)))

    outBmi=data_complete[np.nonzero(testMask)]
    inBmi=y[np.nonzero(testMask)]

    # with open('results/mice/'+str(m) +'/outBmiPred2', 'wb') as fp:
    #     pickle.dump(outBmi, fp)
    # with open('results/mice/'+str(m) +'/inBmiPred2', 'wb') as fp:
    #     pickle.dump(inBmi, fp)

    #plotBmi(inBmi,outBmi,len(inBmi))
    loss = (outBmi - inBmi)
    loss=np.asarray([abs(number) for number in loss])
    variance = sum([((x - mae) ** 2) for x in loss]) / len(loss) 
    res = variance ** 0.5
    ci=1.96*(res/(math.sqrt(len(loss))))
    print("CI",ci)

    print("Mean MAE: ",mae)
    print("Mean MRE: ",mre)

In [10]:
#MICE IMPUTATION ALGO
if args.imp:
    if args.air:
        data=pd.read_csv('.../aaai/data/air/preprocess/airTest.csv',header=0)
        mask=pd.read_csv('.../aaai/data/air/preprocess/airTestMask.csv',header=0)
        data=data[['Date', 'Time', 'Month', 'PT08.S1(CO)', 'CO(GT)', 'NMHC(GT)','C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)','PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']]
        del data['Time']
        del data['Date']
        del mask['Time']
        del mask['Date']
        imp(data,mask,args.missingRate)

    if args.mimic:
        data=pd.read_csv('.../aaai/data/mimic/preprocess/mimicTest.csv',header=0)
        mask=pd.read_csv('.../aaai/data/mimic/preprocess/mimicTestMask.csv',header=0)
        del data['subject_id']
        del data['charttime']
        del mask['subject_id']
        del mask['charttime']
        print(data.shape)
        data=data[['ALBUMIN', 'ANION GAP','WBC', 'BANDS', 'BICARBONATE', 'BILIRUBIN', 'BUN',
       'CHLORIDE', 'CREATININE', 'GLUCOSE', 'HEMATOCRIT', 'HEMOGLOBIN', 'INR',
       'LACTATE', 'PaCO2', 'PLATELET', 'POTASSIUM', 'PT', 'PTT', 'SODIUM'
       ]]
        imp(data,mask,args.missingRate)

    if args.ehr:
        ehrImpute(args.missingRate)
    


In [11]:
#MEAN PRED ALGO
if args.pred:
    if args.air:
        data=pd.read_csv('.../aaai/data/air/preprocess/airTest.csv',header=0)
        mask=pd.read_csv('.../aaai/data/air/preprocess/airTestMask.csv',header=0)
        data=data[['Date', 'Time', 'Month', 'PT08.S1(CO)', 'CO(GT)', 'NMHC(GT)','C6H6(GT)', 'PT08.S2(NMHC)', 'NOx(GT)', 'PT08.S3(NOx)', 'NO2(GT)','PT08.S4(NO2)', 'PT08.S5(O3)', 'T', 'RH', 'AH']]
        del data['Time']
        del data['Date']
        del mask['Time']
        del mask['Date']
        pred(data,mask,args.pred_len)

    if args.mimic:
        data=pd.read_csv('.../aaai/data/mimic/preprocess/mimicTest.csv',header=0)
        mask=pd.read_csv('.../aaai/data/mimic/preprocess/mimicTestMask.csv',header=0)
        del data['subject_id']
        del data['charttime']
        del mask['subject_id']
        del mask['charttime']
        print(data.shape)
        data=data[['ALBUMIN', 'ANION GAP','WBC', 'BANDS', 'BICARBONATE', 'BILIRUBIN', 'BUN',
       'CHLORIDE', 'CREATININE', 'GLUCOSE', 'HEMATOCRIT', 'HEMOGLOBIN', 'INR',
       'LACTATE', 'PaCO2', 'PLATELET', 'POTASSIUM', 'PT', 'PTT', 'SODIUM'
       ]]
        pred(data,mask,args.pred_len)

    if args.ehr:
        ehrPred(args.pred_len)

Initilization of missing-values using SimpleImputer
new_X (107, 5)
X (107, 20)
CI 3.776614504030245
Mean MAE:  70.74858657183213
Mean MRE:  0.9979892663814449
