# Pool the test set (results) from the Logistic Regression model for all 5 imputations
- Get one predicted probability for each patient (USRDS_ID) via averaging the score for each USRDS ID across all 5 imputations

In [1]:
import pickle
import numpy as np
import pandas as pd

import psycopg2
from sqlalchemy import create_engine

con = create_engine('postgresql://username:password@location/dbname')

### Import the model results for each imputation

In [2]:
with open('./results/2021_final_LR_model_test_pred_proba_imp_1.pickle','rb') as f:  
    imp1_pred = pickle.load(f)
with open('./results/2021_final_LR_model_test_pred_proba_imp_2.pickle','rb') as f:  
    imp2_pred = pickle.load(f)
with open('./results/2021_final_LR_model_test_pred_proba_imp_3.pickle','rb') as f:  
    imp3_pred = pickle.load(f)
with open('./results/2021_final_LR_model_test_pred_proba_imp_4.pickle','rb') as f:  
    imp4_pred = pickle.load(f)
with open('./results/2021_final_LR_model_test_pred_proba_imp_5.pickle','rb') as f:  
    imp5_pred = pickle.load(f)

### Keep only the predictions for the positive class (died_in_90)

In [5]:
pooled = pd.DataFrame()
pooled['imp1']=imp1_pred[:,1]
pooled['imp2']=imp2_pred[:,1]
pooled['imp3']=imp3_pred[:,1]
pooled['imp4']=imp4_pred[:,1]
pooled['imp5']=imp5_pred[:,1]

### Calculate the mean and standard deviation of the predicted probability for the positive class (died_in_90) for each patient/row 

In [6]:
pooled['score'] = pooled.mean(axis=1)
pooled['std_'] = pooled.std(axis=1)
pooled.head()

Unnamed: 0,imp1,imp2,imp3,imp4,imp5,score,std_
0,0.925393,0.914949,0.901626,0.929962,0.9217,0.918726,0.009861
1,0.778438,0.800176,0.776335,0.791798,0.777974,0.784944,0.009424
2,0.449145,0.417679,0.474578,0.594807,0.488599,0.484961,0.059995
3,0.63624,0.627138,0.649013,0.59881,0.635832,0.629407,0.016815
4,0.16801,0.186585,0.171646,0.276738,0.167967,0.194189,0.041841


### Import the details from the original data (postgres)

In [None]:
dataset = pd.read_sql_query('''SELECT  usrds_id, died_in_90, subset FROM medxpreesrd;''', con)

### sort the values so they are in the same order as when the LR was calculated and keep only the test set

In [None]:
dataset = dataset[dataset.subset > 6].copy().sort_values(by = 'usrds_id').reset_index(drop=True)

### merge the details with the pooled predictions

In [12]:
pooled = pooled.merge(dataset, left_index=True, right_index=True)
pooled

Unnamed: 0,imp1,imp2,imp3,imp4,imp5,score,std_,usrds_id,died_in_90,subset
0,0.925393,0.914949,0.901626,0.929962,0.921700,0.918726,0.009861,31089.0,1.0,9
1,0.778438,0.800176,0.776335,0.791798,0.777974,0.784944,0.009424,34521.0,0.0,8
2,0.449145,0.417679,0.474578,0.594807,0.488599,0.484961,0.059995,46751.0,0.0,8
3,0.636240,0.627138,0.649013,0.598810,0.635832,0.629407,0.016815,50506.0,0.0,9
4,0.168010,0.186585,0.171646,0.276738,0.167967,0.194189,0.041841,54985.0,0.0,8
...,...,...,...,...,...,...,...,...,...,...
345300,0.625205,0.574412,0.541137,0.574188,0.604139,0.583816,0.028736,4056286.0,0.0,8
345301,0.106419,0.106407,0.106438,0.106418,0.106436,0.106424,0.000012,4056351.0,0.0,7
345302,0.131457,0.131446,0.131473,0.131461,0.131455,0.131458,0.000009,4056356.0,0.0,9
345303,0.124623,0.091418,0.109285,0.103798,0.108954,0.107615,0.010684,4056511.0,0.0,7


### save as a pickle file

In [14]:
with open('./results/2021_final_LR_model_test_pred_proba_pooled.pickle', 'wb') as picklefile:  
        pickle.dump(pooled, picklefile)