# Prediction of Hospital Admission Related to Infections

In this notebook, we develop a Cox proportional hazards regression model to predict the risk of hospital admission for common infections, including urinary tract infection (UTI), upper respiratory tract infection (URTI), lower respiratory tract infection (LRTI), sinusitis, otitis media or middle ear infection (ot media), and ear infection or otitis externa (ot externa).

In [1]:
import pandas as pd
from matplotlib import pyplot
import matplotlib.pyplot as plt
import seaborn as sns
import math
import numpy as np
import os
import glob
import gzip
from matplotlib.ticker import PercentFormatter
from patsy import dmatrices
from lifelines import CoxPHFitter
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer
import pickle
from sklearn.preprocessing import PolynomialFeatures
from lifelines.utils import k_fold_cross_validation
from lifelines.utils import concordance_index
from lifelines.calibration import survival_probability_calibration
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.metrics import brier_score_loss
from sklearn.calibration import CalibrationDisplay
from sklearn.preprocessing import OneHotEncoder
from sklearn.calibration import calibration_curve
from datetime import date
from operator import attrgetter
import io
import sys
from contextlib import redirect_stdout
# %matplotlib inline

In [2]:
import warnings
warnings.filterwarnings('ignore')
warnings.warn('DelftStack')
warnings.warn('Do not show this message')
print("No Warning Shown")



## 0- Functions

In [3]:
def create_lineplot(data, var, title, ax1):
    """Plot per-date record counts, one line per level of ``var``, on ``ax1``.

    Rows of ``data`` are counted for every (``date``, ``var``) combination,
    combinations with a count of 5 or fewer are discarded, and the remaining
    counts are drawn with seaborn. Dashed black vertical lines are then added
    at the x positions "2020-01", "2020-04" and "2021-04".

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``date`` column and the column named by ``var``.
        Assumes ``date`` holds 'YYYY-MM' strings so that the vertical-line
        positions land on the axis — TODO confirm against callers.
    var : str
        Name of the column used for grouping and as the line hue.
    title : str
        Title set on the axes.
    ax1 : matplotlib.axes.Axes
        Axes that receive the plot; modified in place.

    Returns
    -------
    None
    """
    counts = data.groupby(['date', var]).size().reset_index(name='Freq')
    # Keep only groups with more than 5 records
    # (presumably small-count suppression — verify the intended threshold).
    counts = counts.loc[counts['Freq'] > 5]

    sns.lineplot(data=counts, x='date', y='Freq', hue=var, ax=ax1)

    ax1.set_title(title)
    ax1.set_ylabel('Count')
    ax1.xaxis.set_tick_params(which='both', labelbottom=True, labeltop=False, rotation=90)

    # Reference markers at fixed calendar months.
    for marker in ("2020-01", "2020-04", "2021-04"):
        ax1.axvline(x=marker, color='black', ls='--', lw=1.5)

In [4]:
def GetPrintSummary(model):
    """Return whatever ``model.print_summary(style="html")`` writes to stdout.

    Standard output is temporarily redirected into an in-memory buffer while
    the summary is printed, and the captured text is returned so it can be
    saved (e.g. as an HTML file) by the caller.

    Parameters
    ----------
    model : object
        Any object exposing ``print_summary(style=...)``.

    Returns
    -------
    str
        The captured text; an empty string if nothing was printed.
    """
    captured = io.StringIO()
    with redirect_stdout(captured):
        model.print_summary(style="html")
    return captured.getvalue()

In [5]:
def ConvertColumns(InfectionKey, NumSets, df=None):
    """Cast the ``admitted_<InfectionKey>_date_<i>`` columns to ``datetime64[ns]``.

    The columns for ``i = 1 .. NumSets`` are converted in place on the target
    frame.

    Parameters
    ----------
    InfectionKey : str
        Infection identifier embedded in the column names (e.g. ``'lrti'``).
    NumSets : int
        Number of numbered columns to convert; values below 1 convert nothing.
    df : pandas.DataFrame, optional
        Frame to modify. When omitted, the notebook-level ``data`` frame is
        used, which preserves the original two-argument behaviour.

    Returns
    -------
    None

    Raises
    ------
    KeyError
        If one of the expected columns is missing from the frame.
    """
    if df is None:
        # Backward-compatible default: operate on the global frame as before.
        df = data
    for i in range(1, NumSets + 1):
        column = f'admitted_{InfectionKey}_date_{i}'
        df[column] = df[column].astype('datetime64[ns]')

In [6]:
def round_five_mul(x, base=5):
    """Round ``x`` to the nearest multiple of ``base`` (5 by default).

    The built-in ``round`` is used, so values exactly halfway between two
    multiples go to the even multiple (with ``base=5``: 12.5 -> 10, 7.5 -> 10).
    """
    nearest_multiple_count = round(x / base)
    return base * nearest_multiple_count

## 1- Data Preparation

In [174]:
# Load the LRTI hospitalisation extract into the notebook-level frame `data`.
data = pd.read_csv('../output/hospitalisation_data/input_hospitalisation_lrti.csv.gz')
# data_bmi = pd.read_csv(f'../output/hospitalisation_data/input_bmi.csv.gz')

# Display (rows, columns) of the loaded frame.
data.shape

(200000, 79)

In [None]:
# Save record-availability counts: for each of the four lrti_date_<i> columns,
# the number of rows with a value and the number of rows without one.
# The file is opened in a `with` block so it is flushed and closed once the
# counts are written (the handle was previously left open).
with open("../output/hospitalisation_prediction_lrti/data_lrti_record_cph_lrti.txt", "w") as a:
    a.write('\n'.join(
        'data_lrti_%d #with a record, #with no record: %.0f, %.0f'
        % (i, data['lrti_date_%d' % i].notna().sum(), data['lrti_date_%d' % i].isna().sum())
        for i in range(1, 5)
    ))

In [184]:
# Save GP-consultation availability counts: for each of the four
# gp_cons_lrti_<i> columns, the number of rows with a value and the number of
# rows without one.
# The file is opened in a `with` block so it is flushed and closed once the
# counts are written (the handle was previously left open).
with open("../output/hospitalisation_prediction_lrti/data_gp_counts_cph_lrti.txt", "w") as a:
    a.write('\n'.join(
        'data_lrti_%d #with gp cons, #with no gp cons: %.0f, %.0f'
        % (i, data['gp_cons_lrti_%d' % i].notna().sum(), data['gp_cons_lrti_%d' % i].isna().sum())
        for i in range(1, 5)
    ))

In [8]:
# Summary statistics for all columns of `data`, regardless of dtype.
data.describe(include='all')

Unnamed: 0,deregistered_date,died_date,bmi_date_measured,smoking_status_date,most_recent_unclear_smoking_cat_date,flu_vaccine_med,flu_vaccine_clinical,admitted_date,lrti_date_1,lrti_date_2,...,incdt_lrti_date_4,sgss_gp_cov_lrti_date_1,sgss_gp_cov_lrti_date_2,sgss_gp_cov_lrti_date_3,sgss_gp_cov_lrti_date_4,lrti_ab_count_1,lrti_ab_count_2,lrti_ab_count_3,lrti_ab_count_4,patient_id
count,10000,20000,150000,20000,20000,20000,20000,60000,20000,20000,...,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0,200000.0
unique,1332,749,44,44,44,13,13,1333,1333,1333,...,,,,,,,,,,
top,2020-07-06,2022-08-24,2021-07,2022-05,2019-09,2018-03,2018-03,2019-05-05,2019-07-02,2019-05-30,...,,,,,,,,,,
freq,16,150,3628,498,503,1758,1780,70,33,29,...,,,,,,,,,,
mean,,,,,,,,,,,...,0.1,0.1,0.1,0.1,0.1,0.900545,0.900275,0.90055,0.90038,998911.0
std,,,,,,,,,,,...,0.300001,0.300001,0.300001,0.300001,0.300001,1.860186,1.860268,1.859556,1.860197,578212.1
min,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,496772.8
50%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,999489.5
75%,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1501442.0


### 1-2- Additional variables
#### 1-2-1- Calculating Charlson Comorbidity Index
Based on https://www.nature.com/articles/4500889.pdf?origin=ppub

In [1]:
# data['CCI'] = np.nan
# col0 = data.columns.get_loc('CCI')

# col1 = data.columns.get_loc('cancer_comor')
# col2 = data.columns.get_loc('cardiovascular_comor')
# col3 = data.columns.get_loc('chronic_obstructive_pulmonary_comor') ##
# col4 = data.columns.get_loc('heart_failure_comor') ##
# col5 = data.columns.get_loc('connective_tissue_comor')
# col6 = data.columns.get_loc('dementia_comor')
# col7 = data.columns.get_loc('diabetes_comor')
# col8 = data.columns.get_loc('diabetes_complications_comor')
# col9 = data.columns.get_loc('hemiplegia_comor')
# col10 = data.columns.get_loc('hiv_comor')
# col11 = data.columns.get_loc('metastatic_cancer_comor')
# col12 = data.columns.get_loc('mild_liver_comor')
# col13 = data.columns.get_loc('mod_severe_liver_comor')
# col14 = data.columns.get_loc('mod_severe_renal_comor')
# col15 = data.columns.get_loc('mi_comor')
# col16 = data.columns.get_loc('peptic_ulcer_comor')
# col17 = data.columns.get_loc('peripheral_vascular_comor')
# # col18 = data.columns.get_loc('age')


# for idx, x in enumerate(data['CCI']):
#     n=0
#     if data.iat[idx, col15] == 1: # mi_comor
#         n=n+1
#     if data.iat[idx, col2] == 1: # cardiovascular_comor
#         n=n+1 
#     if data.iat[idx, col17] == 1: # peripheral_vascular_comor
#         n=n+1 
#     if data.iat[idx, col3] == 1: # chronic_obstructive_pulmonary_comor
#         n=n+1
#     if data.iat[idx, col7] == 1: # diabetes_comor
#         n=n+1
#     if data.iat[idx, col6] == 1: # dementia_comor
#         n=n+1
#     if data.iat[idx, col16] == 1: # peptic_ulcer_comor
#         n=n+1
#     if data.iat[idx, col5] == 1: # connective_tissue_comor
#         n=n+1
#     if data.iat[idx, col12] == 1: # mild_liver_comor
#         n=n+1
#     if data.iat[idx, col9] == 1: # hemiplegia_comor
#         n=n+2
#     if data.iat[idx, col14] == 1: # mod_severe_renal_comor
#         n=n+2
#     if data.iat[idx, col8] == 1: # diabetes_complications_comor
#         n=n+2
#     if data.iat[idx, col1] == 1: # cancer_comor
#         n=n+2
#     if data.iat[idx, col13] == 1: # mod_severe_liver_comor
#         n=n+3
#     if data.iat[idx, col11] == 1: # metastatic_cancer_comor
#         n=n+6
#     if data.iat[idx, col10] == 1: # hiv_comor
#         n=n+6
    
#     data.iat[idx, col0]=n

# data.CCI.value_counts()

## 2- Data preparation
This step covers data type conversion and the exclusion of cases that are not considered in the analysis of infection-related hospital admission, i.e. those who were diagnosed with COVID-19 in the three months (90 days) before or the one month (30 days) after hospital admission.

### 2-1- Gathering data and exclusion of Covid-19

#### LRTI-1

In [11]:
# data_lrti_1 = data[['age', 'age_cat', 'sex', 'flu_vaccine', 'smoking_status', 'bmi', 'imd', 'ethnicity', 
#                     'region', 'hx_hosp', 'CCI', #'meds_nonabs', 
#                     'gp_count_1', 'antibacterial_brit_1',
#                     'died_date', 'deregistered_date', 'gp_cons_lrti_1', 'gp_cons_lrti_ab_1',
#                     'lrti_date_1', 'incdt_lrti_date_1', 'admitted_lrti_date_1', 'sgss_gp_cov_lrti_date_1']]

# data_lrti_1 = data_lrti_1[data_lrti_1['lrti_date_1'].notnull()]

# # exclusion of covid positive 90 days before and 30 days after dx with lrti_1
# data_lrti_1 = data_lrti_1[data_lrti_1['sgss_gp_cov_lrti_date_1'] == 0]

# # drop rows with no gp_cons_lrti_1 records
# data_lrti_1 = data_lrti_1.dropna(subset=['gp_cons_lrti_1'])
# data_lrti_1.shape

(1757, 22)

#### LRTI-2

In [12]:
# data_lrti_2 = data[['age', 'age_cat', 'sex', 'flu_vaccine', 'smoking_status', 'bmi', 'imd', 'ethnicity', 
#                     'region', 'hx_hosp', 'CCI', #'meds_nonabs',
#                     'gp_count_2', 'antibacterial_brit_2',
#                     'died_date', 'deregistered_date', 'gp_cons_lrti_2', 'gp_cons_lrti_ab_2',
#                     'lrti_date_2', 'incdt_lrti_date_2', 'admitted_lrti_date_2', 'sgss_gp_cov_lrti_date_2']]

# data_lrti_2 = data_lrti_2[data_lrti_2['lrti_date_2'].notnull()]

# # exclusion of covid positive 90 days before and 30 days after dx with lrti_2
# data_lrti_2 = data_lrti_2[data_lrti_2['sgss_gp_cov_lrti_date_2'] == 0]

# # drop rows with no gp_cons_lrti_2 records
# data_lrti_2 = data_lrti_2.dropna(subset=['gp_cons_lrti_2'])
# data_lrti_2.shape

(1844, 22)

#### LRTI-3

In [13]:
# data_lrti_3 = data[['age', 'age_cat', 'sex', 'flu_vaccine', 'smoking_status', 'bmi', 'imd', 'ethnicity', 
#                     'region', 'hx_hosp', 'CCI', #'meds_nonabs',
#                     'gp_count_3', 'antibacterial_brit_3',
#                     'died_date', 'deregistered_date', 'gp_cons_lrti_3', 'gp_cons_lrti_ab_3',
#                     'lrti_date_3', 'incdt_lrti_date_3', 'admitted_lrti_date_3', 'sgss_gp_cov_lrti_date_3']]

# data_lrti_3 = data_lrti_3[data_lrti_3['lrti_date_3'].notnull()]

# # exclusion of covid positive 90 days before and 30 days after dx with lrti_3
# data_lrti_3 = data_lrti_3[data_lrti_3['sgss_gp_cov_lrti_date_3'] == 0]

# # drop rows with no gp_cons_lrti_3 records
# data_lrti_3 = data_lrti_3.dropna(subset=['gp_cons_lrti_3'])
# data_lrti_3.shape

(1823, 22)

#### LRTI-4

In [14]:
# data_lrti_4 = data[['age', 'age_cat', 'sex', 'flu_vaccine', 'smoking_status', 'bmi', 'imd', 'ethnicity', 
#                     'region', 'hx_hosp', 'CCI', #'meds_nonabs',
#                     'gp_count_4', 'antibacterial_brit_4',
#                     'died_date', 'deregistered_date', 'gp_cons_lrti_4', 'gp_cons_lrti_ab_4',
#                     'lrti_date_4', 'incdt_lrti_date_4', 'admitted_lrti_date_4', 'sgss_gp_cov_lrti_date_4']]

# data_lrti_4 = data_lrti_4[data_lrti_4['lrti_date_4'].notnull()]

# # exclusion of covid positive 90 days before and 30 days after dx with lrti_4
# data_lrti_4 = data_lrti_4[data_lrti_4['sgss_gp_cov_lrti_date_4'] == 0]

# # drop rows with no gp_cons_lrti_4 records
# data_lrti_4 = data_lrti_4.dropna(subset=['gp_cons_lrti_4'])
# data_lrti_4.shape

(1734, 22)

### 2-2- Combining data

In [15]:
# data_lrti_1.rename(columns={'lrti_date_1': 'lrti_date',
#                            'gp_count_1': 'gp_count', 'antibacterial_brit_1': 'antibacterial_brit',
#                            'gp_cons_lrti_1': 'gp_cons_lrti', 'gp_cons_lrti_ab_1': 'gp_cons_lrti_ab',
#                            'incdt_lrti_date_1': 'incdt_lrti_date', #'prevl_lrti_date_1': 'prevl_lrti_date',
#                            'admitted_lrti_date_1': 'admitted_lrti_date',
#                            'sgss_gp_cov_lrti_date_1': 'sgss_gp_cov_lrti_date'},
#                 inplace=True)
# data_lrti_2.rename(columns={'lrti_date_2': 'lrti_date',
#                            'gp_count_2': 'gp_count', 'antibacterial_brit_2': 'antibacterial_brit',
#                            'gp_cons_lrti_2': 'gp_cons_lrti', 'gp_cons_lrti_ab_2': 'gp_cons_lrti_ab',
#                            'incdt_lrti_date_2': 'incdt_lrti_date', #'prevl_lrti_date_2': 'prevl_lrti_date', 
#                            'admitted_lrti_date_2': 'admitted_lrti_date',
#                            'sgss_gp_cov_lrti_date_2': 'sgss_gp_cov_lrti_date'},
#                 inplace=True)
# data_lrti_3.rename(columns={'lrti_date_3': 'lrti_date',
#                            'gp_count_3': 'gp_count', 'antibacterial_brit_3': 'antibacterial_brit',
#                            'gp_cons_lrti_3': 'gp_cons_lrti', 'gp_cons_lrti_ab_3': 'gp_cons_lrti_ab',
#                            'incdt_lrti_date_3': 'incdt_lrti_date', #'prevl_lrti_date_3': 'prevl_lrti_date', 
#                            'admitted_lrti_date_3': 'admitted_lrti_date',
#                            'sgss_gp_cov_lrti_date_3': 'sgss_gp_cov_lrti_date'},
#                  inplace=True)
# data_lrti_4.rename(columns={'lrti_date_4': 'lrti_date', 
#                            'gp_count_4': 'gp_count', 'antibacterial_brit_4': 'antibacterial_brit',
#                            'gp_cons_lrti_4': 'gp_cons_lrti', 'gp_cons_lrti_ab_4': 'gp_cons_lrti_ab',
#                            'incdt_lrti_date_4': 'incdt_lrti_date', #'prevl_lrti_date_4': 'prevl_lrti_date',
#                            'admitted_lrti_date_4': 'admitted_lrti_date',
#                            'sgss_gp_cov_lrti_date_4': 'sgss_gp_cov_lrti_date'}, 
#                  inplace=True)

# data_lrti = pd.concat([data_lrti_1, data_lrti_2, data_lrti_3, data_lrti_4])
# data_lrti.reset_index(inplace=True, drop=True)
# data_lrti.shape

(7158, 22)

### 2-3- Add season, event, and duration variables
#### Season

In [16]:
# # convert data types
# data_lrti['lrti_date'] = data_lrti['lrti_date'].astype('datetime64[ns]')
# data_lrti['gp_cons_lrti'] = data_lrti['gp_cons_lrti'].astype('datetime64[ns]')
# data_lrti['admitted_lrti_date'] = data_lrti['admitted_lrti_date'].astype('datetime64[ns]')
# data_lrti.died_date = data_lrti.died_date.astype('datetime64[ns]')
# data_lrti.deregistered_date = data_lrti.deregistered_date.astype('datetime64[ns]')

# #add a variable called date using gp consultation dates
# data_lrti['date'] = data_lrti['gp_cons_lrti']
# data_lrti['date'] = data_lrti['date'].dt.strftime('%Y-%m')
# data_lrti.shape

(7158, 23)

In [17]:
# #get today's date in year and month
# today_date_y_m = date.today()
# today_date_y_m = today_date_y_m.strftime('%Y-%m')
# # today_date_y_m

# #drop any records of data_lrti with today's date in year and month
# data_lrti = data_lrti[data_lrti['date'] != today_date_y_m]

# #get 2 months before today's date in year and month 
# last_month_date_y_m = date.today() - pd.DateOffset(months=2)
# last_month_date_y_m = last_month_date_y_m.strftime('%Y-%m')
# #drop any record of data_lrti dated 2 months before today's date in year and month
# data_lrti = data_lrti[data_lrti['date'] != last_month_date_y_m]

# #add a variable called season based on the month of lrti records
# data_lrti['season'] = np.nan
# data_lrti['date_month'] = pd.DatetimeIndex(data_lrti['date']).month

# conditions = [
#     (data_lrti['date_month'] >= 3) & (data_lrti['date_month'] <= 5),
#     (data_lrti['date_month'] >= 6) & (data_lrti['date_month'] <= 8),
#     (data_lrti['date_month'] >= 9) & (data_lrti['date_month'] <= 11),]
# choices = ['spring', 'summer', 'autumn']
# data_lrti['season'] = np.select(conditions, choices, default='winter')

# data_lrti['season'].value_counts()

spring    1976
winter    1767
summer    1672
autumn    1430
Name: season, dtype: int64

In [18]:
# data_lrti['date'].min()

'2019-01'

In [19]:
# # data_lrti = data_lrti[data_lrti['date'] <= '2019-12']
# # data_lrti = data_lrti[(data_lrti['date'] >= '2020-03') & (data_lrti['date']<= '2021-03')]
# # data_lrti = data_lrti[data_lrti['date'] >= '2021-04']
# data_lrti.shape

(6845, 25)

In [20]:
# data_lrti['date'].max()

'2022-07'

#### Adding event and duration columns

In [21]:
# #scenario 1
# #not hosped (nothing happened)
# #data_lrti = data_lrti[data_lrti['admitted_lrti_date'].notnull()]
# data_lrti.loc[data_lrti['admitted_lrti_date'].isnull(), 'event_lrti_admitted'] = 0
# data_lrti['event_lrti_admitted'].value_counts()

0.0    4840
Name: event_lrti_admitted, dtype: int64

In [2]:
# #scenario 2 
# # become a case (uncensoring)
# #calculating days between infection gp consultation and hosp
# # data_lrti['delta_lrti_admitted'] = (data_lrti['admitted_lrti_date'] - data_lrti['lrti_date']).dt.days
# data_lrti['delta_lrti_admitted'] = (data_lrti['admitted_lrti_date'] - data_lrti['gp_cons_lrti']).dt.days
# data_lrti.loc[((data_lrti['delta_lrti_admitted'] >= 0) & (data_lrti['delta_lrti_admitted'] <= 30)), 'event_lrti_admitted'] = 1
# # data_lrti['event_lrti_admitted'].value_counts()

# #scenario 2
# #drop whoever was admitted before lrti consultation, i.e. negative value for delta_lrti_admitted
# data_lrti = data_lrti[~(data_lrti['delta_lrti_admitted'] < 0)]
# data_lrti['delta_lrti_admitted'].value_counts()

In [3]:
# #scenario 3
# #censor died patients
# data_lrti['delta_admitted_died'] = (data_lrti['died_date'] - data_lrti['admitted_lrti_date']).dt.days
# data_lrti.loc[data_lrti['delta_admitted_died'] < 0, 'delta_admitted_died'] = np.NaN
# # data_lrti.loc[data_lrti['delta_admitted_died'] >= 0, 'event_lrti_admitted'] = 0
# data_lrti.loc[((data_lrti['delta_admitted_died'] >= 0) & (data_lrti['delta_admitted_died'] <= 30)), 'event_lrti_admitted'] = 0
# # data_lrti['event_lrti_admitted'].value_counts()

# #scenario 3
# #censor deregistered patients
# data_lrti['delta_admitted_deregistered'] = (data_lrti['deregistered_date'] - data_lrti['admitted_lrti_date']).dt.days
# data_lrti.loc[data_lrti['delta_admitted_deregistered'] < 0, 'delta_admitted_deregistered'] = np.NaN
# # data_lrti.loc[data_lrti['delta_admitted_deregistered'] >= 0, 'event_lrti_admitted'] = 0
# data_lrti.loc[((data_lrti['delta_admitted_deregistered'] > 0) & (data_lrti['delta_admitted_deregistered'] <= 30)), 'event_lrti_admitted'] = 0
# # data_lrti['event_lrti_admitted'].value_counts()

# #agg scenario 3s
# data_lrti['delta_admitted_died_deregistered'] = data_lrti['delta_admitted_deregistered'].combine_first(data_lrti['delta_admitted_died'])
# data_lrti.loc[data_lrti['delta_admitted_died_deregistered'] < 0, 'delta_admitted_died_deregistered'] = np.NaN
# data_lrti['delta_admitted_died_deregistered'].isnull().sum()#.value_counts()

In [24]:
# #scenario 1
# #any other patients (nothing happened)
# data_lrti['event_lrti_admitted'] = data_lrti['event_lrti_admitted'].replace(np.NaN, 0)
# data_lrti['event_lrti_admitted'].value_counts()

0.0    5831
1.0      54
Name: event_lrti_admitted, dtype: int64

In [4]:
# #assign values for duration column
# data_lrti['duration_lrti_admitted'] = data_lrti['delta_lrti_admitted'].combine_first(data_lrti['delta_admitted_died_deregistered'])
# data_lrti['duration_lrti_admitted'] = data_lrti['duration_lrti_admitted'].replace(np.NaN, 30)
# data_lrti.loc[(data_lrti['duration_lrti_admitted'] > 30), 'duration_lrti_admitted'] = 30
# data_lrti['duration_lrti_admitted'].value_counts()

In [5]:
# # give value 1 to event_lrti_admitted if duration_lrti_admitted is greater or equal to 0 and less than 30
# data_lrti.loc[((data_lrti['duration_lrti_admitted'] >= 0) & (data_lrti['duration_lrti_admitted'] < 30)), 'event_lrti_admitted'] = 1
# data_lrti['event_lrti_admitted'].value_counts()

In [27]:
# #drop any rows with value 0 in duration column
# data_lrti = data_lrti[~(data_lrti['duration_lrti_admitted'] == 0)]

# #scenario2 (uncensoring) again to prevent conflict with other scenarios
# data_lrti.loc[((data_lrti['delta_lrti_admitted'] > 0) & (data_lrti['delta_lrti_admitted'] < 30)), 'event_lrti_admitted'] = 1

In [6]:
# # data_lrti[data_lrti['duration_lrti_admitted']==0][['delta_lrti_admitted', 'lrti_date', 'gp_cons_lrti', 'admitted_lrti_date', 'delta_lrti_admitted', 'delta_admitted_died_deregistered', 'event_lrti_admitted', 'duration_lrti_admitted']]
# # data_lrti.loc[151:200, ['event_lrti_admitted', 'duration_lrti_admitted']]
# # data_lrti.loc[201:250, ['event_lrti_admitted', 'duration_lrti_admitted']].sort_values(by=['duration_lrti_admitted'])
# data_lrti[['event_lrti_admitted', 'duration_lrti_admitted']].sort_values(by=['duration_lrti_admitted'])

In [None]:
# # save data shape
# a = open("../output/hospitalisation_prediction_lrti/data_shape_cph_lrti_after_season.txt", "w")
# a.write("")
# a.writelines(['data_lrti #observations, #events: %.0f' % (data_lrti.event_lrti_admitted.value_counts()[0]), ', %.0f' % (data_lrti.event_lrti_admitted.value_counts()[1]),
#             #   '\ndata_lrti_abs_incdt #observations, #events: %.0f' % (data_lrti_abs_incdt.event_lrti_admitted.value_counts()[0]), ', %.0f' % (data_lrti_abs_incdt.event_lrti_admitted.value_counts()[1]),
#             #   '\ndata_lrti_no_abs_prevl #observations, #events: %.0f' % (data_lrti_no_abs_prevl.event_lrti_admitted.value_counts()[0]), ', %.0f' % (data_lrti_no_abs_prevl.event_lrti_admitted.value_counts()[1]),
#             #   '\ndata_lrti_abs_prevl #observations, #events: %.0f' % (data_lrti_abs_prevl.event_lrti_admitted.value_counts()[0]), ', %.0f' % (data_lrti_abs_prevl.event_lrti_admitted.value_counts()[1])
#               ])