In [3]:
# Import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# import psycopg2

# below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

%matplotlib inline
plt.style.use('ggplot')

# information used to create a database connection
sqluser = 'postgres'
dbname = 'mimic'
schema_name = 'mimiciii'
pw = 'postgres'

# Connect to postgres with a copy of the MIMIC-III database
con = psycopg2.connect(dbname=dbname, user=sqluser, password=pw)

# the below statement is prepended to queries to ensure they select from the right schema
query_schema = 'set search_path to ' + schema_name + ';'

NameError: name 'psycopg2' is not defined

In [None]:
# Load the prescription columns used later from table: prescriptions
prescriptions_sql = """
SELECT row_id, subject_id, hadm_id, icustay_id, startdate
    , enddate, drug_type, drug, drug_name_poe
    , drug_name_generic, formulary_drug_cd
FROM prescriptions
"""
# prefix with the search_path statement so the table resolves in the right schema
query = query_schema + prescriptions_sql
prescriptions = pd.read_sql_query(sql=query, con=con)

In [None]:
# Load the per-admission ICD-9 diagnosis codes from table: diagnoses_icd
diagnoses_sql = """
SELECT subject_id, hadm_id, icd9_code
FROM diagnoses_icd
"""
# prefix with the search_path statement so the table resolves in the right schema
query = query_schema + diagnoses_sql
diagnoses_icd = pd.read_sql_query(sql=query, con=con)

In [None]:
# Load the ICD-9 diagnosis dictionary (code plus short/long title)
# from table: d_icd_diagnoses
diagnosis_dictionary_sql = """
SELECT short_title, icd9_code, long_title
FROM d_icd_diagnoses
"""
# prefix with the search_path statement so the table resolves in the right schema
query = query_schema + diagnosis_dictionary_sql
d_diagnoses_icd = pd.read_sql_query(sql=query, con=con)

In [None]:
# Load one row per admission from tables: admissions and patients.
# Two derived columns are computed in SQL by converting an interval in seconds
# to years (seconds / 60 / 60 / 24 / 365.242):
#   age       - admittime minus date of birth
#   age_death - date of death minus date of birth
admissions_sql = """
(
SELECT adm.subject_id, adm.hadm_id, adm.admission_type
    , adm.diagnosis, adm.admittime, adm.dischtime, adm.deathtime
    , adm.insurance, adm.language, adm.religion
    , adm.marital_status, adm.ethnicity, pat.gender, pat.expire_flag
    , EXTRACT('epoch' from adm.admittime - pat.dob)  / 60.0 / 60.0 / 24.0 / 365.242 AS age
    , EXTRACT('epoch' from pat.dod - pat.dob)  / 60.0 / 60.0 / 24.0 / 365.242 AS age_death
FROM admissions adm
INNER JOIN patients pat
    ON adm.subject_id = pat.subject_ID
)
"""
# prefix with the search_path statement so the tables resolve in the right schema
query = query_schema + admissions_sql
admissions = pd.read_sql_query(sql=query, con=con)

In [None]:
# Load the DRG codes recorded for each admission from table: drgcodes
drgcodes_sql = """
SELECT subject_id, hadm_id, drg_type, drg_code
    , description, drg_severity, drg_mortality
FROM drgcodes
"""
# prefix with the search_path statement so the table resolves in the right schema
query = query_schema + drgcodes_sql
drgcodes = pd.read_sql_query(sql=query, con=con)

In [None]:
# Load the per-admission ICD-9 procedure codes from table: procedures_icd
procedures_sql = """
SELECT subject_id, hadm_id, icd9_code
FROM procedures_icd
"""
# prefix with the search_path statement so the table resolves in the right schema
query = query_schema + procedures_sql
procedures_icd = pd.read_sql_query(sql=query, con=con)

In [None]:
#get data from table: d_icd_procedures
# This is the ICD-9 *procedure* dictionary (code plus short/long title). The
# query previously selected FROM d_icd_diagnoses, so procedure codes were being
# labelled with diagnosis titles when merged with procedures_icd.
query = query_schema +"""
SELECT icd9_code, short_title, long_title
FROM d_icd_procedures
"""
d_procedures_icd = pd.read_sql_query(query, con)

In [None]:
# Read the salmonella trigger-code list, then keep only the rows whose
# code system is ICD-9-CM
salmonellaTC = pd.read_csv(
    "C:/Users/Maggie/OneDrive/UW-BHI/Research Ish/PHIN VADS/salmonellaRCTC.csv"
)

is_icd9_row = salmonellaTC['CodeSystem'] == 'ICD9CM'
salmonellaICD = salmonellaTC[is_icd9_row]

#### Reprocessing dates: mapping the random future dates in MIMIC onto the 2001-2012 range

In [1]:
# Print a summary of the admissions frame (columns, dtypes, non-null counts)
# before the date columns are reprocessed
admissions.info()

NameError: name 'admissions' is not defined

In [None]:
# Work on an independent (deep) copy so the frame loaded from the database
# stays untouched by the date reprocessing below
admissions2 = admissions.copy(deep=True)

In [None]:
#group future dates into 2001-2012
# Each window of shifted admission years maps to one calendar year:
#   <=2111 -> 2001, 2112-2120 -> 2002, ..., 2193-2201 -> 2011, >2201 -> 2012
# The windows are expressed once as bin edges instead of twelve hand-written
# range masks, which removes the risk of a typo in one boundary.
admit_year_edges = [-np.inf, *range(2111, 2202, 9), np.inf]
admit_year_labels = list(range(2001, 2013))

# right=True makes every bin (lower, upper], matching the original
# "> lower and <= upper" comparisons. The result is stored as float64, the same
# dtype the column had before, so the later integer conversion still applies.
# A missing admittime becomes NaN here rather than a silent year 0.
admissions2['admit_year'] = pd.cut(
    admissions2['admittime'].dt.year,
    bins=admit_year_edges,
    labels=admit_year_labels,
    right=True,
).astype('float64')

In [None]:
# store the bucketed admission year as a whole number instead of a float
admit_year_as_int = admissions2['admit_year'].astype(int)
admissions2['admit_year'] = admit_year_as_int
admissions2.head(n=5)

In [None]:
# pull the month and day-of-month out of the original admission timestamp
admit_timestamps = admissions2['admittime']
admissions2['admit_month'] = admit_timestamps.dt.month
admissions2['admit_day'] = admit_timestamps.dt.day

# Feb 29 only exists in leap years, so every leap-day admission is assigned
# to 2004 to keep the rebuilt date valid
is_admit_leap_day = (admissions2['admit_month'] == 2) & (admissions2['admit_day'] == 29)
admissions2.loc[is_admit_leap_day, 'admit_year'] = 2004

In [None]:
# turn the three date parts into strings so they can be concatenated
for admit_part in ('admit_year', 'admit_month', 'admit_day'):
    admissions2[admit_part] = admissions2[admit_part].astype('str')

# build a "year-month-day" string for every row
admissions2['admit_new'] = admissions2['admit_year'].str.cat(
    [admissions2['admit_month'], admissions2['admit_day']], sep='-'
)

# parse the combined string into a proper datetime column
admissions2['admit_new'] = pd.to_datetime(admissions2['admit_new'])

# preview the result
admissions2.head()

In [None]:
#remove all new variables except converted date
# Drop the helper columns by name rather than by position: positional indices
# silently remove the wrong columns if the column order changes or if this
# cell is run a second time. errors='ignore' makes a re-run a harmless no-op.
cols = ['admit_year', 'admit_month', 'admit_day']
admissions2.drop(columns=cols, inplace=True, errors='ignore')

#print dataframe info
admissions2.info()

In [None]:
#repeating above process for discharge date
# Each window of shifted discharge years maps to one calendar year:
#   <=2111 -> 2001, 2112-2120 -> 2002, ..., 2193-2201 -> 2011, >2201 -> 2012
# The windows are expressed once as bin edges instead of twelve hand-written
# range masks, which removes the risk of a typo in one boundary.
disch_year_edges = [-np.inf, *range(2111, 2202, 9), np.inf]
disch_year_labels = list(range(2001, 2013))

# right=True makes every bin (lower, upper], matching the original
# "> lower and <= upper" comparisons. The result is stored as float64, the same
# dtype the column had before, so the later integer conversion still applies.
# A missing dischtime becomes NaN here rather than a silent year 0.
admissions2['disch_year'] = pd.cut(
    admissions2['dischtime'].dt.year,
    bins=disch_year_edges,
    labels=disch_year_labels,
    right=True,
).astype('float64')

In [None]:
# store the bucketed discharge year as a whole number instead of a float
disch_year_as_int = admissions2['disch_year'].astype(int)
admissions2['disch_year'] = disch_year_as_int
admissions2.head(n=5)

In [None]:
# pull the month and day-of-month out of the original discharge timestamp
disch_timestamps = admissions2['dischtime']
admissions2['disch_month'] = disch_timestamps.dt.month
admissions2['disch_day'] = disch_timestamps.dt.day

# Feb 29 only exists in leap years, so every leap-day discharge is assigned
# to 2004 to keep the rebuilt date valid
is_disch_leap_day = (admissions2['disch_month'] == 2) & (admissions2['disch_day'] == 29)
admissions2.loc[is_disch_leap_day, 'disch_year'] = 2004

In [None]:
# turn the three date parts into strings so they can be concatenated
for disch_part in ('disch_year', 'disch_month', 'disch_day'):
    admissions2[disch_part] = admissions2[disch_part].astype('str')

# build a "year-month-day" string for every row
admissions2['disch_new'] = admissions2['disch_year'].str.cat(
    [admissions2['disch_month'], admissions2['disch_day']], sep='-'
)

# parse the combined string into a proper datetime column
admissions2['disch_new'] = pd.to_datetime(admissions2['disch_new'])

# preview the result
admissions2.head()

In [None]:
# Remove the discharge helper columns, keeping only the converted date.
# Drop by name rather than by position: positional indices silently remove the
# wrong columns if the column order changes or if this cell is run a second
# time. errors='ignore' makes a re-run a harmless no-op.
cols = ['disch_year', 'disch_month', 'disch_day']
admissions2.drop(columns=cols, inplace=True, errors='ignore')

admissions2.info()

### Merge imported datasets

In [None]:
# attach the dictionary titles to every recorded diagnosis code;
# the inner join drops codes that are missing from the dictionary
merge_diagnoses = diagnoses_icd.merge(
    d_diagnoses_icd,
    how='inner',
    left_on='icd9_code',
    right_on='icd9_code',
)

In [None]:
# attach the dictionary titles to every recorded procedure code;
# the inner join drops codes that are missing from the dictionary
merge_procedures = procedures_icd.merge(
    d_procedures_icd,
    how='inner',
    left_on='icd9_code',
    right_on='icd9_code',
)

In [None]:
# stack the labelled diagnoses and procedures into a single frame
# (row indices from the two inputs are kept as-is)
merge_diag_proc = pd.concat(objs=[merge_diagnoses, merge_procedures])

In [None]:
# Step 1: keep only the coded events whose ICD-9 code is a salmonella trigger code
salm_code_hits = merge_diag_proc.merge(
    salmonellaICD,
    how='inner',
    left_on='icd9_code',
    right_on='Code',
)

# Step 2: reduce to the visit identifiers plus the matched trigger code
salm_visit_keys = salm_code_hits[['subject_id', 'hadm_id', 'Code', 'Descriptor']]

# Step 3: bring back every other code recorded during those same visits
visit_key_cols = ['subject_id', 'hadm_id']
merge_diag_proc_salm = salm_visit_keys.merge(
    merge_diag_proc,
    how='left',
    left_on=visit_key_cols,
    right_on=visit_key_cols,
)
#merge_diag_proc_salm #should be 1297 unique

In [None]:
# add the admission details (with the reprocessed dates) to each salmonella visit row
merge_salm_admit = merge_diag_proc_salm.merge(
    admissions2,
    how='left',
    left_on=['subject_id', 'hadm_id'],
    right_on=['subject_id', 'hadm_id'],
)

In [None]:
# add the DRG codes recorded for the same patient and admission
merge_salm_admit_drg = merge_salm_admit.merge(
    drgcodes,
    how='left',
    left_on=['subject_id', 'hadm_id'],
    right_on=['subject_id', 'hadm_id'],
)

In [None]:
# add the prescriptions recorded for the same patient and admission
merge_all_salmonella = merge_salm_admit_drg.merge(
    prescriptions,
    how='left',
    left_on=['subject_id', 'hadm_id'],
    right_on=['subject_id', 'hadm_id'],
)

In [None]:
# write the fully merged salmonella dataset to disk as CSV
merge_all_salmonella.to_csv(
    path_or_buf="C:/Users/Maggie/OneDrive/UW-BHI/2018Fall/CSE583/Project/mimic_salmonella.csv"
)