# Combining CZ and Non-CZ data into one file




In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from datetime import date
import datetime
import numpy as np
import seaborn as sns
import re
import logging
import sys
sys.path.append('..')
from utilities import logger
from utilities import configuration



In [2]:
config = configuration.get_config()
for key in config:
    print(f'{key:40} (type: {type(config[key])})')

system_log                               (type: <class 'str'>)
unified_merged_file_cz                   (type: <class 'str'>)
unified_merged_file_noncz                (type: <class 'str'>)
unified_merged_file                      (type: <class 'str'>)
data_path                                (type: <class 'str'>)
cz_files                                 (type: <class 'list'>)
noncz_files                              (type: <class 'list'>)


In [3]:
logger.init_logger(config['system_log'])
logging.debug('logger has started ...')

2023-09-15 11:13:45,794 - root - DEBUG - logger has started ...


In [50]:
cz_df = pd.read_csv(config['unified_merged_file_cz'])
cz_df['is ALC Patient'] = cz_df['ALCDays'] > 0
cz_df['Admit Date:'] = [date.fromisoformat(date_.replace('/','-')) for date_ in cz_df['Admit Date:']]
cz_df['Disch Date:'] = [date.fromisoformat(date_.replace('/','-')) for date_ in cz_df['Disch Date:']]
cz_df['Disch Date (year-month):'] = [str(date_)[:7] for date_ in cz_df['Disch Date:']]
cz_df['Total Days in Hospital'] = [1 if (discharge-admit).days==0 else (discharge-admit).days  
                                   for admit,discharge in zip(cz_df['Admit Date:'], cz_df['Disch Date:'])]
cz_df[['Admit Date:', 'Disch Date:', 'Patient Age:', 'ALCDays','Disch Date (year-month):']]

cz_df['CZ status']=['cz']*cz_df.shape[0]

logging.debug(f"All entries for dataset 'CZ' - found:    {cz_df.shape[0]:9,} entries")


noncz_df = pd.read_csv(config['unified_merged_file_noncz'])
noncz_df['is ALC Patient'] = noncz_df['ALCDays'] > 0
noncz_df['Admit Date:'] = [None if date_=='**' else datetime.datetime.strptime(date_, "%Y-%m-%d") for date_ in noncz_df['Admit Date:']]
noncz_df['Disch Date:'] = [datetime.datetime.strptime(date_, "%Y-%m-%d")  for date_ in noncz_df['Disch Date:']]
noncz_df['Disch Date (year-month):'] = [str(date_)[:7] for date_ in noncz_df['Disch Date:']]
noncz_df['Total Days in Hospital'] = [1 if (discharge-admit).days==0 else (discharge-admit).days  
                                   for admit,discharge in zip(noncz_df['Admit Date:'], noncz_df['Disch Date:'])]
noncz_df[['Admit Date:', 'Disch Date:', 'Patient Age:', 'ALCDays','Disch Date (year-month):']]

logging.debug(f"All entries for dataset 'Non-CZ' - found: {noncz_df.shape[0]:9,} entries")

noncz_df['CZ status']=['Non-cz']*noncz_df.shape[0]

noncz_df = noncz_df.rename(columns={'Inst Type 2018':'Institution Type', 'Nursing Unit:': 'Discharge Nurse Unit'})


  cz_df = pd.read_csv(config['unified_merged_file_cz'])


2023-09-15 10:28:46,400 - root - DEBUG - All entries for dataset 'CZ' - found:      256,006 entries


  noncz_df = pd.read_csv(config['unified_merged_file_noncz'])


2023-09-15 10:28:53,270 - root - DEBUG - All entries for dataset 'Non-CZ' - found:   362,691 entries


In [57]:
full_df = pd.concat([cz_df, noncz_df])
full_df.to_csv(config['unified_merged_file'], index=False)

logging.debug(f"Combining Non-cz and cz data into a single file: {config['unified_merged_file']}")

2023-09-15 10:36:40,306 - root - DEBUG - Combining Non-cz and cz data into a single file: /Users/marianomaisonnave/Documents/CBU Postdoc/Grant Data/Merged/2015_2022/full_database.csv
