In [1]:
import numpy as np
import pandas as pd
# Show complete frames when inspecting: no row/column truncation and no
# wrapping or clipping of wide cells.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None (not -1) means "do not truncate cell contents": negative values were
# deprecated in pandas 1.0 and are rejected by newer releases.
# Requires pandas >= 1.0.
pd.set_option('display.max_colwidth', None)


In [2]:
# Load the facility-level table (297 homes) from the prepared CSV.
#
# Earlier versions of this cell read the public-release workbook directly
# (HCC_CODE, the first column, was skipped to stay at <= 100 columns so that
# df.info() would list them):
#   pd.read_excel('../data/BC/QFD2019-public release-20191009.xlsx',
#                 sheet_name='QFD 2019', usecols=range(1, 101))
ltc = pd.read_csv(filepath_or_buffer='../data/BC/ngan_final_bcltc.csv')

# Quick structural summary: row count, column count, dtypes.
ltc.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 297 entries, 0 to 296
Columns: 105 entries, FACILITY_NAME to outbreak
dtypes: float64(22), int64(6), object(77)
memory usage: 243.7+ KB


In [3]:
# Drop the rows that describe special care units.
# Three facilities have such units listed in the Directory with separate data;
# they are not normally counted as facilities in their own right.
is_special_unit = ltc['FACILITY_NAME'].str.contains("Special Unit", na=False)
ltc = ltc.loc[~is_special_unit]



In [4]:
# Remove rows with fewer than 7 beds.
# Data has been suppressed for facilities with 5 beds or less:
#   Bella Coola General Hospital (VCHA)
#   Mackenzie & District Hospital and Health Centre (NHA)
#   Northern Haida Gwaii Hospital and Health Centre (NHA)
#   R.W. Large Memorial Hospital (6 beds but suppressed at VCHA's request)
# The comparison is inclusive (>= 7): the previous `> 7` also discarded
# facilities with exactly 7 beds, which the rule above does not exclude.
ltc = ltc.loc[ltc['BEDS_TOTAL'] >= 7]



In [5]:
# Exclude Cariboo Place: it opened on April 1, 2019, so it is listed for
# searchability only and has no indicator data for 2018/19.
name_has_cariboo = ltc['FACILITY_NAME'].str.contains("Cariboo", na=False)
ltc = ltc.loc[~name_has_cariboo]

In [6]:
# Exclude Northern Health facilities: complaints data for Hospital Act
# facilities in that health authority are not available.
in_northern_health = ltc['HLTH_AUTH'].str.contains("Northern", na=False)
ltc = ltc.loc[~in_northern_health]

In [7]:
# Exclude Island Health facilities: incident data for Hospital Act facilities
# in Vancouver Island Health are not available.
in_island_health = ltc['HLTH_AUTH'].str.contains("Island", na=False)
ltc = ltc.loc[~in_island_health]

In [8]:
# Summarise the frame after the row filters above (row count, dtypes, memory).
ltc.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 3 to 296
Columns: 105 entries, FACILITY_NAME to outbreak
dtypes: float64(22), int64(6), object(77)
memory usage: 172.2+ KB


In [9]:
# # Proportion of values missing for each column
# def missing(dff):
#     print (round((dff.isnull().sum() * 100/ len(dff)),2).sort_values(ascending=False))

# missing(ltc)

# # Column variables with > 10% of values missing
# ltc.columns[ltc.isnull().mean() > 0.1]

In [10]:
# x = ltc[ltc['CMI'].isnull()]
# x
# len(x)

# ltc['CMI'].value_counts()
# ltc.loc[ltc['CMI'] == 'suppressed']

In [11]:
# Columns to drop before modelling. Each name appears exactly once
# ('WEBPAGE' and 'SURVEY_URL' were previously listed twice).
# 'COUNCIL' and 'CMI' are deliberately NOT dropped: both are used further down.
listofcols = [
    # Not relevant
    'STREET_ADDRESS',
    'CITY',
    'POSTAL',
    'PHONE',
    'OPERATOR_NAME',
    'MEETINGS_HELD',
    'PARKING_COSTS',
    'ACCRED_EXPIRY',
    'COMPLAINT_CONTACT',
    'COMPLAINT_PHONE',
    'SPENDING_ACCOUNT',
    'FOOD_SERVICE',
    'FOOD_PREP',
    'FOOD_COST_LASTYR',
    'FOOD_COST_CURRENTYR',
    'LANGUAGES',
    'WEBPAGE',
    'SURVEY_URL',
    'FEES_CABLE',
    'FEES_TELE',
    'FEES_INTERNET',
    'FEES_OTHER',
    'MISSING_SURVEY',
    'PER_DIEM_LASTYR',
    'PER_DIEM_CURRENTYR',
    'CONTRACTED_HOUSEKEEPING',  # > 10% missing
    'CONTRACTED_PLANT',  # > 10% missing

    # Previous-year direct care hours (the *_CURRENTYR columns are kept)
    'DCH_NURSE_LASTYR',
    'DCH_ALLIED_LASTYR',
    'DCH_TOTAL_LASTYR',

    # Demographic/resident case descriptors may affect fatalities but should
    # not affect outbreak status
    'AGE',
    'FEMALE',
    'AGE_85_PLUS',
    'AGE_UNDER_65',
    'STAY_LENGTH',
    'DEPRESSION',
    'ADL_DEPENDENT',
    'CPS_SEVERE',
    'DEMENTIA',
    'ABS_PHYS_ABUSIVE',
    'ISE',
    'ISE_LOW',
    'THERAPY_PT',
    'THERAPY_RT',
    'THERAPY_OT',
    'MEDS_DEPRESSION',
    'MEDS_ANTIPSYCHOTICS',
    'RESTRAINTS',

    # Excess of missing data
    'ADVERSE_EVENT',  # 275 listed as not applicable
    'CONTRACTED_NURSING',
    'CONTRACTED_CARE_AIDES',
]

# drop() returns a new frame, so `ltc` keeps every original column.
ltc2 = ltc.drop(columns=listofcols)


In [12]:
# Cast CMI to a numeric dtype (raises if a value cannot be parsed as a number).
cmi_numeric = ltc2['CMI'].pipe(pd.to_numeric)
ltc2['CMI'] = cmi_numeric

In [13]:
# Convert OPEN_DATE to a numeric year.
# One entry lists two years ('2009 / 2016'); it is mapped to "2009" before the
# conversion. The result is assigned back to the column rather than calling
# replace(..., inplace=True) on the column selection: that form is chained
# assignment and does not reliably update the frame under pandas Copy-on-Write.
ltc2['OPEN_DATE'] = pd.to_numeric(
    ltc2['OPEN_DATE'].replace({'2009 / 2016': "2009"})
)

# Create a new column from the opening year (surrogate for design standards).
# NOTE(review): both comparisons are strict, so a facility opened in exactly
# 1994 receives no label and stays NaN, like a facility with a missing
# OPEN_DATE. Confirm which side 1994 belongs to and make one bound inclusive.
ltc2.loc[ltc2['OPEN_DATE'] > 1994, 'facility_design'] = 'post1994'
ltc2.loc[ltc2['OPEN_DATE'] < 1994, 'facility_design'] = 'pre1994'

In [14]:
# Boolean indicators for whether the COUNCIL text mentions a resident council
# and/or a family council. Plain substring match; missing COUNCIL -> False.
ltc2 = ltc2.assign(
    RESIDENT_COUNCIL=ltc2['COUNCIL'].str.contains('Resident', regex=False, na=False),
    FAMILY_COUNCIL=ltc2['COUNCIL'].str.contains('Family', regex=False, na=False),
)

In [15]:
# Share of each facility's beds that are private vs public
# (each count divided by BEDS_TOTAL).
beds_total = ltc2['BEDS_TOTAL']
ltc2['BEDS_PRIVATEprop'] = ltc2['BEDS_PRIVATE'].div(beds_total)
ltc2['BEDS_PUBLICprop'] = ltc2['BEDS_PUBLIC'].div(beds_total)

In [16]:
# Total room count per facility = private + semi-private + multi-bed rooms.
ltc2['ROOMS_TOTAL'] = (
    ltc2['ROOMS_PRIVATE']
    .add(ltc2['ROOMS_SEMI'])
    .add(ltc2['ROOMS_MULTI'])
)

# Share of rooms of each type, relative to that total.
rooms_total = ltc2['ROOMS_TOTAL']
ltc2['ROOMS_PRIVATEprop'] = ltc2['ROOMS_PRIVATE'].div(rooms_total)
ltc2['ROOMS_SEMIprop'] = ltc2['ROOMS_SEMI'].div(rooms_total)
ltc2['ROOMS_MULTIprop'] = ltc2['ROOMS_MULTI'].div(rooms_total)


In [17]:
# Structural summary of the engineered frame (printed by info() itself).
ltc2.info()
# Only the last expression of a cell is rendered, so the bare `ltc2.head()`
# that used to sit here produced no output and has been removed. To preview
# rows, put `ltc2.head()` last in its own cell.
ltc2.columns.values

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 3 to 296
Data columns (total 63 columns):
FACILITY_NAME              208 non-null object
HLTH_AUTH                  208 non-null object
OWNERSHIP                  208 non-null object
OPEN_DATE                  199 non-null float64
COUNCIL                    204 non-null object
REGULATION                 208 non-null object
ACCRED_STATUS              208 non-null object
DCH_NURSE_CURRENTYR        208 non-null float64
DCH_ALLIED_CURRENTYR       208 non-null float64
DCH_TOTAL_CURRENTYR        208 non-null float64
BEDS_PRIVATE               208 non-null int64
BEDS_PUBLIC                208 non-null int64
BEDS_TOTAL                 208 non-null int64
ROOMS_PRIVATE              208 non-null int64
ROOMS_SEMI                 208 non-null int64
ROOMS_MULTI                208 non-null int64
COMPLAINTS                 208 non-null object
SUB_COMPLAINTS             208 non-null object
INCIDENT_OUTBREAK          208 non-null object
INCI

array(['FACILITY_NAME', 'HLTH_AUTH', 'OWNERSHIP', 'OPEN_DATE', 'COUNCIL',
       'REGULATION', 'ACCRED_STATUS', 'DCH_NURSE_CURRENTYR',
       'DCH_ALLIED_CURRENTYR', 'DCH_TOTAL_CURRENTYR', 'BEDS_PRIVATE',
       'BEDS_PUBLIC', 'BEDS_TOTAL', 'ROOMS_PRIVATE', 'ROOMS_SEMI',
       'ROOMS_MULTI', 'COMPLAINTS', 'SUB_COMPLAINTS', 'INCIDENT_OUTBREAK',
       'INCIDENT_ABUSE', 'INCIDENT_FALL', 'INCIDENT_POISON',
       'INCIDENT_MEDICATION', 'INCIDENT_WANDERING', 'INCIDENT_INJURY',
       'INCIDENT_AGGRESSION', 'INCIDENT_OUTBREAK_100',
       'INCIDENT_ABUSE_100', 'INCIDENT_FALL_100', 'INCIDENT_POISON_100',
       'INCIDENT_MEDICATION_100', 'INCIDENT_WANDERING_100',
       'INCIDENT_INJURY_100', 'INCIDENT_AGGRESSION_100', 'CMI',
       'INSPECTIONS', 'INFRACTIONS', 'INFRACTIONS_LICENSING',
       'INFRACTIONS_FACILITY', 'INFRACTIONS_STAFFING',
       'INFRACTIONS_POLICY', 'INFRACTIONS_CARE', 'INFRACTIONS_DISEASE',
       'INFRACTIONS_FOOD', 'INFRACTIONS_MEDICATION',
       'INFRACTIONS_PROGRAM

In [21]:
# HLTH_AUTH, OPEN_DATE and COUNCIL are removed here; drop() returns a new
# frame, so ltc2 is left unchanged.
listofcols = ['HLTH_AUTH', 'OPEN_DATE', 'COUNCIL']
ltc3 = ltc2.drop(columns=listofcols)

In [22]:
# For homes without facility design data, impute "unknown"; missing values in
# facility_name, num_cases and num_deaths are filled with the placeholder "na".
# Each result is assigned back to its column: fillna(..., inplace=True) on a
# column selection is chained assignment and does not reliably update the
# frame under pandas Copy-on-Write.
# NOTE(review): if num_cases / num_deaths are numeric, filling them with the
# string "na" turns them into object columns; confirm this is intended.
ltc3["facility_design"] = ltc3["facility_design"].fillna("unknown")
ltc3["facility_name"] = ltc3["facility_name"].fillna("na")
ltc3["num_cases"] = ltc3["num_cases"].fillna("na")
ltc3["num_deaths"] = ltc3["num_deaths"].fillna("na")

In [23]:
# Summarise ltc3 to check the non-null count of each remaining column.
ltc3.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 208 entries, 3 to 296
Data columns (total 60 columns):
FACILITY_NAME              208 non-null object
OWNERSHIP                  208 non-null object
REGULATION                 208 non-null object
ACCRED_STATUS              208 non-null object
DCH_NURSE_CURRENTYR        208 non-null float64
DCH_ALLIED_CURRENTYR       208 non-null float64
DCH_TOTAL_CURRENTYR        208 non-null float64
BEDS_PRIVATE               208 non-null int64
BEDS_PUBLIC                208 non-null int64
BEDS_TOTAL                 208 non-null int64
ROOMS_PRIVATE              208 non-null int64
ROOMS_SEMI                 208 non-null int64
ROOMS_MULTI                208 non-null int64
COMPLAINTS                 208 non-null object
SUB_COMPLAINTS             208 non-null object
INCIDENT_OUTBREAK          208 non-null object
INCIDENT_ABUSE             208 non-null object
INCIDENT_FALL              208 non-null object
INCIDENT_POISON            208 non-null object
INCID

In [24]:
# Keep complete cases only: any row with a missing value in any column is
# removed.
ltc4 = ltc3.dropna(axis=0, how='any')
ltc4.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 202 entries, 3 to 296
Data columns (total 60 columns):
FACILITY_NAME              202 non-null object
OWNERSHIP                  202 non-null object
REGULATION                 202 non-null object
ACCRED_STATUS              202 non-null object
DCH_NURSE_CURRENTYR        202 non-null float64
DCH_ALLIED_CURRENTYR       202 non-null float64
DCH_TOTAL_CURRENTYR        202 non-null float64
BEDS_PRIVATE               202 non-null int64
BEDS_PUBLIC                202 non-null int64
BEDS_TOTAL                 202 non-null int64
ROOMS_PRIVATE              202 non-null int64
ROOMS_SEMI                 202 non-null int64
ROOMS_MULTI                202 non-null int64
COMPLAINTS                 202 non-null object
SUB_COMPLAINTS             202 non-null object
INCIDENT_OUTBREAK          202 non-null object
INCIDENT_ABUSE             202 non-null object
INCIDENT_FALL              202 non-null object
INCIDENT_POISON            202 non-null object
INCID

In [25]:
# Write the analysis-ready table to disk, without the index column.
ltc4.to_csv(
    path_or_buf=r'../data/BC/ngan_bcLTC_foranalysis.csv',
    index=False,
)

In [None]:
# Facilities per design era (rows with a missing label are not counted).
ltc2.loc[:, 'facility_design'].value_counts()

# x = [i for i in ltc['OPEN_DATE'] if i>1994]
# len(x)

In [None]:
# Distribution of ADVERSE_EVENT values in the filtered (pre-drop) frame.
# Note: the resident / family council indicator columns are derived from
# COUNCIL earlier in this notebook (RESIDENT_COUNCIL, FAMILY_COUNCIL).
ltc.loc[:, 'ADVERSE_EVENT'].value_counts()
