In [10]:
import pandas as pd
import numpy as np
import glob

#### Start with cleaning ESSENCE

In [11]:
inputfiles = glob.glob('../Data/health/ESSENCE/cleaned*') #read in all essence files that were cleaned previously
df_from_each_file = (pd.read_csv(f) for f in inputfiles)
essence = pd.concat(df_from_each_file, sort = False) # concat into one big file
essence['Date'] = pd.to_datetime(essence['Date'], utc = True) # convert date to datetime
essence['Zip'] = essence['Patient_Zip'].str[:5] # for ZIP codes, only take first 5 values in the string (some are longer)
essence['Zip'] = pd.to_numeric(essence['Zip'], errors='coerce').fillna(0).astype(np.int64) #convert all the ZIPs to numbers, and if they won't convert then make them a zero
essence = essence[essence['Zip'] != 0] # exclude all ZIPs that we couldn't convert, so we made them zeros in the step above
essence = essence.dropna(subset = 'Zip') # also drop any columns that didn't have a ZIP associated because we can't use this in the analysis

In [12]:
smoke = pd.read_csv('../Data/smoke/KateZip/AllZipSmoke_Total_os.csv') #read in the smoke data so we can get a list of the 2010 US Census ZCTA ZIPs
listy = smoke['Zip'].unique() # make a list of all unique ZIPs
print('Percent of ESSENCE data with Zips that do not match smoke exposure: ', round(len(essence[~essence['Zip'].isin(listy)])/len(essence)*100, 2), '%')

Percent of ESSENCE data with Zips that do not match smoke exposure:  1.75 %


In [13]:
newessence = essence[essence['Zip'].isin(listy)] #only take zipcodes in the ED data that will match with the health data
newessence['New_Patient_ID'] = range(1, len(newessence)+1) # Make a new unique Patient ID for each row that will be used in the CC data
newessence['New_Patient_ID']  = newessence['New_Patient_ID'] .astype(str)
newessence['New_Patient_ID'] = 'OS_' + newessence['New_Patient_ID'] # Just add to the Patient ID to make sure it is clear that "OS" made it, and it's not a legit Patient ID
newessence.to_csv('../Data/health/ESSENCE/ALLCLEANED_ESSENCE.csv')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newessence['New_Patient_ID'] = range(1, len(newessence)+1) # Make a new unique Patient ID for each row that will be used in the CC data
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newessence['New_Patient_ID']  = newessence['New_Patient_ID'] .astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-co

#### Next, we need to finish cleaning the ED data

In [14]:
ED_original = pd.read_csv("C:/Users/olivia.sablan/OneDrive - State of New Mexico/Documents/Data/health/ED/ED_data_multipleremoved.csv") #read in the previously cleaned ED data
ED = ED_original.copy(deep = True) # Make a copy to be compared to the final cleaned data
ED = ED.rename(columns = {'ZIP': 'Zip', 'date_of_visit':'Date'})
ED['Zip'] = ED['Zip'].str[:5] # for ZIP codes, only take first 5 values in the string (some are longe
ED['Zip'] = pd.to_numeric(ED['Zip'], errors='coerce').fillna(0).astype(np.int64) #convert all the ZIPs to numbers, and if they won't convert then make them a zero
ED = ED[ED['Zip'] != 0]  # exclude all ZIPs that we couldn't convert, so we made them zeros in the step above
ED = ED.dropna(subset = ['Zip']) # also drop any columns that didn't have a ZIP associated because we can't use this in the analysis
ED['Zip'] = ED['Zip'].astype('str') # Convert back to string for next step
# We only want to keep records with ZIPs in NM, which all start with eith 88 or 87 or the other four that border the state
ED = ED.loc[(ED['Zip'].str.startswith('88', na=False)) | (ED['Zip'].str.startswith('87', na=False)) | (ED['Zip'] == '79837') | (ED['Zip'] == '81137') | (ED['Zip'] == '86504') | (ED['Zip'] == '86515')]
# Again, make sure the ZIP is the right length
ED = ED[ED['Zip'].str.len() == 5]
ED['Zip'] = ED['Zip'].astype('int') # Need ZIP as an int to compare to the smoke PM2.5 data

  ED_original = pd.read_csv("C:/Users/olivia.sablan/OneDrive - State of New Mexico/Documents/Data/health/ED/ED_data_multipleremoved.csv") #read in the previously cleaned ED data


In [15]:
print('Data removed bc error in zipcode: ', round(100 - (len(ED) / len(ED_original) * 100), 2))

Data removed bc error in zipcode:  1.15


In [16]:
print('Percent of ED data with Zips that do not match smoke exposure: ', round(len(ED[~ED['Zip'].isin(listy)])/len(ED)*100, 2), '%')

Percent of ED data with Zips that do not match smoke exposure:  1.98 %


In [17]:
ED = ED[ED['Zip'].isin(listy)] #only take zipcodes in the ED data that will match with the health data

In [18]:
# Need to create a ED file that can be easily fed into the "preparehealthforCC.R" code
# We have to make new rows corresponding to diagnoses, with a 1 or a 0
# Each row will have a 1 for the primary diagnoses and a 0 for the others (although if the primary diagnosis fits in allresp1 or allcardio1 then it may have two 1's)
ED['allresp1'] = np.where(ED['All_respiratory'] == 1, 1, 0)
ED['asthma1'] = np.where(ED['Asthma'] == 1, 1, 0)
ED['COPD1'] = np.where(ED['COPD'] == 1, 1, 0)
ED['pneumonia1'] = np.where(ED['Pneumonia'] == 1, 1, 0)
ED['bronchitis1'] = np.where(ED['Bronchitis'] == 1, 1, 0)
ED['allcardio1'] = np.where(ED['All_cardiovascular'] == 1, 1, 0)
ED['cardiacarrest1'] = np.where(ED['Cardiac_arrest'] == 1, 1, 0)
ED['arrythmia1'] = np.where(ED['Arrythmia'] == 1, 1, 0)
ED['heartfail1'] = np.where(ED['Heart_failure'] == 1, 1, 0)
ED['ischemic1'] = np.where(ED['Ischemic'] == 1, 1, 0)
ED['MI1'] = np.where(ED['MI'] == 1, 1, 0)
ED['cerebrovascular1'] = np.where(ED['Cerebrovascular'] == 1, 1, 0)

# Drop the previous columns of diagnoses
ED = ED.drop(columns = ['All_respiratory', 'Asthma', 'COPD', 'Pneumonia', 'Bronchitis', 'All_cardiovascular', 'Cardiac_arrest', 'Arrythmia', 'Heart_failure', 'MI', 'Cerebrovascular'])
# Create a column to sum all the diagnoses
ED['colSUM'] = ED["allresp1"] + ED["asthma1"] + ED["COPD1"] + ED["pneumonia1"] + ED["bronchitis1"] + ED["allcardio1"] + ED["cardiacarrest1"] + ED["arrythmia1"] + ED["heartfail1"] + ED["ischemic1"] + ED["MI1"] + ED["cerebrovascular1"]
# Only keep rows were the sum of all diagnoses is not zero
# The original data had secondary diagnoses so we need to the drop that rows that were secondary only 
ED = ED[ED['colSUM'] != 0]

# Create a unique patient ID for each row, and make it clear that this is not a reported ID by adding "OS"
ED['New_Patient_ID'] = range(1, len(ED)+1)
ED['New_Patient_ID']  = ED['New_Patient_ID'] .astype(str)
ED['New_Patient_ID'] = 'OS_' + ED['New_Patient_ID']

In [19]:
ED.to_csv('../Data/health/ED/ED_data.csv')

### Get stats for Table 1
#### First doing ED data

In [20]:
outcomes = ["allresp1","asthma1","COPD1","pneumonia1","bronchitis1","allcardio1",
         "cardiacarrest1","arrythmia1","heartfail1","ischemic1","MI1",
         "cerebrovascular1"]

print('TOTAL CASES (total:', len(ED), ')\n \n')
for i in range(len(outcomes)):
    oneOutcome = ED[ED[outcomes[i]] == 1]
    print(outcomes[i], ': n = ', len(oneOutcome))

TOTAL CASES (total: 733078 )
 

allresp1 : n =  508532
asthma1 : n =  51297
COPD1 : n =  35696
pneumonia1 : n =  76282
bronchitis1 : n =  89688
allcardio1 : n =  217270
cardiacarrest1 : n =  7959
arrythmia1 : n =  41345
heartfail1 : n =  17342
ischemic1 : n =  49592
MI1 : n =  41606
cerebrovascular1 : n =  42624


In [21]:
print('PERCENTAGES IN AGE CATEGORIES (totals:', len(ED[ED['age'] < 15]), len(ED[(ED['age'] >= 15) & (ED['age'] < 65)]), len(ED[ED['age'] >= 65]), ')\n \n')

for i in range(len(outcomes)):
    oneOutcome = ED[ED[outcomes[i]] == 1]
    young = len(oneOutcome[oneOutcome['age'] < 15])
    mid = len(oneOutcome[(oneOutcome['age'] >= 15) & (oneOutcome['age'] < 65)])
    older = len(oneOutcome[oneOutcome['age'] >= 65])
    total = len(oneOutcome)
    print(outcomes[i], '- ',str(round(young/total*100,1)), '%', str(round(mid/total*100, 1)), '%', str(round(older/total*100, 1)), '%')

PERCENTAGES IN AGE CATEGORIES (totals: 158668 342822 231505 )
 

allresp1 -  31.0 % 49.8 % 19.2 %
asthma1 -  28.9 % 62.5 % 8.6 %
COPD1 -  0.3 % 37.6 % 62.1 %
pneumonia1 -  14.9 % 43.4 % 41.7 %
bronchitis1 -  40.9 % 45.8 % 13.3 %
allcardio1 -  0.4 % 40.2 % 59.3 %
cardiacarrest1 -  3.6 % 45.0 % 50.5 %
arrythmia1 -  0.7 % 35.8 % 63.4 %
heartfail1 -  0.1 % 31.5 % 68.4 %
ischemic1 -  0.1 % 41.0 % 59.0 %
MI1 -  0.0 % 40.2 % 59.8 %
cerebrovascular1 -  0.2 % 32.5 % 67.2 %


In [22]:
print('PERCENTAGES IN SEX CATEGORIES (totals:', len(ED[ED['sex'] == 1]), len(ED[ED['sex'] == 2]), ')\n \n')
for i in range(len(outcomes)):
    oneOutcome = ED[ED[outcomes[i]] == 1]
    male = len(oneOutcome[oneOutcome['sex'] == 1])
    female = len(oneOutcome[oneOutcome['sex'] == 2])
    total = len(oneOutcome)
    print(outcomes[i], '- ',str(round(male/total*100,1)), '%', str(round(female/total*100, 1)), '%')

PERCENTAGES IN SEX CATEGORIES (totals: 349839 383168 )
 

allresp1 -  45.7 % 54.3 %
asthma1 -  43.2 % 56.8 %
COPD1 -  46.8 % 53.2 %
pneumonia1 -  50.4 % 49.6 %
bronchitis1 -  45.7 % 54.3 %
allcardio1 -  52.5 % 47.5 %
cardiacarrest1 -  61.6 % 38.4 %
arrythmia1 -  50.8 % 49.1 %
heartfail1 -  52.4 % 47.6 %
ischemic1 -  61.5 % 38.5 %
MI1 -  61.8 % 38.2 %
cerebrovascular1 -  48.8 % 51.2 %


### Next, work on ESSENCE

In [23]:
outcomes = ['BroadResp', 'Asthma', 'AQResp', 'Cardio']

print('TOTAL CASES \n \n')
for i in range(len(outcomes)):
    oneOutcome = newessence[newessence[outcomes[i]] == 1]
    print(outcomes[i], ': n = ', len(oneOutcome))

TOTAL CASES 
 

BroadResp : n =  598409
Asthma : n =  22750
AQResp : n =  326635
Cardio : n =  449997


In [24]:
print('PERCENTAGES IN AGE CATEGORIES \n \n')
for i in range(len(outcomes)):
    oneOutcome = newessence[newessence[outcomes[i]] == 1]
    young = len(oneOutcome[oneOutcome['Age'] < 15])
    mid = len(oneOutcome[(oneOutcome['Age'] >= 15) & (oneOutcome['Age'] < 65)])
    older = len(oneOutcome[oneOutcome['Age'] >= 65])
    total = len(oneOutcome)
    print(outcomes[i], '- ',str(round(young/total*100,1)), '%', str(round(mid/total*100, 1)), '%', str(round(older/total*100, 1)), '%')

PERCENTAGES IN AGE CATEGORIES 
 

BroadResp -  28.8 % 52.8 % 18.4 %
Asthma -  23.8 % 65.1 % 11.2 %
AQResp -  11.3 % 56.8 % 31.8 %
Cardio -  0.8 % 47.5 % 51.7 %


In [25]:
print('PERCENTAGES IN SEX CATEGORIES \n \n')
for i in range(len(outcomes)):
    oneOutcome = newessence[newessence[outcomes[i]] == 1]
    male = len(oneOutcome[oneOutcome['Sex'] == 'M'])
    female = len(oneOutcome[oneOutcome['Sex'] == 'F'])
    total = len(oneOutcome)
    print(outcomes[i], '- ',str(round(male/total*100,1)), '%', str(round(female/total*100, 1)), '%')

PERCENTAGES IN SEX CATEGORIES 
 

BroadResp -  45.8 % 54.1 %
Asthma -  41.8 % 58.2 %
AQResp -  45.4 % 54.6 %
Cardio -  50.4 % 49.6 %


In [26]:
print(str(round(len(essence[essence['Sex'] == 'U'])/len(essence), 4)))

0.0001
