In [1]:
import pandas as pd

In [2]:
# load in (cleaned) table containing race info
df = pd.read_csv("/Users/patrickburke/Library/CloudStorage/OneDrive-EmoryUniversity/ECON496RW/cleaned_small/pcrpatientracegroup.csv")

Wow only took ~15s to load all that

In [3]:
df.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14
0,2088217,1472192,7701003
1,16835623,13273412,2514011
2,18418102,14548133,7701003
3,21377926,17184298,2514011
4,38817731,21699657,2514011


In [4]:
unique_values = list(df.ePatient_14.unique())
print(unique_values)

[7701003, 2514011, 2514001, 2514005, 2514007, 7701001, 2514009, 2514003]


- 2514001 - American Indian or Alaska Native 
- 2514003 - Asian
- 2514005 - Black or African American 
- 2514007 - Hispanic or Latino
- 2514009 - Native Hawaiian or Other Pacific Islander
- 2514011 - White
- 7701001 - Not Applicable
- 7701003 - Not Recorded

In [5]:
len(df.index)

49555288

- total observations = 49,555,288
- According to https://nemsis.org/using-ems-data/request-research-data/ the dataset includes 48,982,990 EMS activations
    - Thus we have 572,298 extra rows
    - This makes sense as there must be patients in the dataset that are listed as multiple races (this is perfectly OK)
- I'm gonna pull the observations with multiple races and check those out

In [7]:
dup = df[df.duplicated('PcrKey',False)]

In [9]:
dup.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14
68,158136243,118170967,2514005
69,158136244,118170967,2514007
243,158136383,118171544,2514009
244,158136384,118171544,2514011
570,186751735,118172608,2514011


- For example, this first person, PcrKey = 118170967, is listed as both 2514005 (black) AND 2514007 (hispanic/latino)
- This makes perfect sense

In [15]:
print(len(dup))
print(dup.PcrKey.nunique())
print(len(dup)/dup.PcrKey.nunique())

1064494
492196
2.1627441100699722


- We have a total of 1,064,494 PCRs that contain more than one value for race
- We have a total of 492,196 patients that identify as 2 or more races
- Each multi-racial patient has an average of 2.16 races

In [26]:
# check race proportions
prop = df['ePatient_14'].value_counts(normalize=True)
prop

2514011    0.432621
7701003    0.301635
2514005    0.156769
2514007    0.065644
7701001    0.023590
2514003    0.010136
2514001    0.006942
2514009    0.002662
Name: ePatient_14, dtype: float64

- 43.3% White
- 30.2% Not Recorded
- 15.7% Black
- 6.6% Hispanic
- 2.4% Not Applicable
- 1.0% Asian
- 0.7% Native American
- 0.2% American Indian or Alaska Native

- I'm gonna drop all of the entries with missing race data
    - Can't do any stats with missing data
- 30% not recorded, 2.3% not applicable
    - This is sorta interesting, not recorded is obviously just missing data but not applicable could be people that don't fit into any of these categories?
    - For now I'll just throw them all out, perhaps will come back if I see fit


In [17]:
# drop not recorded and not applicable
# pandas uses | as the symbol for "or"
nan_obs = df[ (df['ePatient_14'] == 7701003) | (df['ePatient_14'] == 7701001)].index
df.drop(nan_obs, inplace = True)
df.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14
1,16835623,13273412,2514011
3,21377926,17184298,2514011
4,38817731,21699657,2514011
6,97354357,72057010,2514011
11,111683931,83733589,2514001


In [20]:
print(len(df.index))
print(49555288 - len(df.index))

33438658
16116630


- We just threw out 16,116,630 rows
- This should be fine
- Now our total sample size is 33,438,658

In [21]:
# check new proportions
prop = df['ePatient_14'].value_counts(normalize=True)
prop

2514011    0.641134
2514005    0.232328
2514007    0.097283
2514003    0.015021
2514001    0.010288
2514009    0.003946
Name: ePatient_14, dtype: float64

- 64.1% white
- 23.2% black
- 9.7% hispanic / latino
- 1.5% Asian
- 1.0% American Indian or Alaska Native
- 0.4% Native Hawaiian or Other Pacific Islander

In [22]:
dup = df[df.duplicated('PcrKey',False)]
print(len(dup))
print(dup.PcrKey.nunique())

1064494
492196


- Perfect, still 492,196 that are two or more races

In [16]:
df.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14
0,2088217,1472192,7701003
1,16835623,13273412,2514011
2,18418102,14548133,7701003
3,21377926,17184298,2514011
4,38817731,21699657,2514011


- Now, I'll create additional columns for the races

In [25]:
def isblack(row):
    if row['ePatient_14'] == 2514005:
        return 1
    else:
        return 0

df['black'] = df.apply(lambda row: isblack(row), axis=1)

- 6min 5sec for that to run

In [26]:
df.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14,black
1,16835623,13273412,2514011,0
3,21377926,17184298,2514011,0
4,38817731,21699657,2514011,0
6,97354357,72057010,2514011,0
11,111683931,83733589,2514001,0


In [27]:
len(df.index)

33438658

- The index looks weird idk why it's skipping but length remains same so should be fine

In [28]:
def iswhite(row):
    if row['ePatient_14'] == 2514011:
        return 1
    else:
        return 0

df['white'] = df.apply(lambda row: iswhite(row), axis=1)

- 7min 4sec
- I'm gonna export and work on a fresh ipynb as I think some of the old dfs are still loaded

In [29]:
df.head()

Unnamed: 0,PcrPatientRaceGroupKey,PcrKey,ePatient_14,black,white
1,16835623,13273412,2514011,0,1
3,21377926,17184298,2514011,0,1
4,38817731,21699657,2514011,0,1
6,97354357,72057010,2514011,0,1
11,111683931,83733589,2514001,0,0


In [30]:
df.to_csv('/Users/patrickburke/Library/CloudStorage/OneDrive-EmoryUniversity/ECON496RW/raceInProgress/raceBW.csv')

- 1min 28sec