In [1]:
import polars as pl
import pandas as pd
import os

In [14]:
root = "../CAMS/"
units = "../assets/CAMS_units.csv"
zip_2010 = "../assets/tx_texas_zip_codes_geo.min.json"
hospital_data = "/media/teamlary/ssd/Discharge Data/Inpatient/Data/"
census_dir = "../Census/"
icd_data = "../icd10/"

fullZipPop = "../assets/census/interpolatedPopulations.csv"
maleZipPop = "../assets/census/interpolatedMalePopulations.csv"
femaleZipPop = "../assets/census/interpolatedFemalePopulations.csv"

full_zip_population = pl.read_csv(fullZipPop)
male_zip_population = pl.read_csv(maleZipPop)
female_zip_population = pl.read_csv(femaleZipPop)

In [15]:
icd = pd.read_pickle('../assets/ICDPickle.pkl')
icd = pl.DataFrame._from_pandas(icd)
icd

PRINC_DIAG_CODE,Count,ICD-9,icd9cm_mult
str,f64,str,list[str]
"""Z3800""",3.940227e6,"""V3900""","[""V3000"", ""V3900""]"
"""Z3801""",2.099687e6,"""V3001""","[""V3001""]"
"""A419""",1.211138e6,"""99591""","[""0389"", ""99591""]"
"""J189""",820711.0,"""486""","[""486""]"
"""Z5189""",676973.0,"""V589""","[""V5889"", ""V589""]"
"""O3421""",627506.0,"""65423""","[""65421"", ""65423""]"
"""J441""",500761.0,"""49322""","[""49121"", ""49322""]"
"""I2510""",488267.0,"""4292""","[""41401"", ""4292""]"
"""N179""",470121.0,"""5849""","[""5849""]"
"""I214""",464760.0,"""41071""","[""41071""]"


In [24]:
def loopICDPerQuarter(time_period, zipPopulation, data_aggregation, nthresh=1,  icd10=True):
    data = pl.DataFrame()
    for ind, quarter in enumerate(time_period):
        print(quarter)
        hospital_df = pl.read_csv(f"{hospital_data}PUDF_base1_{quarter}_tab.csv", infer_schema_length=0)
        # print(len(hospital_df))
        hospital_df = (hospital_df
                        # .filter(~(pl.col('PAT_ZIP').str.ends_with('.0')))
                        .with_columns(pl.col('PAT_ZIP').str.replace(r'\.0$',''))
                        .filter(pl.col('PAT_ZIP') != '`')
                        .filter(pl.col('PAT_ZIP').str.lengths() == 5)
                        .with_columns(pl.col('PAT_ZIP').cast(pl.Int64))
                        .filter(pl.col('SEX_CODE') == 'M')
        )
        
        print(len(hospital_df))
        # print(hospital_df.glimpse())
        # zip_population = interpCensus(quarter)
        year = quarter[2:]
        zip_population = (zipPopulation
                          .select(['PAT_ZIP',year])
                          .rename({year: 'population'})
        )
        # print(zip_population.glimpse())

        if icd10 == True:
            codes = icd['PRINC_DIAG_CODE']
        else:
            codes = icd['ICD-9']

        for sind, icd_code in enumerate(codes[:500]):
            os.makedirs(f'../icd10_{data_aggregation}/{icd["PRINC_DIAG_CODE"][sind]}', exist_ok=True)
            # print(icd_code)
            # if icd_code != 'N210':
            #     continue

            base_df = pl.DataFrame()
            if icd10 == False:
                for item in icd['icd9cm_mult'][sind]: 
                    # print(icd_code)
                    icd_df = (hospital_df
                              .filter(hospital_df['PRINC_DIAG_CODE'] == item)
                              .group_by(['PAT_ZIP']).count()                              
                              .rename({'count': 'ICD'})
                              .join(zip_population, on='PAT_ZIP')
                              .with_columns((pl.col('ICD')/pl.col('population'))
                                            .alias('normalized'))                        
                              .filter(pl.col('ICD') >= nthresh)
                              .sort('ICD', descending=True)
                    )
                    print(len(icd_df))
                    # print(icd_df.glimpse())

                    base_df = pl.concat([base_df, icd_df])
            else:
                icd_df = (hospital_df
                              .filter(hospital_df['PRINC_DIAG_CODE'] == icd_code)
                              .group_by(['PAT_ZIP']).count()
                              .sort('count', descending=True)
                              .rename({'count': 'ICD'})
                              .join(zip_population, on='PAT_ZIP')
                              .with_columns((pl.col('ICD')/pl.col('population'))
                                            .alias('normalized'))                              
                              .filter(pl.col('ICD') >= nthresh) 
                    )
                #print(icd_df.glimpse())
                base_df = pl.concat([base_df, icd_df])
            save_quarter = quarter[2:] + 'q' + quarter[:1]
            #print(save_quarter)
            base_df.write_csv(f'../icd10_{data_aggregation}/{icd["PRINC_DIAG_CODE"][sind]}/{save_quarter}.csv')
        
            # del hospital_df
    del hospital_df

    return data



In [17]:
start_year = 2005
end_year = 2022

hospital_quarters = [f"{quarter}q{year}" for year in range(start_year, end_year + 1) for quarter in range(1, 5) if not (year == end_year and quarter > 2)]
hospital_quarters = hospital_quarters[:-1]
icd9_subset = hospital_quarters[hospital_quarters.index(f'1q{start_year}'):hospital_quarters.index('4q2015')]
icd10_subset = hospital_quarters[hospital_quarters.index('4q2015'):hospital_quarters.index(f'1q{end_year}')]

loopICDPerQuarter function gets:
- time_period: specifies the range of quarters to grab hospital data from
- zipPopulation: the dataframe which contains populations per zip code for a desired demographic (male, female, total - for now). This information is in full/male/female_zip_population dataframes
- data_aggregration: label name to call aggregated hospital data folder
- nthresh: minimum number of cases required for zip code to be included in dataset
- icd10: a boolean flag to process either icd10 or icd9 hospitalization codes



In [25]:
loopICDPerQuarter(time_period = icd9_subset, 
        zipPopulation = male_zip_population, 
        data_aggregation='male_all_ages', nthresh=1, icd10=False)
loopICDPerQuarter(time_period = icd10_subset, 
        zipPopulation = male_zip_population, 
        data_aggregation='male_all_ages', nthresh=1, icd10=True)

1q2005


  .filter(pl.col('PAT_ZIP').str.lengths() == 5)


258479
1341
0
1213
751
15
1271
43
1
6
0
1019
400
1306
4
690
984
727
6
1
0
4
0
753
495
1198
7
3
0
836
0
865
847
4
370
7
0
29
0
229
453
0
804
30
3
262
657
555
654
509
3
514
552
462
418
9
6
1198
54
732
519
590
590
179
15
0
362
378
386
592
0
385
0
0
76
0
498
411
313
182
518
37
1198
47
313
74
15
258
453
511
0
0
0
0
0
0
757
329
209
718
137
1
334
0
248
0
0
0
382
71
396
605
1
0
0
1
138
1
481
209
142
521
77
415
203
602
243
3
0
58
0
621
488
209
396
198
1
0
1
0
24
2
149
539
0
0
451
314
179
435
213
229
0
273
287
0
0
0
0
343
1198
13
341
217
0
331
162
2
137
228
237
241
373
422
363
0
293
126
15
381
104
316
1
234
312
4
0
163
0
0
0
1
196
139
0
0
0
0
0
267
0
97
210
33
71
262
209
267
0
0
283
15
86
83
0
259
360
141
173
252
174
279
70
178
244
114
514
0
260
105
208
214
221
106
0
0
275
229
187
288
7
0
25
260
0
0
147
2
1
108
123
0
157
106
134
0
15
162
109
0
0
133
15
273
116
195
223
40
10
132
174
1198
40
0
0
0
0
194
24
0
0
226
2
0
0
0
40
136
321
176
333
157
0
272
129
70
2
0
1198
52
190
0
76
297
166
16
182
3
17

In [8]:
q4_2015 = pl.read_csv([hospital_data+i for i in os.listdir(hospital_data) if 'base1_4q2015_tab.csv' in i][0], infer_schema_length=0)
temp_df = (q4_2015
                        # .filter(q4_2015['PRINC_DIAG_CODE'] == 'N210')
                        .with_columns(pl.col('PAT_ZIP').str.replace(r'\.0$',''))
                        .filter(pl.col('PAT_ZIP') != '`')
                        .filter(pl.col('PAT_ZIP').str.lengths() == 5)
                        .with_columns(pl.col('PAT_ZIP').cast(pl.Int64))
                        .filter(pl.col('SEX_CODE') == 'F')
                                                      
                              .group_by(['PAT_ZIP']).count()
                              .sort('count', descending=True)
                              .rename({'count': 'ICD'})
                              #.join(zip_population, on='PAT_ZIP')
                              #.with_columns((pl.col('ICD')/pl.col('population'))
                              #              .alias('normalized'))                              
                              #.filter(pl.col('ICD') >= nthresh)
 )
temp_df

  .filter(pl.col('PAT_ZIP').str.lengths() == 5)


PAT_ZIP,ICD
i64,u32
88888,2749
78521,1930
79936,1559
77449,1501
78572,1485
75217,1424
77084,1379
77036,1321
78577,1321
75228,1318
