In [1]:
from io import StringIO
import pandas
import glob

Download NCHS linked data (and discard non-NHIS linked data)

In [None]:
!wget --recursive ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/datalinkage/linked_mortality
!mkdir NCHS_linked_data
!mv ./ftp.cdc.gov/pub/Health_Statistics/NCHS/datalinkage/linked_mortality/NHIS_*.dat ./NCHS_linked_data/
!rm -r ./ftp.cdc.gov

Read in all the NCHS linked mortality data files

In [2]:
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the order of the loaded dataframes (and of the concatenated rows) is the
# same on every run.
nchs_file_paths = sorted(glob.glob("./NCHS_linked_data/NHIS_*.dat"))

In [3]:
# Sanity check: the glob must have matched at least one data file.
assert nchs_file_paths, "No NCHS files available!"

In [4]:
# Fixed-width layout used to parse the NCHS files: nchs_column_widths[i] is the
# number of characters occupied by nchs_column_names[i].
nchs_column_widths = [14, 1, 1, 3, 1, 1, 1, 4, 8, 8]
nchs_column_names = ["PUBLICID", "ELIGSTAT", "MORTSTAT", "UCOD_LEADING", "DIABETES", "HYPERTEN", "DODQTR", "DODYEAR", "WGT_NEW", "SA_WGT_NEW"]

# Parse every file into its own dataframe; a lone "." is treated as missing.
# `dtype` is set to its documented default (None, i.e. infer column types);
# the previous `dtype=False` is not a documented value for this parameter.
# NOTE(review): if PUBLICID has to keep leading zeros to work as a join key,
# pass dtype={"PUBLICID": "object"} here AND read NHISPID the same way — confirm
# against the data before changing only one side.
nchs_dataframes = [
    pandas.read_fwf(
        file_path,
        widths=nchs_column_widths,
        names=nchs_column_names,
        dtype=None,
        na_values=['.'],
    )
    for file_path in nchs_file_paths
]

In [5]:
# Sanity check: the number of parsed rows must equal the number of lines on disk.
total_row_count = sum(len(dataframe) for dataframe in nchs_dataframes)

# Count the lines in Python instead of parsing `wc -l | grep total | cut`:
# wc pads its columns differently across platforms (so `cut -f2 -d' '` can pick
# an empty field), and it prints no "total" line when only one file matches.
total_line_count = 0
for file_path in nchs_file_paths:
    with open(file_path, "rb") as nchs_file:
        total_line_count += sum(1 for _ in nchs_file)

assert total_row_count == total_line_count, "Expected {} to equal {}".format(total_row_count, total_line_count)

In [6]:
# Debug info: one right-aligned row count per file, a rule, then the total.
row_count_format = "{:8d}"

for nchs_frame in nchs_dataframes:
    print(row_count_format.format(len(nchs_frame)))

print("=" * 8)
print(row_count_format.format(total_row_count))

  120032
   89976
  116179
   62052
   75764
   93386
  116929
   74236
  101875
  119631
  102467
  112053
   88446
  122310
   75716
  122859
   92148
  100618
   98649
   97059
   63402
  108131
  109671
  100760
   94460
  103477
  128412
  104520
   98785
 2894003


Combine NCHS dataframes into one big dataframe

In [7]:
# Stack the per-file dataframes. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it every file's own 0-based index is kept and
# the result has duplicate index labels.
nchs_data = pandas.concat(nchs_dataframes, ignore_index=True)

In [8]:
# Sanity check: concatenation must neither drop nor duplicate rows.
separate_row_count = sum(len(dataframe) for dataframe in nchs_dataframes)
combined_row_count = len(nchs_data)
assert combined_row_count == separate_row_count, "Expected {} to equal {}".format(separate_row_count, combined_row_count)

Read in the NHIS data extract

In [9]:
# Path to your NHIS data extract. The file is expected to be a gzip-compressed
# CSV (it is read with compression="gzip"); change this to point at your own
# extract.
nhis_file_path = "../NHIS/nhis_test.csv.gz"

In [14]:
# Stream the extract instead of loading it whole: with `chunksize`, read_csv
# returns an iterator that yields dataframes of up to 250000 rows each.
# The iterator can be consumed only once, so re-run this cell before re-running
# any cell that loops over nhis_chunks.
nhis_reader_options = dict(compression="gzip", chunksize=250000)
nhis_chunks = pandas.read_csv(nhis_file_path, **nhis_reader_options)

Join the NCHS linked data with the raw NHIS data one chunk at a time, discarding unlinked data. Write merged dataframes to files.

In [None]:
# Inner-join each NHIS chunk with the NCHS linked data (NHISPID == PUBLICID),
# which drops NHIS rows that have no matching NCHS record, and write every
# merged chunk to its own CSV file.
total_rows_processed = 0
total_rows_merged = 0

for chunk_index, chunk in enumerate(nhis_chunks):
    # Running totals reflect the chunks finished *before* this one.
    print("Merged {} of {} rows. Processing CHUNK {}...".format(total_rows_merged, total_rows_processed, chunk_index))

    merged_dataframe = pandas.merge(
        chunk,
        nchs_data,
        left_on='NHISPID',
        right_on='PUBLICID',
        how='inner',
    )

    merged_dataframe.to_csv(
        '/tmp/NCHS_NHIS_linked_{}.csv'.format(chunk_index),
        index=False,  # do not write the row index (documented bool form of index=None)
        header=True,
    )

    total_rows_merged += len(merged_dataframe)
    total_rows_processed += len(chunk)

# The progress line is printed before each chunk is processed, so without this
# final report the totals that include the last chunk would never be shown.
print("Done. Merged {} of {} rows.".format(total_rows_merged, total_rows_processed))

# An inner join that matches nothing at all most likely means the two key
# columns do not line up (different dtype or formatting); surface it instead of
# silently writing empty files.
if total_rows_processed > 0 and total_rows_merged == 0:
    print("WARNING: no NHIS rows matched any NCHS record; check that NHISPID and PUBLICID share the same dtype and format.")

Merged 0 of 0 rows. Processing CHUNK 0...
Merged 0 of 250000 rows. Processing CHUNK 1...
Merged 0 of 500000 rows. Processing CHUNK 2...
Merged 0 of 750000 rows. Processing CHUNK 3...
Merged 0 of 1000000 rows. Processing CHUNK 4...
Merged 0 of 1250000 rows. Processing CHUNK 5...
Merged 0 of 1500000 rows. Processing CHUNK 6...
Merged 0 of 1750000 rows. Processing CHUNK 7...
Merged 0 of 2000000 rows. Processing CHUNK 8...
Merged 0 of 2250000 rows. Processing CHUNK 9...
Merged 0 of 2500000 rows. Processing CHUNK 10...
Merged 0 of 2750000 rows. Processing CHUNK 11...
Merged 0 of 3000000 rows. Processing CHUNK 12...
Merged 0 of 3250000 rows. Processing CHUNK 13...
Merged 0 of 3500000 rows. Processing CHUNK 14...
Merged 0 of 3750000 rows. Processing CHUNK 15...


```python
pandas.merge(
    restaurant_ids_dataframe,
    restaurant_review_frame,
    on='business_id',
    how='outer',
    suffixes=('_restaurant_id', '_restaurant_review')
)
```

Read merged files and concat them