In [2]:
from io import StringIO
import pandas
import glob

Download NCHS linked data (and discard non-NHIS linked data)

In [None]:
!wget --recursive ftp://ftp.cdc.gov/pub/Health_Statistics/NCHS/datalinkage/linked_mortality
!mkdir NCHS_linked_data
!mv ./ftp.cdc.gov/pub/Health_Statistics/NCHS/datalinkage/linked_mortality/NHIS_*.dat ./NCHS_linked_data/
!rm -r ./ftp.cdc.gov

Read in all the NCHS linked mortality data files

In [3]:
# Collect every NHIS linked-mortality file that was downloaded.
# glob.glob returns matches in arbitrary (filesystem-dependent) order; sorting
# makes the order of the dataframes built from this list reproducible.
nchs_file_paths = sorted(glob.glob("./NCHS_linked_data/NHIS_*.dat"))

In [5]:
# Sanity check: stop here if the glob above matched nothing, because every
# later cell would otherwise operate on an empty list.
assert nchs_file_paths, "No NCHS files available!"

In [6]:
# Fixed-width layout of the NCHS linked-mortality files: the width (in
# characters) of each field, and the column name assigned to it, in order.
nchs_column_widths = [14, 1, 1, 3, 1, 1, 1, 4, 8, 8]
nchs_column_names = ["PUBLICID", "ELIGSTAT", "MORTSTAT", "UCOD_LEADING", "DIABETES", "HYPERTEN", "DODQTR", "DODYEAR", "WGT_NEW", "SA_WGT_NEW"]

# One dataframe per input file, in the same order as nchs_file_paths.
nchs_dataframes = [
    pandas.read_fwf(
        file_path,
        widths=nchs_column_widths,
        names=nchs_column_names,
        # Read the identifier as text instead of letting pandas infer a
        # numeric type. `dtype=False` is not a documented value for `dtype`.
        # NOTE(review): presumably PUBLICID can carry leading zeros and is
        # the join key for the NHIS data -- confirm.
        dtype={"PUBLICID": "object"},
        # A lone "." in a field marks a missing value.
        na_values=['.']
    )
    for file_path
    in nchs_file_paths
]

In [10]:
# Sanity check: the number of parsed rows must equal the number of lines in
# the input files that were read.
def _count_lines(file_path):
    """Return the number of lines in the file at `file_path`."""
    with open(file_path, "rb") as nchs_file:
        return sum(1 for _ in nchs_file)


total_row_count = sum(len(dataframe) for dataframe in nchs_dataframes)
# Count lines in Python instead of parsing `wc -l | grep total | cut` output:
# that pipeline depends on how wc pads its numbers and produces no "total"
# line at all when only a single file matches the pattern.
total_line_count = sum(_count_lines(file_path) for file_path in nchs_file_paths)
assert total_row_count == total_line_count, "Expected {} to equal {}".format(total_row_count, total_line_count)

In [11]:
# Debug info: one right-aligned row count per input file, a separator rule,
# then the overall total computed by the sanity check above.
format_count = "{:8d}".format

for frame in nchs_dataframes:
    row_count = len(frame)
    print(format_count(row_count))

print("=" * 8)
print(format_count(total_row_count))

  120032
   89976
  116179
   62052
   75764
   93386
  116929
   74236
  101875
  119631
  102467
  112053
   88446
  122310
   75716
  122859
   92148
  100618
   98649
   97059
   63402
  108131
  109671
  100760
   94460
  103477
  128412
  104520
   98785
 2894003


Combine NCHS dataframes into one big dataframe

In [13]:
# Stack the per-file dataframes into one frame. ignore_index=True gives the
# result a fresh 0..n-1 index; without it each file's own 0-based row labels
# are kept, so the combined frame would contain repeated index labels.
nchs_data = pandas.concat(nchs_dataframes, ignore_index=True)

In [14]:
# Sanity check: concatenation must neither drop nor duplicate rows, so the
# combined frame has to be exactly as long as all the pieces together.
separate_row_count = sum(len(dataframe) for dataframe in nchs_dataframes)
combined_row_count = len(nchs_data)
assert separate_row_count == combined_row_count, "Expected {} to equal {}".format(separate_row_count, combined_row_count)

Read in the NHIS data extract

In [15]:
# Set the path of your NHIS data extract (a gzip-compressed CSV file).
# The path is relative to the directory the notebook is run from.
nhis_file_path = "../NHIS/nhis_test.csv.gz"

In [16]:
# Stream the gzip-compressed NHIS extract in pieces of 500,000 rows instead of
# loading the whole file into memory at once. The returned reader is consumed
# as it is iterated, so re-run this cell before looping over it a second time.
nhis_chunks = pandas.read_csv(nhis_file_path, chunksize=500000, compression="gzip")

Join the NCHS linked data with the raw NHIS data

In [17]:
# Join each NHIS chunk with the NCHS linked-mortality rows and append the
# result to one CSV file, so the full merged table never has to fit in memory.
#
# NOTE(review): the identifier column of the NHIS extract is not shown in this
# notebook; "PUBLICID" is assumed to match the NCHS column -- confirm, and
# make sure both sides are read with the same dtype or the merge will fail.
nhis_join_column = "PUBLICID"
# Relative output path instead of a user-specific absolute path.
merged_output_path = "./nhis_nchs_merged.csv"

for chunk_index, chunk in enumerate(nhis_chunks):
    print("CHUNK {}".format(len(chunk)))
    # MERGE
    # how="right" keeps every NHIS row of the chunk, whether or not a
    # matching NCHS record exists.
    merged_chunk = pandas.merge(
        nchs_data,
        chunk,
        left_on="PUBLICID",
        right_on=nhis_join_column,
        how="right",
        suffixes=("_nchs", "_nhis"),
    )
    # WRITE MERGED TO DISK
    # The first chunk creates/overwrites the file and writes the header row;
    # later chunks append data rows only.
    is_first_chunk = chunk_index == 0
    merged_chunk.to_csv(
        merged_output_path,
        mode="w" if is_first_chunk else "a",
        index=False,
        header=is_first_chunk,
    )

  interactivity=interactivity, compiler=compiler, result=result)


CHUNK 500000


  interactivity=interactivity, compiler=compiler, result=result)


CHUNK 500000
CHUNK 500000


  interactivity=interactivity, compiler=compiler, result=result)


CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 500000
CHUNK 490154


```python
pandas.merge(
    restaurant_ids_dataframe,
    restaurant_review_frame,
    on='business_id',
    how='outer',
    suffixes=('_restaurant_id', '_restaurant_review')
)
```