In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Convert the raw text file into a CSV with one character per field.
# csv.writer treats each row as an iterable of fields, and every row here is a
# plain string, so each character of the stripped line becomes its own column.
# NOTE(review): strip() also removes any leading blanks of a record, which is
# presumably why recode_pos() shifts documented positions by 9 - confirm against
# the file layout.
# newline='' is required for files handed to csv.writer; without it, platforms
# that translate line endings emit an extra blank row after every record.
with open('./Nat2021US.txt', 'r') as f, open('national_data.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerows(line.strip() for line in f)

In [None]:
# Read the character-per-field CSV back into memory: `data` is a list with one
# entry per record, each entry being the list of that record's fields.
with open('national_data.csv', newline='') as f:
    data = [row for row in csv.reader(f)]

In [None]:
def recode_pos(num):
    """Translate a field position into an index into a parsed record.

    The result is simply ``num - 9``.
    NOTE(review): presumably ``num`` is the 1-based column position from the
    file layout and the first 8 columns are dropped when records are stripped -
    confirm against the layout documentation.
    """
    offset = 9
    return num - offset

In [None]:
# Layout of the extracted fields, in output-column order:
# (column name, first position, last position, convert to int?).
# Positions are passed through recode_pos() before indexing a record.
# Single-position fields are read by index; multi-position fields are read as
# a slice and joined into one string.
field_specs = [
    ('birth_place', 32, 32, False),
    ('mother_age', 75, 76, True),
    ('mother_race', 107, 107, False),
    ('marital_status', 120, 120, False),
    ('mother_edu', 124, 124, True),
    ('father_edu', 163, 163, True),
    ('prev_kid', 171, 172, True),
    ('prev_terminate', 175, 176, True),
    ('spacing_live_birth', 198, 200, True),
    ('spacing_preg', 206, 208, True),
    ('prenatal_care_start', 224, 225, True),
    ('num_visit', 238, 239, True),
    ('WIC', 251, 251, False),
    ('cig_before_preg', 261, 261, True),
    ('cig_t1', 262, 262, True),
    ('cig_t2', 263, 263, True),
    ('cig_t3', 264, 264, True),
    # risk factors
    ('pre_diabetes', 313, 313, False),
    ('gas_diabetes', 314, 314, False),
    ('pre_hypertension', 315, 315, False),
    ('gas_hypertension', 316, 316, False),
    ('hyper_eclampsia', 317, 317, False),
    ('prev_preterm', 318, 318, False),
    ('prev_C', 331, 331, False),
    ('no_infection', 353, 353, False),
    # infections
    ('Gonorrhea', 343, 343, False),
    ('Syphilis', 344, 344, False),
    ('Chlamydia', 345, 345, False),
    ('Hepatitis_B', 346, 346, False),
    ('Hepatitis_C', 347, 347, False),
    # outcomes
    ('weight_class', 511, 511, True),
    ('no_abnormal', 531, 531, True),
]

# Build a column-oriented dict from a 1-in-100 sample of the records:
# only rows whose index is 5 modulo 100 are kept.
df_dict = {}
for row_number, record in enumerate(data):
    if row_number % 100 != 5:
        continue
    for column, first_pos, last_pos, to_int in field_specs:
        values = df_dict.setdefault(column, [])
        if first_pos == last_pos:
            raw = record[recode_pos(first_pos)]
        else:
            raw = ''.join(record[recode_pos(first_pos):recode_pos(last_pos) + 1])
        values.append(int(raw) if to_int else raw)

In [None]:
# Persist the sampled columns as a CSV: one header row, then one row per
# sampled record.
column_names = list(df_dict)

# zip() below stops at the shortest column, so fail loudly instead of silently
# dropping rows if the extraction left the columns with different lengths.
column_lengths = {len(df_dict[name]) for name in column_names}
if len(column_lengths) > 1:
    raise ValueError("df_dict columns have unequal lengths; re-run the extraction cell")

# newline='' is required for files handed to csv.writer; without it, platforms
# that translate line endings emit an extra blank row after every record.
with open("nat_clean.csv", "w", newline="") as outfile:
    writer = csv.writer(outfile)
    writer.writerow(column_names)
    # Transpose the column lists into rows.
    writer.writerows(zip(*(df_dict[name] for name in column_names)))

In [None]:
# Load the sampled extract from nat_clean.csv; pandas infers each column's
# dtype from the text, so all-digit columns come back numeric.
df = pd.read_csv("nat_clean.csv")

In [None]:
# Display the column labels of the loaded frame.
df.columns

In [None]:
def corr_heat_map(dataset, target_var, num_shown):
    """Plot two heat maps of the variables most correlated with a target.

    The left panel shows the ``num_shown`` columns with the highest correlation
    to ``target_var`` (this includes ``target_var`` itself); the right panel
    shows the ``num_shown`` columns with the lowest correlation. Each panel is
    the pairwise correlation matrix of the selected columns.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Data to analyse. Only numeric and boolean columns are correlated;
        other columns (e.g. strings) are ignored.
    target_var : str
        Name of a numeric column of ``dataset`` to rank the other columns by.
    num_shown : int
        Number of columns to show in each panel.

    Raises
    ------
    KeyError
        If ``target_var`` is not a numeric column of ``dataset``.
    """
    # Restrict to numeric/boolean columns explicitly: DataFrame.corr() raises
    # on string columns in pandas >= 2.0 instead of silently dropping them.
    numeric = dataset.select_dtypes(include=["number", "bool"])
    # corr() uses pairwise-complete observations, so a sub-matrix of the full
    # result equals the correlation of the selected columns; compute it once.
    corr_matrix = numeric.corr()
    if target_var not in corr_matrix.columns:
        raise KeyError(f"{target_var!r} is not a numeric column of the dataset")

    fig, axes = plt.subplots(1, 2, figsize=(12, 6))
    panels = [
        (False, "crest", 'Most Positive Correlated Variables'),
        (True, "Reds", 'Most Negative Correlated Variables'),
    ]
    for ax, (ascending, cmap, title) in zip(axes, panels):
        ranked = corr_matrix.sort_values(target_var, ascending=ascending).index[:num_shown]
        sns.heatmap(corr_matrix.loc[ranked, ranked], annot=True, cmap=cmap, ax=ax)
        ax.set_title(title, fontsize=11)

In [None]:
# NOTE(review): 'LBW' is not one of the columns extracted into nat_clean.csv;
# it is only created by the Data Processing cell further down. On a fresh
# kernel (Restart & Run All) this call therefore fails with a KeyError - it
# needs to run after the processing cell.
corr_heat_map(df, 'LBW', 10)

# Data Processing #

In [None]:
# Recode sentinel codes to NaN and derive indicator columns.
# Whole-column assignment with Series.mask/where is used instead of
# `df.loc[rows, col] = np.nan`, so integer columns are upcast to float
# explicitly rather than through an implicit in-place dtype change.
# NOTE: run this cell once on the freshly loaded frame - the derived columns
# are computed from raw codes that this cell overwrites.

df['birth_place'] = df['birth_place'].mask(df['birth_place'] == 9)

# 1.0 when mother_race is code 2, 0.0 for every other value.
df['black'] = (df['mother_race'] == 2).astype(float)

# Blank marital status is missing. Known codes are kept as stripped strings;
# missing entries stay NaN (not the string 'nan').
marital = df['marital_status'].astype(str).str.strip()
df['marital_status'] = marital.where(df['marital_status'].notna() & (marital != ''))

# Code 9 is missing for both education columns. This must be an assignment so
# that parent_edu is NaN when either parent is unknown, not inflated by 9s.
for edu_col in ('mother_edu', 'father_edu'):
    df[edu_col] = df[edu_col].mask(df[edu_col] == 9)
df['parent_edu'] = df['mother_edu'] + df['father_edu']

for count_col in ('prev_kid', 'prev_terminate'):
    df[count_col] = df[count_col].mask(df[count_col] == 99)

# 888 / 999 and values below 4 are not usable intervals.
for spacing_col in ('spacing_live_birth', 'spacing_preg'):
    spacing = df[spacing_col]
    df[spacing_col] = spacing.mask(spacing.isin([888, 999]) | (spacing < 4))

# Derive had_prenatal_care from the raw codes BEFORE nulling them:
# 0 -> 0.0 (no care), 99 or missing -> NaN, anything else -> 1.0.
care_start = df['prenatal_care_start']
no_care = care_start == 0
care_unknown = care_start.isna() | (care_start == 99)
df['had_prenatal_care'] = (~no_care).astype(float).mask(care_unknown)
df['prenatal_care_start'] = care_start.mask(no_care | (care_start == 99))

df['num_visit'] = df['num_visit'].mask(df['num_visit'] == 99)

# Collapse the cigarette codes to 0 / 1 (any non-zero code becomes 1).
# NOTE(review): if the layout has a non-zero "unknown" code it is counted as
# smoking here - confirm against the file's user guide.
for cig_col in ('cig_before_preg', 'cig_t1', 'cig_t2', 'cig_t3'):
    df[cig_col] = (df[cig_col] != 0).astype(int)

# Compare numerically: read_csv parses an all-digit column as integers, so a
# comparison against the string '9' would never match.
no_infection = pd.to_numeric(df['no_infection'], errors='coerce')
df['no_infection'] = no_infection.mask(no_infection == 9)

df['no_abnormal'] = df['no_abnormal'].mask(df['no_abnormal'] == 9)

# weight_class 4 is treated as missing; LBW is 1.0 for classes below 3,
# 0.0 for class 3 and NaN otherwise.
weight_class = df['weight_class'].mask(df['weight_class'] == 4)
df['weight_class'] = weight_class
df['LBW'] = (weight_class < 3).astype(float).where(weight_class <= 3)

# single_mom: marital_status '2' -> 1.0, '1' -> 0.0, anything else -> NaN.
df['single_mom'] = df['marital_status'].map({'2': 1.0, '1': 0.0})

In [None]:
def letter_trans(ans):
    """Map a yes/no letter to a number.

    Returns 1 for 'Y', 0 for 'N' and NaN for any other value.
    """
    for letter, code in (('Y', 1), ('N', 0)):
        if ans == letter:
            return code
    return np.nan

In [None]:
# Columns holding 'Y' / 'N' letters that are recoded via letter_trans
# (1 / 0, NaN for anything else).
y_n_lst = ['WIC','pre_diabetes', 'gas_diabetes', 'pre_hypertension', 'gas_hypertension',
       'hyper_eclampsia','prev_preterm','prev_C','Gonorrhea', 'Syphilis', 
           'Chlamydia', 'Hepatitis_B', 'Hepatitis_C']

for var in y_n_lst:
    # Replace the column rather than writing through `.loc[:, var]`: an
    # in-place write keeps the existing object dtype, so the recoded values
    # would not be treated as numeric by later numeric-only operations.
    df[var] = df[var].apply(letter_trans)

In [None]:
# Save the processed frame. With to_csv's default index=True the row index is
# written as an unnamed first column. The ./cleaned_data directory must already
# exist - to_csv does not create it.
df.to_csv('./cleaned_data/national_cleaned.csv')