In [2]:
import pandas as pd

In [8]:
# Directory holding the normalized CSV tables. Defined once so that moving the
# data only requires changing this single value.
NORMALIZED_DIR = '../data/normalized_tables'

# Load each normalized table into its own DataFrame.
plans = pd.read_csv(f'{NORMALIZED_DIR}/plans.csv')
benefits = pd.read_csv(f'{NORMALIZED_DIR}/benefits.csv')
premium = pd.read_csv(f'{NORMALIZED_DIR}/premium.csv')
county_area = pd.read_csv(f'{NORMALIZED_DIR}/county_area_crosswalk.csv')
zip_fips = pd.read_csv(f'{NORMALIZED_DIR}/zip_fips_crosswalk.csv')
state_age = pd.read_csv(f'{NORMALIZED_DIR}/state_age_curve.csv')
deductibles = pd.read_csv(f'{NORMALIZED_DIR}/deductibles.csv')
moop = pd.read_csv(f'{NORMALIZED_DIR}/moop.csv')

In [10]:
# Columns of the benefits table that are treated as boolean-style flags.
benefits_bool_cols = [
    'LIMITED',
    'CopayInnTier1Complex',
    'CopayInnTier2Complex',
    'CoinsInnTier1Complex',
    'CoinsInnTier2Complex',
    'CopayOutofNetComplex',
    'CoinsOutofNetComplex',
]

# Show the distinct values currently stored in each flag column.
for col in benefits_bool_cols:
    distinct_values = benefits[col].unique()
    print(f"{col} unique values:", distinct_values)

LIMITED unique values: [ 0. nan  1.]
CopayInnTier1Complex unique values: [ 0. nan  1.]
CopayInnTier2Complex unique values: [ 0. nan  1.]
CoinsInnTier1Complex unique values: [ 0. nan  1.]
CoinsInnTier2Complex unique values: [ 0. nan]
CopayOutofNetComplex unique values: [ 0. nan  1.]
CoinsOutofNetComplex unique values: [ 0. nan]


In [18]:
# Coerce each flag column to numeric, substitute the code 99 ("unknown") for
# missing values, and store the result as integers instead of floats.
for col in benefits_bool_cols:
    as_numeric = pd.to_numeric(benefits[col], errors='coerce')
    benefits[col] = as_numeric.fillna(99).astype(int)

In [16]:
# Re-inspect the flag columns to see which distinct values they hold now.
for col in benefits_bool_cols:
    distinct_values = benefits[col].unique()
    print(f"{col} unique values:", distinct_values)

LIMITED unique values: [ 0 99  1]
CopayInnTier1Complex unique values: [ 0 99  1]
CopayInnTier2Complex unique values: [ 0 99  1]
CoinsInnTier1Complex unique values: [ 0 99  1]
CoinsInnTier2Complex unique values: [ 0 99]
CopayOutofNetComplex unique values: [ 0 99  1]
CoinsOutofNetComplex unique values: [ 0 99]


In [26]:
# Columns of the benefits table that need an integer datatype in our database.
benefits_int_cols = [
    'LIMITED',
    'CopayInn_TIERS',
    'CopayInnTier1Complex',
    'CopayInnTier1',
    'CopayInnTier2Complex',
    'CopayInnTier2',
    'CoinsInn_TIERS',
    'CoinsInnTier1Complex',
    'CoinsInnTier1',
    'CoinsInnTier2Complex',
    'CoinsInnTier2',
    'CopayOutofNetComplex',
    'CopayOutofNet',
    'CoinsOutofNetComplex',
    'CoinsOutofNet',
]

# Show the distinct values currently stored in each of these columns.
for col in benefits_int_cols:
    distinct_values = benefits[col].unique()
    print(f"{col} unique values:", distinct_values)

LIMITED unique values: [ 0 99  1]
CopayInn_TIERS unique values: [ 1.  2. nan  3.]
CopayInnTier1Complex unique values: [ 0 99  1]
CopayInnTier1 unique values: [ 0.  3.  4.  2.  1. nan  6.  8.]
CopayInnTier2Complex unique values: [ 0 99  1]
CopayInnTier2 unique values: [ 0.  3.  2. nan  1.  4.  6.  8.]
CoinsInn_TIERS unique values: [ 1.  2. nan  3.]
CoinsInnTier1Complex unique values: [ 0 99  1]
CoinsInnTier1 unique values: [ 4.  3.  0. nan]
CoinsInnTier2Complex unique values: [ 0 99]
CoinsInnTier2 unique values: [ 0.  4.  3. nan]
CopayOutofNetComplex unique values: [ 0 99  1]
CopayOutofNet unique values: [ 0.  3.  4.  2.  1. nan  8.]
CoinsOutofNetComplex unique values: [ 0 99]
CoinsOutofNet unique values: [ 4.  3.  0. nan]


In [28]:
# Coerce each integer-typed column to numeric, substitute the code 99
# ("unknown") for missing values, and store the result as integers.
for col in benefits_int_cols:
    as_numeric = pd.to_numeric(benefits[col], errors='coerce')
    benefits[col] = as_numeric.fillna(99).astype(int)

In [34]:
# Write the cleaned benefits table back over the CSV it was loaded from;
# index=False keeps the DataFrame index out of the file.
benefits.to_csv('../data/normalized_tables/benefits.csv', index=False)

In [36]:
# Preview the cleaned table. Columns that were not recoded may still contain
# NaN; that is acceptable for columns whose datatype should stay numeric.
benefits.head()

Unnamed: 0,PLANID,AREA,LIMITED,CopayInn_TIERS,CopayInnTier1Complex,CopayInnTier1,CopayInnTier1A,CopayInnTier2Complex,CopayInnTier2,CopayInnTier2A,...,CoinsInnTier2Complex,CoinsInnTier2,CoinsInnTier2A,CopayOutofNetComplex,CopayOutofNet,CopayOutofNetA,CoinsOutofNetComplex,CoinsOutofNet,CoinsOutofNetA,benefit_code
0,73836AK0950001,AK01,0,1,0,0,,0,0,,...,0,0,,0,0,,0,4,40.0,AB
1,73836AK0950001,AK02,0,1,0,0,,0,0,,...,0,0,,0,0,,0,4,40.0,AB
2,73836AK0930001,AK02,0,1,0,0,,0,0,,...,0,0,,0,0,,0,4,30.0,AB
3,73836AK0950001,AK03,0,1,0,0,,0,0,,...,0,0,,0,0,,0,4,40.0,AB
4,73836AK0930001,AK03,0,1,0,0,,0,0,,...,0,0,,0,0,,0,4,30.0,AB


In [44]:
# PREMI21_BASE was originally computed as PREMI27 / (multiplier for 27), so it
# is stored with a long decimal expansion.

premium.head()

Unnamed: 0,PLANID,AREA,ST,PREMI21_BASE,PREMI2C30,PREMC2C30
0,73836AK0950001,AK01,AK,789.122137,2104.0,3000.0
1,73836AK0950001,AK02,AK,830.152672,2212.0,3154.0
2,73836AK0930001,AK02,AK,745.229008,1985.0,2830.0
3,73836AK0950001,AK03,AK,808.206107,2153.0,3070.0
4,73836AK0930001,AK03,AK,725.19084,1933.0,2756.0


In [46]:
# PREMI21_BASE is a dollar value, so keep only two decimal places. Note that
# .round() rounds to the nearest cent; it does not always round up.
rounded_base = premium['PREMI21_BASE'].round(decimals=2)
premium['PREMI21_BASE'] = rounded_base

# Persist the rounded premiums to the normalized table, without the index.
premium.to_csv('../data/normalized_tables/premium.csv', index=False)

In [54]:
# At first glance, the Kaggle ZIP-COUNTY-FIPS crosswalk dataset appeared to
# contain only unique ZIP values.

zip_fips.head()

Unnamed: 0,ZIP,COUNTY,ST,FIPS
0,36003,Autauga County,AL,1001
1,36006,Autauga County,AL,1001
2,36067,Autauga County,AL,1001
3,36066,Autauga County,AL,1001
4,36703,Autauga County,AL,1001


In [84]:
# ZIP values are in fact repeated, perhaps because one ZIP code can be linked
# to two counties. For every column, count the rows whose value has already
# appeared on an earlier row.
for col in zip_fips.columns:
    repeated_mask = zip_fips[col].duplicated()
    col_duplicates = zip_fips.loc[repeated_mask]
    print(f"Number of duplicate {col}: {len(col_duplicates)}")

Number of duplicate ZIP: 13433
Number of duplicate COUNTY: 50931
Number of duplicate ST: 52835
Number of duplicate FIPS: 49666


In [64]:
# Example of a repeated ZIP: display every row whose ZIP equals 36053.
zip_fips.loc[zip_fips['ZIP'].eq(36053)]

Unnamed: 0,ZIP,COUNTY,ST,FIPS
46,36053,Barbour County,AL,1005
85,36053,Bullock County,AL,1011


In [88]:
# Check whether (ZIP, COUNTY) can act as a composite key for the crosswalk.
# duplicated() flags every row whose (ZIP, COUNTY) pair already appeared on an
# earlier row, so the pair is unique exactly when no row is flagged.
has_duplicate_pairs = zip_fips.duplicated(subset=['ZIP', 'COUNTY']).any()
# Previously `is_unique` held the "has duplicates" flag and was negated only
# when printed; it now holds what its name says.
is_unique = not has_duplicate_pairs
print("Is (ZIP, COUNTY) unique for all rows?:", is_unique)

Is (ZIP, COUNTY) unique for all rows?: True


In [72]:
# Preview the county/area crosswalk loaded from county_area_crosswalk.csv.
county_area.head()

Unnamed: 0,FIPS,COUNTY,AREA_COUNT,AREA
0,1001,Autauga County,1,AL11
1,1003,Baldwin County,1,AL13
2,1005,Barbour County,1,AL13
3,1007,Bibb County,1,AL03
4,1009,Blount County,1,AL03


In [80]:
# Collect the FIPS entries that repeat a value seen on an earlier row, then
# print each such FIPS code once.
fips_codes = county_area['FIPS']
repeated_fips = fips_codes.loc[fips_codes.duplicated()]
print(repeated_fips.unique())

[ 2013  2016  2020  2050  2063  2070  2122  2150  2158  2164  2290  6037
 25005 25007 25013 25017 25021 25023 25027]


In [86]:
# For every column of the county/area crosswalk, count the rows whose value
# has already appeared on an earlier row.
for col in county_area.columns:
    repeated_mask = county_area[col].duplicated()
    col_duplicates = county_area.loc[repeated_mask]
    print(f"Number of duplicate {col}: {len(col_duplicates)}")

Number of duplicate FIPS: 24
Number of duplicate COUNTY: 1289
Number of duplicate AREA_COUNT: 3163
Number of duplicate AREA: 2663
