# Data Arrangement

Because the medicare data from cms is poorly labelled, it is hard to write a script to download it. But supposing you have downloaded penetration data and the county-state-plan subscription data, here is a workbook that unzips all the files and makes them into a single panel data set, and saves it in Stata form. So:

In [1]:
import os, zipfile, glob, re

import pandas as pd
import numpy as np

In [6]:
dude = 'mjbaker'

dir_name_1 = 'C:\\Users\\' + dude + '\\Documents\\Medicare\\Markets\\'
dir_name_2 = 'C:\\Users\\' + dude + '\\Documents\\Medicare\\Shares\\'
extension = ".zip"

In [7]:
os.chdir(dir_name_1) # change directory from working dir to dir with files

for item in os.listdir(dir_name_1):            # loop through items in dir
    if item.endswith(extension):               # check for ".zip" extension
        file_name = os.path.abspath(item)      # get full path of files
        zip_ref = zipfile.ZipFile(file_name)   # create zipfile object
        zip_ref.extractall(dir_name_1)         # extract file to dir
        zip_ref.close()                        # close file

In [8]:
os.chdir(dir_name_2)

for item in os.listdir(dir_name_2):
    if item.endswith(extension):
        file_name = os.path.abspath(item)
        zip_ref = zipfile.ZipFile(file_name)
        zip_ref.extractall(dir_name_2)
        zip_ref.close()

In [9]:
current_dir = os.getcwd()

os.chdir(dir_name_1)

file_list_1 = []
for file in glob.glob("*.csv"):
    file_list_1.append(file)

In [10]:
os.chdir(dir_name_2)

file_list_2 = []
for file in glob.glob("*.csv"):
    file_list_2.append(file)

# Making DataFrames

In [11]:
pendata = pd.read_csv(dir_name_1 + file_list_1[0])
pendata['fn'] = file_list_1[0]

for file in file_list_1:
    df = pd.read_csv(dir_name_1 + file)
    df['fn'] = file
    pendata = pendata.append(df)

# Read the Pickle...if previously made

In [14]:
try:
    shrdata = pd.read_pickle('C:\\Users\\matthew\\downloads\\tempshr.pkl')
    print('File exists and is read in...')
except:
    shrdata = pd.read_csv(dir_name_2 + file_list_2[0])
    shrdata['fn'] = file_list_2[0]

    for file in file_list_2:
        print(len(shrdata), end='.')
        df = pd.read_csv(dir_name_2 + file)
        df['fn'] = file
        shrdata = shrdata.append(df)
    
    shrdata.to_pickle('C:\\Users\\' + dude + '\\downloads\\tempshr.pkl')

237662.475324.713237.951155.1189435.1427774.1665374.1903414.2393560.2883863.3373795.3863219.4352283.4841683.5331169.5820494.6309929.6799186.7288466.7778162.8237184.8697074.9156317.9615433.10074526.10520028.10979413.11438897.11898667.12358379.12818199.13278865.13690062.14100568.14510731.14920373.15329147.15738408.16147741.16556698.16965769.17374539.17783438.18193401.18599097.19003627.19407393.19808767.20209839.20609616.21007892.21406811.21806330.22205315.22604276.23004725.23408002.23809869.24211454.24612365.25012666.25413609.25814461.26215524.26617540.27019500.27421272.27824791.28248601.28670547.29091403.29510828.29930432.30351255.30771933.31193113.31615531.32037730.32459828.32883656.33286947.33688644.34090380.34491370.34892271.35294055.35695664.36097354.36500373.36903221.37306156.37710598.38061664.38411058.38760624.39108925.39457008.39805819.40154199.40503017.40852715.41202272.41551980.41903102.42251001.42596332.42941888.43286212.43630216.43975162.44319463.44664236.45009831.45354706.45

# Taking a look at the data

We first see that our data sets are quite long! Unfortunately, we need to make them a little longer by adding in information about the year and the month. Let's first see if we don't need some of the observations.

In [15]:
shrdata.head()

Unnamed: 0,County,State,Contract ID,Organization Name,Organization Type,Plan Type,SSA Code,FIPS Code,Enrolled,fn
0,Autauga,AL,90091,UNITED MINE WORKERS OF AMERICA HEALTH & RETIRE...,HCPP - 1833 Cost,HCPP - 1833 Cost,1000,1001.0,,SCC_Enrollment_MA_2008_06.csv
1,Autauga,AL,E5088,DESERET HEALTHCARE EMPLOYEE BENEFITS TRUST,Employer/Union Only Direct Contract PFFS,Employer/Union Only Direct Contract PFFS,1000,1001.0,,SCC_Enrollment_MA_2008_06.csv
2,Autauga,AL,H0087,HEALTH ALLIANCE MEDICAL PLANS,PFFS,PFFS,1000,1001.0,,SCC_Enrollment_MA_2008_06.csv
3,Autauga,AL,H0104,BLUE CROSS AND BLUE SHIELD OF ALABAMA,Local CCP,Local PPO,1000,1001.0,593.0,SCC_Enrollment_MA_2008_06.csv
4,Autauga,AL,H0150,"HEALTHSPRING OF ALABAMA, INC.",Local CCP,HMO/HMOPOS,1000,1001.0,218.0,SCC_Enrollment_MA_2008_06.csv


We will use the file names that we affixed to get a year and a date for each observation as we need this to merge on. In any event:

In [16]:
years  = [re.findall(r"\d{4}", file)[0] for file in file_list_2]
years  = [float(item) for item in years]

months = [re.findall(r"\_\d{2}\.", file)[0][1:3] for file in file_list_2]
months = [float(item) for item in months]

In [17]:
shr_data_year_dict = dict(zip(file_list_2, years))
shr_data_mont_dict = dict(zip(file_list_2, months))

In [18]:
shrdata['year'] = shrdata['fn'].map(shr_data_year_dict)

In [19]:
shrdata['month'] = shrdata['fn'].map(shr_data_mont_dict)

In [20]:
shrdata.drop(['fn'], axis=1, inplace=True)

In [21]:
years = [re.findall(r"\d{4}", file)[0] for file in file_list_1]
years = [float(item) for item in years]

months = [re.findall(r"\_\d{2}\.", file)[0][1:3] for file in file_list_1]
months = [float(item) for item in months]


In [22]:
pen_data_year_dict = dict(zip(file_list_1, years))
pen_data_mont_dict = dict(zip(file_list_1, months))

pendata['year']  = pendata['fn'].map(pen_data_year_dict)
pendata['month'] = pendata['fn'].map(pen_data_mont_dict) 

pendata.drop(['fn'], axis=1, inplace=True)

# Merging Data

Before, we merged on state, county, and other stuff. I guess we could try this on the codes and all that. The problem is that there are FIPS codes that mix floats and strings. I don't trust them so I will just do the following:

In [23]:
state_match = {'Alabama':'AL', 'Alaska':'AK', 'American Samoa':'AS', 'Arizona':'AZ', 'Arkansas':'AR', 'California':'CA',
 'Colorado':'CO', 'Connecticut':'CT', 'Delaware':'DE', 'District Of Columbia':'DC',
 'Florida':'FL', 'Georgia':'GA', 'Guam':'GU', 'Hawaii':'HI', 'Idaho':'ID', 'Illinois':'IL',
 'Indiana':'IN', 'Iowa':'IA', 'Kansas':'KS', 'Kentucky':'KY', 'Louisiana':'LA', 'Maine':'ME', 'Maryland':'MD', 
 'Massachusetts':'MA',
 'Michigan':'MI', 'Minnesota':'MN', 'Mississippi':'MS', 'Missouri':'MO', 'Montana':'MT', 'Nebraska':'NE', 'Nevada':'NV', 
 'New Hampshire':'NH', 'New Jersey':'NJ', 'New Mexico':'NM',
 'New York':'NY', 'North Carolina':'NC', 'North Dakota':'ND', 'Ohio':'OH',
 'Oklahoma':'OK', 'Oregon':'OR', 'Pending State Designation':'GB', 'Pennsylvania':'PA', 'Puerto Rico':'PR', 
 'Rhode Island':'RI', 'South Carolina':'SC',
 'South Dakota':'SD', 'Tennessee':'TN', 'Texas':'TX', 'Utah':'UT', 'Vermont':'VT', 'Virgin Islands':'VI',
 'Virginia':'VA',
 'Wake Island':'QW', 'Washington':'WA', 'Washington D.C.':'DC', 'West Virginia':'WV', 'Wisconsin':'WI', 'Wyoming':'WY'}

In [24]:
pendata['State'] = pendata['State Name'].map(state_match)

In [25]:
pendata.rename(columns={'County Name': 'County'}, inplace=True)

We now should be able to merge on County, State, year and month...which will probably take a while! However, there seems to be just too much data to actually pull this off. So, let's try and drop nan variables. First, what do we gain by doing this? 

In [26]:
shrdata['Enrolled'].notna().sum()

3150353

In [27]:
shrdata = shrdata.loc[shrdata['Enrolled'].notna()]

In [28]:
shrdata = pd.merge(shrdata, pendata, on=['State', 'County', 'year', 'month'], how='outer', indicator=True)

In [29]:
ro = shrdata[['County', 'State', 'year', 'month']].loc[shrdata['_merge'] == 'right_only']
lo = shrdata[['County', 'State', 'year', 'month']].loc[shrdata['_merge'] == 'left_only']

In [30]:
stateco = ro['County'] + ' ' + ro['State']
stateco2 = lo['County'] + ' ' + lo['State']

From the looks of the above, it seems as though most of the data from the left is out of the united states proper. It could be that no medicare advantage plans are offered in the rest of the places. 

In [31]:
obj_cols = list(shrdata.select_dtypes(include=['object']).columns)

for col in obj_cols:
    shrdata[col] = shrdata[col].astype(str)

In [33]:
shrdata.to_stata('C:\\Users\\'+ dude+ '\\downloads\\medicare.dta')