https://bhw.hrsa.gov/workforce-shortage-areas/shortage-designation 


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Import Data

In [2]:
hpsa = pd.read_csv("./Data/BCD_HPSA_FCT_DET_MH.csv")

### Understanding the Dataset

In [3]:
hpsa.shape

(27831, 66)

In [4]:
hpsa.head()

Unnamed: 0,HPSA Name,HPSA ID,Designation Type,HPSA Discipline Class,HPSA Score,Primary State Abbreviation,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,Metropolitan Indicator,...,Rural Status Code,State Abbreviation,State and County Federal Information Processing Standard Code,State FIPS Code,State Name,U.S. - Mexico Border 100 Kilometer Indicator,U.S. - Mexico Border County Indicator,Data Warehouse Record Create Date,Data Warehouse Record Create Date Text,Unnamed: 65
0,Stanley Correctional Institution,7551065910,Correctional Facility,Mental Health,15,WI,Designated,07/21/2003,08/02/2018,Unknown,...,N,WI,55017,55,Wisconsin,N,N,10/18/2021,2021/10/18,
1,Rock County,755105,High Needs Geographic HPSA,Mental Health,13,WI,Withdrawn,04/09/2014,07/02/2018,Unknown,...,P,WI,55105,55,Wisconsin,N,N,10/18/2021,2021/10/18,
2,Polk County,755095,High Needs Geographic HPSA,Mental Health,16,WI,Withdrawn,12/20/2002,07/02/2018,Unknown,...,R,WI,55095,55,Wisconsin,N,N,10/18/2021,2021/10/18,
3,Pierce,755093,Geographic HPSA,Mental Health,0,WI,Withdrawn,12/20/2002,06/29/2012,Non-Metropolitan,...,P,WI,55093,55,Wisconsin,N,N,10/18/2021,2021/10/18,
4,Pepin,755091,Geographic HPSA,Mental Health,9,WI,Withdrawn,09/25/1981,06/27/2013,Non-Metropolitan,...,R,WI,55091,55,Wisconsin,N,N,10/18/2021,2021/10/18,


In [5]:
hpsa.describe()

Unnamed: 0,HPSA Score,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,Longitude,Latitude,Common Postal Code,Common State FIPS Code,Discipline Class Number,HPSA Estimated Served Population,HPSA Estimated Underserved Population,HPSA Resident Civilian Population,HPSA Shortage,Primary State FIPS Code,State FIPS Code,Unnamed: 65
count,27831.0,21600.0,26557.0,18177.0,6403.0,6403.0,6149.0,27831.0,27831.0,18212.0,18212.0,3568.0,21057.0,27831.0,27831.0,0.0
mean,13.520858,3.52613,139489.7,20.717561,-97.443685,39.459821,60095.553261,26.22051,7.0,81500.49,86213.71,175994.0,6.233885,26.220653,26.22051,
std,6.15564,7.08191,181676.7,9.85595,22.83623,7.403072,28130.438774,16.164099,0.0,170758.3,136834.5,210886.1,7.206671,16.164026,16.164099,
min,0.0,0.0,0.0,0.0,-176.65757,-14.319,617.0,1.0,7.0,0.0,-1397769.0,0.0,-8.66,1.0,1.0,
25%,12.0,0.17,33328.0,14.2,-110.796567,35.060558,38585.0,12.0,7.0,3200.0,29355.5,7400.0,1.3492,12.0,12.0,
50%,16.0,1.0,86382.0,19.3,-93.231199,39.069776,62848.0,26.0,7.0,21000.0,64915.5,110520.0,4.08,26.0,26.0,
75%,18.0,3.7,186496.0,26.3,-83.623423,42.97986,85132.0,36.0,7.0,82200.0,129171.0,244235.0,8.19,36.0,36.0,
max,25.0,133.0,4401127.0,80.0,166.412,71.298967,99950.0,78.0,7.0,1599800.0,2592339.0,3702339.0,86.41,78.0,78.0,


In [6]:
hpsa['HPSA Status'].value_counts()

Designated                 11631
Withdrawn                  10883
Proposed For Withdrawal     5317
Name: HPSA Status, dtype: int64

### Remove sparse columns

In [7]:
percent_missing = hpsa.isnull().sum() * 100 / len(hpsa)
missing_value_df = pd.DataFrame({'column_name': hpsa.columns,
                                 'percent_missing': percent_missing})

In [8]:
missing_value_df.sort_values('percent_missing', inplace=True)


In [9]:
missing_value_df[missing_value_df['percent_missing']>0]

Unnamed: 0,column_name,percent_missing
HPSA Designation Population,HPSA Designation Population,4.577629
Rural Status,Rural Status,8.749236
Rural Status Code,Rural Status Code,8.749236
HPSA Degree of Shortage,HPSA Degree of Shortage,20.753836
HPSA FTE,HPSA FTE,22.388703
HPSA Component State Abbreviation,HPSA Component State Abbreviation,23.006719
HPSA Population Type,HPSA Population Type,23.006719
HPSA Population Type Code,HPSA Population Type Code,23.010312
HPSA Shortage,HPSA Shortage,24.339765
HPSA Provider Ratio Goal,HPSA Provider Ratio Goal,30.509144


In [10]:
cols_to_keep = missing_value_df[missing_value_df['percent_missing']<30].index

In [11]:
cols_to_keep = cols_to_keep.insert(0,'Withdrawn Date') # add withdrawn date back into the cols to keep

In [12]:
final_df = hpsa[cols_to_keep]

In [13]:
final_df.shape # final dataframe has 48 columns

(27831, 49)

In [14]:
# create column for time as designated 
final_df[['HPSA Designation Date','Withdrawn Date']]

final_df['DaysBeforeWithdrawn'] = pd.to_datetime(final_df['Withdrawn Date']) - pd.to_datetime(final_df['HPSA Designation Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df['DaysBeforeWithdrawn'] = pd.to_datetime(final_df['Withdrawn Date']) - pd.to_datetime(final_df['HPSA Designation Date'])


In [15]:
# drop column with only one value type
final_df = final_df.drop(columns = ['Break in Designation','Discipline Class Number','Data Warehouse Record Create Date Text',
                                    'Data Warehouse Record Create Date','HPSA Discipline Class'])

In [16]:
final_df

Unnamed: 0,Withdrawn Date,HPSA Name,Common State County FIPS Code,Common State FIPS Code,Common State Name,County Equivalent Name,HPSA Component Name,HPSA Component Type Code,HPSA Component Type Description,HPSA Designation Population Type Description,...,HPSA Designation Population,Rural Status,Rural Status Code,HPSA Degree of Shortage,HPSA FTE,HPSA Component State Abbreviation,HPSA Population Type,HPSA Population Type Code,HPSA Shortage,DaysBeforeWithdrawn
0,,Stanley Correctional Institution,55017,55,Wisconsin,Chippewa,Stanley Correctional Institution,UNK,Unknown,Correctional Facility,...,2885.0,Non-Rural,N,6,0.6,,,,0.84,NaT
1,07/02/2018,Rock County,55105,55,Wisconsin,Rock,Rock,SCTY,Single County,Geographic Population,...,156639.0,Partially Rural,P,Not applicable,8.5,WI,Geographic Population,TRC,0.74,1545 days
2,07/02/2018,Polk County,55095,55,Wisconsin,Polk,Polk,SCTY,Single County,Geographic Population,...,43071.0,Rural,R,Not applicable,3.9,WI,Geographic Population,TRC,1.24,5673 days
3,06/29/2012,Pierce,55093,55,Wisconsin,Pierce,Pierce,SCTY,Single County,Geographic Population,...,35886.0,Partially Rural,P,Not applicable,0.0,WI,Geographic Population,TRC,1.20,3479 days
4,06/27/2013,Pepin,55091,55,Wisconsin,Pepin,Pepin,SCTY,Single County,Geographic Population,...,7339.0,Rural,R,Not applicable,0.0,WI,Geographic Population,TRC,0.20,11598 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27826,,Kokhanok Village Clinic,02164,2,Alaska,Lake and Peninsula,Kokhanok Village Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,330.0,Rural,R,,,,,,,NaT
27827,,Lime Village Clinic,02050,2,Alaska,Bethel,Lime Village Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,1235.0,Rural,R,,,,,,,NaT
27828,,Yukon-Koyukuk Census Area,02290,2,Alaska,Yukon-Koyukuk,Yukon-Koyukuk Census Area,SCTY,Single County,Geographic Population,...,5590.0,Rural,R,Not applicable,0.0,AK,Geographic Population,TRC,0.28,NaT
27829,,Chignik Bay Subregional Clinic,02164,2,Alaska,Lake and Peninsula,Chignik Bay Subregional Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,1041.0,Rural,R,,,,,,,NaT


In [17]:
cbsa = pd.read_csv('./Data/cbsa2fipsxw.csv')
cbsa.head()

Unnamed: 0,cbsacode,metrodivisioncode,csacode,cbsatitle,metropolitanmicropolitanstatis,metropolitandivisiontitle,csatitle,countycountyequivalent,statename,fipsstatecode,fipscountycode,centraloutlyingcounty
0,,,,,,,,,,,,
1,33860.0,,,"Montgomery, AL",Metropolitan Statistical Area,,,Autauga County,Alabama,1.0,1.0,Central
2,19300.0,,380.0,"Daphne-Fairhope-Foley, AL",Metropolitan Statistical Area,,"Mobile-Daphne-Fairhope, AL",Baldwin County,Alabama,1.0,3.0,Central
3,13820.0,,142.0,"Birmingham-Hoover, AL",Metropolitan Statistical Area,,"Birmingham-Hoover-Talladega, AL",Bibb County,Alabama,1.0,7.0,Outlying
4,13820.0,,142.0,"Birmingham-Hoover, AL",Metropolitan Statistical Area,,"Birmingham-Hoover-Talladega, AL",Blount County,Alabama,1.0,9.0,Outlying


In [18]:
final_df['County Equivalent Name New'] = final_df['County Equivalent Name'] + ' County' # add the word county to the end of the each county name

In [19]:
cbsa_data = cbsa[['countycountyequivalent', 'centraloutlyingcounty', 'statename', 'cbsacode','metropolitanmicropolitanstatis']]

final_df2 = final_df.merge(cbsa_data, how='left', left_on=['County Equivalent Name New','State Name'], right_on=['countycountyequivalent','statename'])

final_df2.head()

Unnamed: 0,Withdrawn Date,HPSA Name,Common State County FIPS Code,Common State FIPS Code,Common State Name,County Equivalent Name,HPSA Component Name,HPSA Component Type Code,HPSA Component Type Description,HPSA Designation Population Type Description,...,HPSA Population Type,HPSA Population Type Code,HPSA Shortage,DaysBeforeWithdrawn,County Equivalent Name New,countycountyequivalent,centraloutlyingcounty,statename,cbsacode,metropolitanmicropolitanstatis
0,,Stanley Correctional Institution,55017,55,Wisconsin,Chippewa,Stanley Correctional Institution,UNK,Unknown,Correctional Facility,...,,,0.84,NaT,Chippewa County,Chippewa County,Central,Wisconsin,20740.0,Metropolitan Statistical Area
1,07/02/2018,Rock County,55105,55,Wisconsin,Rock,Rock,SCTY,Single County,Geographic Population,...,Geographic Population,TRC,0.74,1545 days,Rock County,Rock County,Central,Wisconsin,27500.0,Metropolitan Statistical Area
2,07/02/2018,Polk County,55095,55,Wisconsin,Polk,Polk,SCTY,Single County,Geographic Population,...,Geographic Population,TRC,1.24,5673 days,Polk County,,,,,
3,06/29/2012,Pierce,55093,55,Wisconsin,Pierce,Pierce,SCTY,Single County,Geographic Population,...,Geographic Population,TRC,1.2,3479 days,Pierce County,Pierce County,Outlying,Wisconsin,33460.0,Metropolitan Statistical Area
4,06/27/2013,Pepin,55091,55,Wisconsin,Pepin,Pepin,SCTY,Single County,Geographic Population,...,Geographic Population,TRC,0.2,11598 days,Pepin County,,,,,


In [20]:
# add PDEN10 Mapper 

final_df2['PDEN10'] = np.where(final_df2['metropolitanmicropolitanstatis'] == 'Metropolitan Statistical Area',1,
                                np.where(final_df2['metropolitanmicropolitanstatis'] == 'Micropolitan Statistical Area',
                                2,3))
final_df2['PDEN10'].value_counts()

1    19142
3     6041
2     2648
Name: PDEN10, dtype: int64

In [21]:
# # save cleaned dataset in ./Data folder 
final_df2.to_csv('./Data/HPSA_Cleaned.csv',index=False)