https://bhw.hrsa.gov/workforce-shortage-areas/shortage-designation 


In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

### Import Data

In [2]:
hpsa = pd.read_csv("./Data/BCD_HPSA_FCT_DET_MH.csv")

### Understanding the Dataset

In [3]:
hpsa.shape

(27813, 65)

In [4]:
hpsa.head()

Unnamed: 0,HPSA Name,HPSA ID,Designation Type,HPSA Discipline Class,HPSA Score,Primary State Abbreviation,HPSA Status,HPSA Designation Date,HPSA Designation Last Update Date,Metropolitan Indicator,...,Provider Type,Rural Status Code,State Abbreviation,State and County Federal Information Processing Standard Code,State FIPS Code,State Name,U.S. - Mexico Border 100 Kilometer Indicator,U.S. - Mexico Border County Indicator,Data Warehouse Record Create Date,Data Warehouse Record Create Date Text
0,Stanley Correctional Institution,7551065910,Correctional Facility,Mental Health,15,WI,Designated,7/21/2003,8/2/2018,Unknown,...,,N,WI,55017,55,Wisconsin,N,N,10/5/2021,10/5/2021
1,Rock County,755105,High Needs Geographic HPSA,Mental Health,13,WI,Withdrawn,4/9/2014,7/2/2018,Unknown,...,Psychiatrist,P,WI,55105,55,Wisconsin,N,N,10/5/2021,10/5/2021
2,Polk County,755095,High Needs Geographic HPSA,Mental Health,16,WI,Withdrawn,12/20/2002,7/2/2018,Unknown,...,Psychiatrist,R,WI,55095,55,Wisconsin,N,N,10/5/2021,10/5/2021
3,Pierce,755093,Geographic HPSA,Mental Health,0,WI,Withdrawn,12/20/2002,6/29/2012,Non-Metropolitan,...,,P,WI,55093,55,Wisconsin,N,N,10/5/2021,10/5/2021
4,Pepin,755091,Geographic HPSA,Mental Health,9,WI,Withdrawn,9/25/1981,6/27/2013,Non-Metropolitan,...,,R,WI,55091,55,Wisconsin,N,N,10/5/2021,10/5/2021


In [5]:
hpsa.describe()

Unnamed: 0,HPSA Score,HPSA FTE,HPSA Designation Population,% of Population Below 100% Poverty,Longitude,Latitude,Common Postal Code,Common State FIPS Code,Discipline Class Number,HPSA Estimated Served Population,HPSA Estimated Underserved Population,HPSA Resident Civilian Population,HPSA Shortage,Primary State FIPS Code,State FIPS Code
count,27813.0,21582.0,26539.0,18159.0,6403.0,6403.0,6149.0,27813.0,27813.0,18194.0,18194.0,3568.0,21039.0,27813.0,27813.0
mean,13.520081,3.527171,139501.5,20.735244,-97.443685,39.459821,60095.553261,26.213713,7.0,81535.42,86223.93,175994.0,6.234354,26.213857,26.213713
std,6.158292,7.08436,181721.4,9.872512,22.83623,7.403072,28130.438774,16.163942,0.0,170831.1,136894.7,210886.1,7.208895,16.163869,16.163942
min,0.0,0.0,0.0,0.0,-176.65757,-14.319,617.0,1.0,7.0,0.0,-1397769.0,0.0,-8.66,1.0,1.0
25%,12.0,0.18,33328.0,14.2,-110.796567,35.060558,38585.0,12.0,7.0,3400.0,29406.0,7400.0,1.3492,12.0,12.0
50%,16.0,1.0,86382.0,19.3,-93.231199,39.069776,62848.0,26.0,7.0,21000.0,64868.0,110520.0,4.07,26.0,26.0
75%,18.0,3.7,186496.0,26.3,-83.623423,42.97986,85132.0,36.0,7.0,82200.0,130543.0,244235.0,8.19,36.0,36.0
max,25.0,133.0,4401127.0,80.0,166.412,71.298967,99950.0,78.0,7.0,1599800.0,2592339.0,3702339.0,86.41,78.0,78.0


In [6]:
hpsa['HPSA Status'].value_counts()

Designated                 11577
Withdrawn                  10883
Proposed For Withdrawal     5353
Name: HPSA Status, dtype: int64

### Remove sparse columns

In [7]:
percent_missing = hpsa.isnull().sum() * 100 / len(hpsa)
missing_value_df = pd.DataFrame({'column_name': hpsa.columns,
                                 'percent_missing': percent_missing})

In [8]:
missing_value_df.sort_values('percent_missing', inplace=True)


In [9]:
missing_value_df[missing_value_df['percent_missing']>0]

Unnamed: 0,column_name,percent_missing
HPSA Designation Population,HPSA Designation Population,4.580592
Rural Status,Rural Status,8.754899
Rural Status Code,Rural Status Code,8.754899
HPSA Degree of Shortage,HPSA Degree of Shortage,20.767267
HPSA FTE,HPSA FTE,22.403193
HPSA Component State Abbreviation,HPSA Component State Abbreviation,23.021609
HPSA Population Type,HPSA Population Type,23.021609
HPSA Population Type Code,HPSA Population Type Code,23.025204
HPSA Shortage,HPSA Shortage,24.355517
HPSA Provider Ratio Goal,HPSA Provider Ratio Goal,30.528889


In [10]:
cols_to_keep = missing_value_df[missing_value_df['percent_missing']<30].index

In [11]:
cols_to_keep = cols_to_keep.insert(0,'Withdrawn Date') # add withdrawn date back into the cols to keep

In [12]:
final_df = hpsa[cols_to_keep]

In [13]:
final_df.shape # final dataframe has 48 columns

(27813, 49)

In [14]:
# create column for time as designated 
final_df[['HPSA Designation Date','Withdrawn Date']]

final_df['DaysBeforeWithdrawn'] = pd.to_datetime(final_df['Withdrawn Date']) - pd.to_datetime(final_df['HPSA Designation Date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.


In [15]:
# drop column with only one value type
final_df = final_df.drop(columns = ['Break in Designation','Discipline Class Number','Data Warehouse Record Create Date Text',
                                    'Data Warehouse Record Create Date','HPSA Discipline Class'])

In [16]:
final_df

Unnamed: 0,Withdrawn Date,HPSA Name,Common State County FIPS Code,Common State FIPS Code,Common State Name,County Equivalent Name,HPSA Component Name,HPSA Component Type Code,HPSA Component Type Description,HPSA Designation Population Type Description,...,HPSA Designation Population,Rural Status,Rural Status Code,HPSA Degree of Shortage,HPSA FTE,HPSA Component State Abbreviation,HPSA Population Type,HPSA Population Type Code,HPSA Shortage,DaysBeforeWithdrawn
0,,Stanley Correctional Institution,55017,55,Wisconsin,Chippewa,Stanley Correctional Institution,UNK,Unknown,Correctional Facility,...,2885.0,Non-Rural,N,6,0.6,,,,0.84,NaT
1,7/2/2018,Rock County,55105,55,Wisconsin,Rock,Rock,SCTY,Single County,Geographic Population,...,156639.0,Partially Rural,P,Not applicable,8.5,WI,Geographic Population,TRC,0.74,1545 days
2,7/2/2018,Polk County,55095,55,Wisconsin,Polk,Polk,SCTY,Single County,Geographic Population,...,43071.0,Rural,R,Not applicable,3.9,WI,Geographic Population,TRC,1.24,5673 days
3,6/29/2012,Pierce,55093,55,Wisconsin,Pierce,Pierce,SCTY,Single County,Geographic Population,...,35886.0,Partially Rural,P,Not applicable,0.0,WI,Geographic Population,TRC,1.20,3479 days
4,6/27/2013,Pepin,55091,55,Wisconsin,Pepin,Pepin,SCTY,Single County,Geographic Population,...,7339.0,Rural,R,Not applicable,0.0,WI,Geographic Population,TRC,0.20,11598 days
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27808,,Saint George Health Clinic,2016,2,Alaska,Aleutians West,Saint George Health Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,528.0,Rural,R,,,,,,,NaT
27809,,AICS Gustavus Community Clinic,2105,2,Alaska,Hoonah-Angoon,AICS Gustavus Community Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,951.0,Rural,R,,,,,,,NaT
27810,,Phillips Ayagnirvik Residential Treatment,2050,2,Alaska,Bethel,Phillips Ayagnirvik Residential Treatment,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,14375.0,Rural,R,,,,,,,NaT
27811,,Portage Creek Village Clinic,2070,2,Alaska,Dillingham,Portage Creek Village Clinic,UNK,Unknown,"Indian Health Service, Tribal Health, and Urba...",...,3889.0,Rural,R,,,,,,,NaT


In [17]:
# save cleaned dataset in ./Data folder 
final_df.to_csv('./Data/HPSA_Cleaned.csv',index=False)