In [14]:
#Import dependencies
#Import data
#Preprocess for random forest model
#Scale data
#Bucket employment growth rate for prediction
#Instantiate, fit model, run predictions
#Measure accuracy

In [15]:
#Import dependencies
import pandas as pd
import re
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from scipy import stats
from datetime import datetime
#import tensorflow as tf
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#display options
pd.set_option ('display.max_rows', 10)


In [16]:
#Set path of input file
file_path = "Resources/Cleaned_Data.csv"
#Read input file into a DataFrame
ratings_df = pd.read_csv(file_path)
ratings_df

Unnamed: 0,FIPS,State,County,Deaths,life_lost,fair_poor_health,unhealthy_days,mental_days,low_birth,smoker,...,poor_english,pct_female,pct_rural,employed_2015,employed_2020,emp_growth_rate,pcp_ratio,dentist_ratio,mental_ratio,opcp_ratio
0,1001,Alabama,Autauga,791.0,8129.0,21,4.7,4.7,9.0,18,...,1,51.4,42.0,23986,24580,2.476445,2220.0,3089.0,4277.0,2527.0
1,1003,Alabama,Baldwin,2967.0,7354.0,18,4.2,4.3,8.0,17,...,1,51.5,42.3,85953,98768,14.909311,1372.0,2019.0,1038.0,1787.0
2,1005,Alabama,Barbour,472.0,10254.0,30,5.4,5.2,11.0,22,...,2,47.2,67.8,8597,8707,1.279516,3159.0,2765.0,12441.0,1914.0
3,1007,Alabama,Bibb,471.0,11978.0,19,4.6,4.6,10.0,19,...,0,46.8,68.4,8294,8303,0.108512,2061.0,4480.0,4480.0,896.0
4,1009,Alabama,Blount,1085.0,11335.0,22,4.9,4.9,8.0,19,...,2,50.7,90.0,22189,22836,2.915859,4463.0,5258.0,6427.0,4449.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,Wyoming,Sweetwater,532.0,7832.0,15,3.4,3.6,10.0,18,...,2,48.5,10.9,23010,21572,-6.249457,2721.0,1872.0,395.0,1485.0
3138,56039,Wyoming,Teton,109.0,2731.0,12,3.0,3.2,8.0,15,...,4,48.4,46.4,14298,14909,4.273325,862.0,1154.0,210.0,923.0
3139,56041,Wyoming,Uinta,256.0,7331.0,16,3.6,3.7,10.0,17,...,1,49.3,43.1,10064,9710,-3.517488,2277.0,1128.0,282.0,1561.0
3140,56043,Wyoming,Washakie,110.0,6586.0,16,3.6,3.7,7.0,17,...,0,49.4,36.0,3892,3841,-1.310380,2016.0,1314.0,254.0,1971.0


In [17]:
#inspect data to see which fields are strings that need to be converted to numeric
ratings_df.select_dtypes(include=['object']).columns

Index(['State', 'County'], dtype='object')

In [18]:
#State we can get dummies for, and county is already represented by the FIPS code (which is the unique
#key which won't contribute information anyway) so it can be dropped

preprocessed_df = ratings_df
preprocessed_df.drop(columns=['County'],inplace=True)
preprocessed_df

Unnamed: 0,FIPS,State,Deaths,life_lost,fair_poor_health,unhealthy_days,mental_days,low_birth,smoker,obesity,...,poor_english,pct_female,pct_rural,employed_2015,employed_2020,emp_growth_rate,pcp_ratio,dentist_ratio,mental_ratio,opcp_ratio
0,1001,Alabama,791.0,8129.0,21,4.7,4.7,9.0,18,33,...,1,51.4,42.0,23986,24580,2.476445,2220.0,3089.0,4277.0,2527.0
1,1003,Alabama,2967.0,7354.0,18,4.2,4.3,8.0,17,31,...,1,51.5,42.3,85953,98768,14.909311,1372.0,2019.0,1038.0,1787.0
2,1005,Alabama,472.0,10254.0,30,5.4,5.2,11.0,22,42,...,2,47.2,67.8,8597,8707,1.279516,3159.0,2765.0,12441.0,1914.0
3,1007,Alabama,471.0,11978.0,19,4.6,4.6,10.0,19,38,...,0,46.8,68.4,8294,8303,0.108512,2061.0,4480.0,4480.0,896.0
4,1009,Alabama,1085.0,11335.0,22,4.9,4.9,8.0,19,34,...,2,50.7,90.0,22189,22836,2.915859,4463.0,5258.0,6427.0,4449.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3137,56037,Wyoming,532.0,7832.0,15,3.4,3.6,10.0,18,30,...,2,48.5,10.9,23010,21572,-6.249457,2721.0,1872.0,395.0,1485.0
3138,56039,Wyoming,109.0,2731.0,12,3.0,3.2,8.0,15,12,...,4,48.4,46.4,14298,14909,4.273325,862.0,1154.0,210.0,923.0
3139,56041,Wyoming,256.0,7331.0,16,3.6,3.7,10.0,17,36,...,1,49.3,43.1,10064,9710,-3.517488,2277.0,1128.0,282.0,1561.0
3140,56043,Wyoming,110.0,6586.0,16,3.6,3.7,7.0,17,29,...,0,49.4,36.0,3892,3841,-1.310380,2016.0,1314.0,254.0,1971.0


In [25]:
#Get dummies for State field
df_binary_encoded = pd.get_dummies(preprocessed_df, columns=['State'])
df_binary_encoded.describe()

Unnamed: 0,FIPS,Deaths,life_lost,fair_poor_health,unhealthy_days,mental_days,low_birth,smoker,obesity,food_index,...,State_South Dakota,State_Tennessee,State_Texas,State_Utah,State_Vermont,State_Virginia,State_Washington,State_West Virginia,State_Wisconsin,State_Wyoming
count,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,...,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0,3142.0
mean,30383.649268,1334.75781,8582.946648,17.930617,3.990643,4.167696,8.146293,17.4669,32.901655,7.451361,...,0.021006,0.030236,0.08084,0.00923,0.004456,0.04233,0.012412,0.017505,0.022915,0.00732
std,15162.508374,3189.377652,2480.245236,4.746985,0.705693,0.602053,2.02492,3.626552,5.456978,1.141906,...,0.143426,0.171262,0.272633,0.095643,0.066613,0.201372,0.110735,0.131163,0.149657,0.085258
min,1001.0,32.0,2731.0,8.0,2.4,2.5,3.0,6.0,12.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,18177.5,245.0,6910.5,14.0,3.5,3.7,7.0,15.0,29.0,6.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,29176.0,547.0,8582.946648,17.0,3.9,4.2,8.0,17.0,33.0,7.6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,45080.5,1334.75781,9917.25,21.0,4.4,4.6,9.0,20.0,37.0,8.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
max,56045.0,84296.0,29138.0,41.0,7.1,6.3,24.0,41.0,58.0,10.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [24]:
#Encode object type/string fields with get_dummies
df_binary_encoded = pd.get_dummies(df, columns=str_cols)

ValueError: Expected 2D array, got 1D array instead:
array=['Alabama' 'Alabama' 'Alabama' ... 'Wyoming' 'Wyoming' 'Wyoming'].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.