# Covid 19 Dataset

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from scipy import stats
from sklearn.linear_model import LinearRegression, LassoLars, TweedieRegressor


## Acquire

In [2]:
df = pd.read_csv(r'/Users/malachihale/codeup-data-science/covid-19-data/covid_data_log_200922.csv')

In [3]:
df.head()

Unnamed: 0,FIPS,stateFIPS,countyFIPS_2d,County,State,Cases,Deaths,Poverty,Population,W_Male,...,H_Male,H_Female,I_Male,I_Female,A_Male,A_Female,NH_Male,NH_Female,Risk_Index,Risk_Cat
0,1001,1,1,Autauga County,AL,10.590264,6.812345,10.916415,10.930765,10.65136,...,7.903227,7.787797,5.978886,6.100319,6.588926,6.778785,4.465908,4.317488,65.42,Above Average
1,1003,1,3,Baldwin County,AL,11.251171,6.864848,12.279579,12.315976,12.162852,...,9.734477,9.629182,7.908755,7.872455,7.756623,8.129764,5.537334,5.587249,68.39,Above Average
2,1005,1,5,Barbour County,AL,10.111517,5.043425,9.997843,10.113992,9.465448,...,7.552762,7.291656,5.652489,5.204007,4.844187,4.94876,4.276666,3.713572,97.09,High
3,1007,1,7,Bibb County,AL,9.520469,4.634729,9.914032,10.016548,9.777641,...,6.944087,6.746412,5.068904,5.01728,4.276666,4.290459,3.912023,2.772589,83.36,Above Average
4,1009,1,9,Blount County,AL,9.86843,4.406719,10.954973,10.965194,10.912649,...,9.09257,8.980424,6.383507,6.393591,5.446737,5.57973,4.624973,4.127134,81.75,Above Average


## Prepare

In [4]:
df.columns

Index(['FIPS', 'stateFIPS', 'countyFIPS_2d', 'County', 'State', 'Cases',
       'Deaths', 'Poverty', 'Population', 'W_Male', 'W_Female', 'B_Male',
       'B_Female', 'H_Male', 'H_Female', 'I_Male', 'I_Female', 'A_Male',
       'A_Female', 'NH_Male', 'NH_Female', 'Risk_Index', 'Risk_Cat'],
      dtype='object')

In [5]:
for col in df.columns:
    print(df[col].isnull().values.any())

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False


In [6]:
#Train test split
train_validate, test = train_test_split(df, test_size=.2, random_state=123, stratify=df.Risk_Cat)

In [7]:
train_validate.head()

Unnamed: 0,FIPS,stateFIPS,countyFIPS_2d,County,State,Cases,Deaths,Poverty,Population,W_Male,...,H_Male,H_Female,I_Male,I_Female,A_Male,A_Female,NH_Male,NH_Female,Risk_Index,Risk_Cat
1473,28145,28,145,Union County,MS,9.612467,6.558198,10.248849,10.268651,10.052338,...,7.694848,7.469654,5.159055,5.09375,6.115892,5.726848,3.828641,3.295837,80.54,Above Average
105,4017,4,17,Navajo County,AZ,12.518,9.116579,11.593703,11.616601,10.972551,...,9.964912,9.768755,10.814846,10.879047,6.668228,6.886532,5.828946,5.303305,61.21,Above Average
2244,42001,42,1,Adams County,PA,10.284933,6.866933,11.498451,11.542572,11.4826,...,9.398147,9.279866,6.4708,6.423247,6.841615,7.105786,4.477337,4.276666,33.62,Below Average
3117,55139,55,139,Winnebago County,WI,10.735113,6.666957,12.002732,12.054709,11.976603,...,9.35876,9.325364,7.537963,7.393878,8.614864,8.686936,5.153292,5.030438,20.82,Below Average
19,1039,1,39,Covington County,AL,10.087724,6.315358,10.501252,10.519997,10.334328,...,6.955593,6.96319,6.035481,5.968708,5.32301,5.407172,2.639057,2.564949,86.51,High


In [8]:
train, validate = train_test_split(train_validate, 
                                       test_size=.3, 
                                       random_state=123, 
                                       stratify=train_validate.Risk_Cat)

The DataFrame contains no null values. 

In [9]:
scaler = MinMaxScaler()

In [10]:
X =  train[['Poverty', 'Population', 'W_Male', 'W_Female', 'B_Male',
       'B_Female', 'H_Male', 'H_Female', 'I_Male', 'I_Female', 'A_Male',
       'A_Female', 'NH_Male', 'NH_Female']]

In [11]:
scaler.fit(X)

MinMaxScaler()

In [12]:
X_scaled = scaler.transform(X)

In [15]:
X_scaled = pd.DataFrame(X_scaled)

In [16]:
X_scaled

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,0.515337,0.523579,0.552559,0.575611,0.588336,0.573949,0.493910,0.517117,0.486976,0.536761,0.499818,0.502161,0.381351,0.350952
1,0.679113,0.683635,0.708249,0.724899,0.663456,0.623681,0.722740,0.736369,0.742368,0.738892,0.720129,0.739491,0.664168,0.681520
2,0.874678,0.878214,0.873641,0.885174,0.900355,0.899340,0.902420,0.917725,0.914886,0.926866,0.897939,0.914463,0.794320,0.810120
3,0.439574,0.445758,0.436684,0.469982,0.646760,0.649187,0.358906,0.396504,0.422797,0.444502,0.351128,0.365710,0.296898,0.289026
4,0.459960,0.467847,0.502698,0.523832,0.511955,0.489340,0.382494,0.409317,0.448536,0.482038,0.386878,0.413089,0.287744,0.278861
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1754,0.429882,0.434232,0.470311,0.495701,0.351698,0.391193,0.285358,0.326101,0.437329,0.479283,0.340286,0.345704,0.199733,0.200623
1755,0.483787,0.490643,0.517214,0.540496,0.583308,0.580450,0.525677,0.552695,0.564484,0.587874,0.406817,0.418821,0.354321,0.355900
1756,0.408174,0.414236,0.452001,0.474042,0.406765,0.387725,0.382332,0.411432,0.397923,0.421268,0.308915,0.344339,0.186907,0.187740
1757,0.117896,0.128171,0.162330,0.200334,0.099394,0.137767,0.097700,0.191205,0.223785,0.217954,0.118668,0.143393,0.000000,0.000000


In [17]:
X_scaled.columns = ['Poverty', 'Population', 'W_Male', 'W_Female', 'B_Male',
       'B_Female', 'H_Male', 'H_Female', 'I_Male', 'I_Female', 'A_Male',
       'A_Female', 'NH_Male', 'NH_Female']

In [None]:
X_scaled

## Explore Data

In [None]:
X_scaled

In [None]:
train.columns

In [None]:
categorical = ['FIPS', 'stateFIPS', 'countyFIPS_2d', 'County', 'State']

## Modeling

In [None]:
y_train = pd.DataFrame(train.Risk_Index)
y_validate = pd.DataFrame(validate.Risk_Index)
y_test = pd.DataFrame(test.Risk_Index)

In [None]:
y_train

In [None]:
y_validate

In [None]:
# Predict taxvaluedollarcnt_pred_mean
risk_pred_mean = y_train.Risk_Index.mean()
y_train['risk_pred_mean'] = risk_pred_mean
y_validate['risk_pred_mean'] = risk_pred_mean

In [None]:
from sklearn.metrics import explained_variance_score

In [None]:
def make_metric_df(y, y_pred, model_name, metric_df):
    if metric_df.size ==0:
        metric_df = pd.DataFrame(data=[
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }])
        return metric_df
    else:
        return metric_df.append(
            {
                'model': model_name, 
                'RMSE_validate': mean_squared_error(
                    y,
                    y_pred) ** .5,
                'r^2_validate': explained_variance_score(
                    y,
                    y_pred)
            }, ignore_index=True)

In [None]:
metric_df = pd.DataFrame()
metric_df = make_metric_df(y_validate.Risk_Index,
                           y_validate.risk_pred_median,
                           'median_baseline',
                          metric_df)

In [None]:
#Create the model object
lm_1 = LinearRegression(normalize=True)

# fit the model to training data
lm_1.fit(X_scaled, y_train_dummies.adr)

y_train_dummies['risk_pred_lm_1'] = lm_1.predict(X_scaled)

# evaluate: rmse
rmse_train = mean_squared_error(y_train.Risk_Index, y_train.adr_pred_lm_1) ** (1/2)

# predict validate
y_validate_dummies['adr_pred_lm_1'] = lm_1.predict(X_validate_dummies)

# evaluate: rmse
rmse_validate = mean_squared_error(y_validate_dummies.adr, y_validate_dummies.adr_pred_lm_1) ** (1/2)

print("RMSE for OLS using LinearRegression\nTraining/In-Sample: ", rmse_train, 
      "\nValidation/Out-of-Sample: ", rmse_validate)