In [4]:
### Vaccination Index Predictions

In [5]:
# Import your dependencies
# Initial imports
import pandas as pd
from path import Path
import numpy as np
from sklearn import tree
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
from sklearn import *

In [6]:
# Location of the merged dataset, relative to this notebook
data = Path('../Resources/merged_dataset.csv')

# Load the CSV into a DataFrame and preview its first five rows
county_df = pd.read_csv(filepath_or_buffer=data)
county_df.head(5)


Unnamed: 0,FIPS,State,County,City,UI_Index,MI_Index,CI_Index,Latitude,Longitude,Total_Vaccinated,Vac_Index
0,1001,AL,Autauga,Albertville,4.9,112.5,90.9,32.536382,-86.644490,2288180,46.7
1,1001,AL,Autauga,Anniston,4.9,112.5,85.7,32.536382,-86.644490,2288180,46.7
2,1001,AL,Autauga,Auburn,4.9,112.5,91.9,32.536382,-86.644490,2288180,46.7
3,1001,AL,Autauga,Birmingham,4.9,112.5,90.7,32.536382,-86.644490,2288180,46.7
4,1001,AL,Autauga,Cullman,4.9,112.5,90.5,32.536382,-86.644490,2288180,46.7


### Data Preprocessing for Machine Learning 

### Label Encoder - replaces non-numerical data with numbers
### Models can only use numerical values (applies to City, County, and State)

In [8]:
# Handling of the text columns (County, State, City) and the coordinates
from sklearn.preprocessing import LabelEncoder

# An encoder instance is created here, but no column is passed through it in
# this cell: label-encoding of County/State/City is not applied, and those
# columns are removed from the working frame instead.
le = LabelEncoder()

# Rebind the name to an independent copy of the loaded frame
county_df = county_df.copy()

# Working frame without the text and latitude/longitude columns;
# `drop` returns a new DataFrame, so `county_df` keeps every original column.
county_df2 = county_df.drop(
    ['County', 'Latitude', 'Longitude', 'State', 'City'],
    axis='columns',
)
county_df2

Unnamed: 0,FIPS,UI_Index,MI_Index,CI_Index,Total_Vaccinated,Vac_Index
0,1001,4.9,112.5,90.9,2288180,46.7
1,1001,4.9,112.5,85.7,2288180,46.7
2,1001,4.9,112.5,91.9,2288180,46.7
3,1001,4.9,112.5,90.7,2288180,46.7
4,1001,4.9,112.5,90.5,2288180,46.7
...,...,...,...,...,...,...
41803,56041,6.3,107.0,96.9,268472,46.4
41804,56043,5.3,83.3,94.2,268472,46.4
41805,56043,5.3,83.3,96.9,268472,46.4
41806,56045,3.9,89.8,94.2,268472,46.4


In [9]:
### Convert numerical index columns to binary (0/1) flags

In [10]:
# Binarize the 'MI_Index' column: 1 where MI_Index >= 100, otherwise 0.
#
# The mask is built from the untouched `county_df` column rather than from
# `county_df2`. Reading from `county_df2` made this cell non-idempotent:
# on a re-run the already-binarized 0/1 values were compared against 100,
# which silently turned the whole column into zeros.
# NOTE(review): missing values compare as False and therefore become 0 -
# confirm that is the intended treatment.
indexes_df = county_df['MI_Index'] >= 100

# Store the 0/1 flag in the working frame (rows are matched on the index)
county_df2['MI_Index'] = indexes_df.astype(int)
county_df2


Unnamed: 0,FIPS,UI_Index,MI_Index,CI_Index,Total_Vaccinated,Vac_Index
0,1001,4.9,1,90.9,2288180,46.7
1,1001,4.9,1,85.7,2288180,46.7
2,1001,4.9,1,91.9,2288180,46.7
3,1001,4.9,1,90.7,2288180,46.7
4,1001,4.9,1,90.5,2288180,46.7
...,...,...,...,...,...,...
41803,56041,6.3,1,96.9,268472,46.4
41804,56043,5.3,0,94.2,268472,46.4
41805,56043,5.3,0,96.9,268472,46.4
41806,56045,3.9,0,94.2,268472,46.4


In [11]:
# Binarize the 'CI_Index' column: 1 where CI_Index >= 100, otherwise 0.
#
# The mask is built from the untouched `county_df` column rather than from
# `county_df2`, so re-running this cell reproduces the same flags instead of
# comparing already-binarized 0/1 values against 100 (which yields all zeros).
# NOTE(review): missing values compare as False and therefore become 0 -
# confirm that is the intended treatment.
indexes_df = county_df['CI_Index'] >= 100

# Store the 0/1 flag in the working frame (rows are matched on the index)
county_df2['CI_Index'] = indexes_df.astype(int)
county_df2

Unnamed: 0,FIPS,UI_Index,MI_Index,CI_Index,Total_Vaccinated,Vac_Index
0,1001,4.9,1,0,2288180,46.7
1,1001,4.9,1,0,2288180,46.7
2,1001,4.9,1,0,2288180,46.7
3,1001,4.9,1,0,2288180,46.7
4,1001,4.9,1,0,2288180,46.7
...,...,...,...,...,...,...
41803,56041,6.3,1,0,268472,46.4
41804,56043,5.3,0,0,268472,46.4
41805,56043,5.3,0,0,268472,46.4
41806,56045,3.9,0,0,268472,46.4


In [12]:
# Binarize the 'UI_Index' column: 1 where UI_Index <= 6, otherwise 0.
# Unlike the other index columns, the flag is set for values at or BELOW
# the threshold.
#
# The mask is built from the untouched `county_df` column rather than from
# `county_df2`, so re-running this cell reproduces the same flags instead of
# comparing already-binarized 0/1 values against 6 (which yields all ones).
# NOTE(review): missing values compare as False and therefore become 0 -
# confirm that is the intended treatment.
indexes_df = county_df['UI_Index'] <= 6

# Store the 0/1 flag in the working frame (rows are matched on the index)
county_df2['UI_Index'] = indexes_df.astype(int)
county_df2

Unnamed: 0,FIPS,UI_Index,MI_Index,CI_Index,Total_Vaccinated,Vac_Index
0,1001,1,1,0,2288180,46.7
1,1001,1,1,0,2288180,46.7
2,1001,1,1,0,2288180,46.7
3,1001,1,1,0,2288180,46.7
4,1001,1,1,0,2288180,46.7
...,...,...,...,...,...,...
41803,56041,0,1,0,268472,46.4
41804,56043,1,0,0,268472,46.4
41805,56043,1,0,0,268472,46.4
41806,56045,1,0,0,268472,46.4


In [13]:
# Binarize the 'Vac_Index' column: 1 where Vac_Index >= 60, otherwise 0.
#
# The mask is built from the untouched `county_df` column rather than from
# `county_df2`, so re-running this cell reproduces the same flags instead of
# comparing already-binarized 0/1 values against 60 (which yields all zeros).
# NOTE(review): missing values compare as False and therefore become 0 -
# confirm that is the intended treatment.
indexes_df = county_df['Vac_Index'] >= 60

# Store the 0/1 flag in the working frame (rows are matched on the index)
county_df2['Vac_Index'] = indexes_df.astype(int)
county_df2

Unnamed: 0,FIPS,UI_Index,MI_Index,CI_Index,Total_Vaccinated,Vac_Index
0,1001,1,1,0,2288180,0
1,1001,1,1,0,2288180,0
2,1001,1,1,0,2288180,0
3,1001,1,1,0,2288180,0
4,1001,1,1,0,2288180,0
...,...,...,...,...,...,...
41803,56041,0,1,0,268472,0
41804,56043,1,0,0,268472,0
41805,56043,1,0,0,268472,0
41806,56045,1,0,0,268472,0


In [14]:
# Write the encoded frame to disk for Tableau and for future models;
# the row index is left out of the file.
county_df2.to_csv(path_or_buf='encoded_dataset.csv', index=False)