In [6]:
import pandas as pd 
import requests
import json
import numpy as np
from sodapy import Socrata

from pathlib import Path
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
client = Socrata("data.cdc.gov", 'RR1JGCh5rZDwjeVYEvT9cDvj6')

results_2022 = client.get("epbn-9bv3", limit=500000)

results_2022_df = pd.DataFrame.from_records(results_2022)
results_2022_df = results_2022_df.drop(['data_value_footnote_symbol','data_value_footnote'], axis=1).dropna()
results_2022_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,geolocation,locationid,categoryid,measureid,datavaluetypeid,short_question_text
0,2020,AL,Alabama,Abbeville,BRFSS,Health Outcomes,Obesity among adults aged >=18 years,%,Crude prevalence,44.3,42.9,45.7,2688,"{'type': 'Point', 'coordinates': [-85.25278085...",100124,HLTHOUT,OBESITY,CrdPrv,Obesity
1,2020,AL,Alabama,Andalusia,BRFSS,Health Outcomes,Obesity among adults aged >=18 years,%,Crude prevalence,45.9,45.2,46.6,9015,"{'type': 'Point', 'coordinates': [-86.47806878...",101708,HLTHOUT,OBESITY,CrdPrv,Obesity
2,2020,AL,Alabama,Baileyton,BRFSS,Health Outcomes,Current asthma among adults aged >=18 years,%,Crude prevalence,10.4,9.9,11.0,610,"{'type': 'Point', 'coordinates': [-86.60822915...",103676,HLTHOUT,CASTHMA,CrdPrv,Current Asthma
3,2019,AL,Alabama,Beatrice,BRFSS,Prevention,Cholesterol screening among adults aged >=18 y...,%,Age-adjusted prevalence,87.3,85.9,88.6,301,"{'type': 'Point', 'coordinates': [-87.20904641...",104900,PREVENT,CHOLSCREEN,AgeAdjPrv,Cholesterol Screening
4,2020,AL,Alabama,Black,BRFSS,Health Outcomes,Stroke among adults aged >=18 years,%,Crude prevalence,4.2,3.6,4.6,207,"{'type': 'Point', 'coordinates': [-85.74279127...",107120,HLTHOUT,STROKE,CrdPrv,Stroke


In [3]:
results_2022_df['year'] = results_2022_df['year'].astype(int)
results_2022_df['totalpopulation'] = results_2022_df['totalpopulation'].astype(int)
results_2022_df['data_value'] = results_2022_df['data_value'].astype(float)
results_2022_df['low_confidence_limit'] = results_2022_df['low_confidence_limit'].astype(float)
results_2022_df['high_confidence_limit'] = results_2022_df['high_confidence_limit'].astype(float)
results_2022_df.dtypes

year                       int64
stateabbr                 object
statedesc                 object
locationname              object
datasource                object
category                  object
measure                   object
data_value_unit           object
data_value_type           object
data_value               float64
low_confidence_limit     float64
high_confidence_limit    float64
totalpopulation            int64
geolocation               object
locationid                object
categoryid                object
measureid                 object
datavaluetypeid           object
short_question_text       object
dtype: object

In [4]:
results_2023 = client.get("eav7-hnsx", limit=500000)

results_2023_df = pd.DataFrame.from_records(results_2023)
results_2023_df = results_2023_df.drop(['data_value_footnote_symbol','data_value_footnote',':@computed_region_bxsw_vy29',':@computed_region_he4y_prf8'], axis=1).dropna()
results_2023_df.head()

Unnamed: 0,year,stateabbr,statedesc,locationname,datasource,category,measure,data_value_unit,data_value_type,data_value,low_confidence_limit,high_confidence_limit,totalpopulation,geolocation,locationid,categoryid,measureid,datavaluetypeid,short_question_text
0,2020,AK,Alaska,Kiana,BRFSS,Health Outcomes,All teeth lost among adults aged >=65 years,%,Crude prevalence,38.5,29.0,48.0,347,"{'type': 'Point', 'coordinates': [-160.4343638...",239300,HLTHOUT,TEETHLOST,CrdPrv,All Teeth Lost
1,2021,AK,Alaska,Koliganek,BRFSS,Health Outcomes,Arthritis among adults aged >=18 years,%,Crude prevalence,22.0,18.5,25.7,209,"{'type': 'Point', 'coordinates': [-157.2259091...",241500,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis
2,2021,AK,Alaska,Kongiganak,BRFSS,Health Outcomes,Arthritis among adults aged >=18 years,%,Crude prevalence,23.5,20.2,27.0,439,"{'type': 'Point', 'coordinates': [-162.8830767...",241610,HLTHOUT,ARTHRITIS,CrdPrv,Arthritis
3,2021,AK,Alaska,Lakes,BRFSS,Health Outcomes,Obesity among adults aged >=18 years,%,Crude prevalence,36.7,32.5,41.2,8364,"{'type': 'Point', 'coordinates': [-149.3066764...",242832,HLTHOUT,OBESITY,CrdPrv,Obesity
4,2021,AK,Alaska,Mountain Village,BRFSS,Health Outcomes,Obesity among adults aged >=18 years,%,Crude prevalence,47.3,39.2,56.1,813,"{'type': 'Point', 'coordinates': [-163.7209368...",251180,HLTHOUT,OBESITY,CrdPrv,Obesity


In [5]:
results_2023_df['year'] = results_2023_df['year'].astype(int)
results_2023_df['totalpopulation'] = results_2023_df['totalpopulation'].astype(int)
results_2023_df['data_value'] = results_2023_df['data_value'].astype(float)
results_2023_df['low_confidence_limit'] = results_2023_df['low_confidence_limit'].astype(float)
results_2023_df['high_confidence_limit'] = results_2023_df['high_confidence_limit'].astype(float)
results_2023_df.dtypes

year                       int64
stateabbr                 object
statedesc                 object
locationname              object
datasource                object
category                  object
measure                   object
data_value_unit           object
data_value_type           object
data_value               float64
low_confidence_limit     float64
high_confidence_limit    float64
totalpopulation            int64
geolocation               object
locationid                object
categoryid                object
measureid                 object
datavaluetypeid           object
short_question_text       object
dtype: object

Filtered by Age-adjusted prevalence

In [7]:
search_value = ['Age-adjusted prevalence']
age_results_2022_df = results_2022_df[results_2022_df['data_value_type'].isin(search_value)]

In [8]:
search_value = ['Age-adjusted prevalence']
age_results_2023_df = results_2023_df[results_2023_df['data_value_type'].isin(search_value)]

Filtered by Health Condition: 
1. Cancer (exlcuding Skin)
2. Coronary Heart Disease
3. COPD
4. Obesity
5. Diabetes
6. Health Insurance
7. Annual Checkup

In [9]:
health_condition_1 = ['Cancer (except skin)']

cancer_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_1)]
cancer_2022_df = cancer_2022_df.sort_values(['stateabbr'], ascending=True)

cancer_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_1)]
cancer_2023_df = cancer_2023_df.sort_values(['stateabbr'], ascending=True)

In [10]:
health_condition_2 = ['Coronary Heart Disease']

chd_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_2)]
chd_2022_df = chd_2022_df.sort_values(['stateabbr'], ascending=True)

chd_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_2)]
chd_2023_df = chd_2023_df.sort_values(['stateabbr'], ascending=True)

In [11]:
health_condition_3 = ['COPD']

copd_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_3)]
copd_2022_df = copd_2022_df.sort_values(['stateabbr'], ascending=True)

copd_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_3)]
copd_2023_df = copd_2023_df.sort_values(['stateabbr'], ascending=True)

In [12]:
health_condition_4 = ['Obesity']

obesity_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_4)]
obesity_2022_df = obesity_2022_df.sort_values(['stateabbr'], ascending=True)

obesity_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_4)]
obesity_2023_df = obesity_2023_df.sort_values(['stateabbr'], ascending=True)

In [13]:
health_condition_5 = ['Diabetes']

diabetes_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_5)]
diabetes_2022_df = diabetes_2022_df.sort_values(['stateabbr'], ascending=True)

diabetes_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_5)]
diabates_2023_df = diabetes_2023_df.sort_values(['stateabbr'], ascending=True)

In [14]:
health_condition_6 = ['Health Insurance']

insurance_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_6)]
insurance_2022_df = insurance_2022_df.sort_values(['stateabbr'], ascending=True)

insurance_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_6)]
insurance_2023_df = insurance_2023_df.sort_values(['stateabbr'], ascending=True)

In [15]:
health_condition_7 = ['Annual Checkup']

checkup_2022_df = age_results_2022_df[age_results_2022_df['short_question_text'].isin(health_condition_7)]
checkup_2022_df = checkup_2022_df.sort_values(['stateabbr'], ascending=True)

checkup_2023_df = age_results_2023_df[age_results_2023_df['short_question_text'].isin(health_condition_7)]
checkup_2023_df = checkup_2023_df.sort_values(['stateabbr'], ascending=True)

Filtered by States : Alabama, Arizona, Arkansas, California, Colorado, Connecticut, Delaware, District of California, Georgia, Hawaii, Idaho, Illinois, and Indiana.



Cancer (except Skin)

In [16]:
state_1 = ['Alabama']
AL_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_1)]
AL_cancer_2022_df = AL_cancer_2022_df.sort_values('locationname',ascending=True)

AL_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_1)]
AL_cancer_2023_df = AL_cancer_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_2)]
AZ_cancer_2022_df = AZ_cancer_2022_df.sort_values('locationname',ascending=True)

AZ_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_2)]
AZ_cancer_2023_df = AZ_cancer_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_3)]
AR_cancer_2022_df = AR_cancer_2022_df.sort_values('locationname',ascending=True)

AR_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_3)]
AR_cancer_2023_df = AR_cancer_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_4)]
CA_cancer_2022_df = CA_cancer_2022_df.sort_values('locationname',ascending=True)

CA_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_4)]
CA_cancer_2023_df = CA_cancer_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_5)]
CO_cancer_2022_df = CO_cancer_2022_df.sort_values('locationname',ascending=True)

CO_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_5)]
CO_cancer_2023_df = CO_cancer_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_6)]
CT_cancer_2022_df = CT_cancer_2022_df.sort_values('locationname',ascending=True)

CT_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_6)]
CT_cancer_2023_df = CT_cancer_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_7)]
DL_cancer_2022_df = DL_cancer_2022_df.sort_values('locationname',ascending=True)

DL_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_7)]
DL_cancer_2023_df = DL_cancer_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_8)]
DC_cancer_2022_df = DC_cancer_2022_df.sort_values('locationname',ascending=True)

DC_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_8)]
DC_cancer_2023_df = DC_cancer_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_9)]
GA_cancer_2022_df = GA_cancer_2022_df.sort_values('locationname',ascending=True)

GA_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_9)]
GA_cancer_2023_df = GA_cancer_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_10)]
HI_cancer_2022_df = HI_cancer_2022_df.sort_values('locationname',ascending=True)

HI_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_10)]
HI_cancer_2023_df = HI_cancer_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_11)]
ID_cancer_2022_df = ID_cancer_2022_df.sort_values('locationname',ascending=True)

ID_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_11)]
ID_cancer_2023_df = ID_cancer_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_12)]
IL_cancer_2022_df = IL_cancer_2022_df.sort_values('locationname',ascending=True)

IL_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_12)]
IL_cancer_2023_df = IL_cancer_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Indiana']
IN_cancer_2022_df = cancer_2022_df[cancer_2022_df['statedesc'].isin(state_12)]
IN_cancer_2022_df = IN_cancer_2022_df.sort_values('locationname',ascending=True)

IN_cancer_2023_df = cancer_2023_df[cancer_2023_df['statedesc'].isin(state_12)]
IN_cancer_2023_df = IN_cancer_2023_df.sort_values('locationname',ascending=True)

Coronary Heart Disease

In [17]:
state_1 = ['Alabama']
AL_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_1)]
AL_chd_2022_df = AL_chd_2022_df.sort_values('locationname',ascending=True)

AL_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_1)]
AL_chd_2023_df = AL_chd_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_2)]
AZ_chd_2022_df = AZ_chd_2022_df.sort_values('locationname',ascending=True)

AZ_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_2)]
AZ_chd_2023_df = AZ_chd_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_3)]
AR_chd_2022_df = AR_chd_2022_df.sort_values('locationname',ascending=True)

AR_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_3)]
AR_chd_2023_df = AR_chd_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_4)]
CA_chd_2022_df = CA_chd_2022_df.sort_values('locationname',ascending=True)

CA_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_4)]
CA_chd_2023_df = CA_chd_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_5)]
CO_chd_2022_df = CO_chd_2022_df.sort_values('locationname',ascending=True)

CO_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_5)]
CO_chd_2023_df = CO_chd_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_6)]
CT_chd_2022_df = CT_chd_2022_df.sort_values('locationname',ascending=True)

CT_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_6)]
CT_chd_2023_df = CT_chd_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_7)]
DL_chd_2022_df = DL_chd_2022_df.sort_values('locationname',ascending=True)

DL_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_7)]
DL_chd_2023_df = DL_chd_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_8)]
DC_chd_2022_df = DC_chd_2022_df.sort_values('locationname',ascending=True)

DC_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_8)]
DC_chd_2023_df = DC_chd_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_9)]
GA_chd_2022_df = GA_chd_2022_df.sort_values('locationname',ascending=True)

GA_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_9)]
GA_chd_2023_df = GA_chd_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_10)]
HI_chd_2022_df = HI_chd_2022_df.sort_values('locationname',ascending=True)

HI_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_10)]
HI_chd_2023_df = HI_chd_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_11)]
ID_chd_2022_df = ID_chd_2022_df.sort_values('locationname',ascending=True)

ID_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_11)]
ID_chd_2023_df = ID_chd_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_12)]
IL_chd_2022_df = IL_chd_2022_df.sort_values('locationname',ascending=True)

IL_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_12)]
IL_chd_2023_df = IL_chd_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_chd_2022_df = chd_2022_df[chd_2022_df['statedesc'].isin(state_13)]
IN_chd_2022_df = IN_chd_2022_df.sort_values('locationname',ascending=True)

IN_chd_2023_df = chd_2023_df[chd_2023_df['statedesc'].isin(state_13)]
IN_chd_2023_df = IN_chd_2023_df.sort_values('locationname',ascending=True)

COPD

In [18]:
state_1 = ['Alabama']
AL_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_1)]
AL_copd_2022_df = AL_copd_2022_df.sort_values('locationname',ascending=True)

AL_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_1)]
AL_copd_2023_df = AL_copd_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_2)]
AZ_copd_2022_df = AZ_copd_2022_df.sort_values('locationname',ascending=True)

AZ_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_2)]
AZ_copd_2023_df = AZ_copd_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_3)]
AR_copd_2022_df = AR_copd_2022_df.sort_values('locationname',ascending=True)

AR_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_3)]
AR_copd_2023_df = AR_copd_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_4)]
CA_copd_2022_df = CA_copd_2022_df.sort_values('locationname',ascending=True)

CA_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_4)]
CA_copd_2023_df = CA_copd_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_5)]
CO_copd_2022_df = CO_copd_2022_df.sort_values('locationname',ascending=True)

CO_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_5)]
CO_copd_2023_df = CO_copd_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_6)]
CT_copd_2022_df = CT_copd_2022_df.sort_values('locationname',ascending=True)

CT_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_6)]
CT_copd_2023_df = CT_copd_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_7)]
DL_copd_2022_df = DL_copd_2022_df.sort_values('locationname',ascending=True)

DL_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_7)]
DL_copd_2023_df = DL_copd_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_8)]
DC_copd_2022_df = DC_copd_2022_df.sort_values('locationname',ascending=True)

DC_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_8)]
DC_copd_2023_df = DC_copd_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_9)]
GA_copd_2022_df = GA_copd_2022_df.sort_values('locationname',ascending=True)

GA_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_9)]
GA_copd_2023_df = GA_copd_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_10)]
HI_copd_2022_df = HI_copd_2022_df.sort_values('locationname',ascending=True)

HI_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_10)]
HI_copd_2023_df = HI_copd_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_11)]
ID_copd_2022_df = ID_copd_2022_df.sort_values('locationname',ascending=True)

ID_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_11)]
ID_copd_2023_df = ID_copd_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_12)]
IL_copd_2022_df = IL_copd_2022_df.sort_values('locationname',ascending=True)

IL_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_12)]
IL_copd_2023_df = IL_copd_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_copd_2022_df = copd_2022_df[copd_2022_df['statedesc'].isin(state_13)]
IN_copd_2022_df = IN_copd_2022_df.sort_values('locationname',ascending=True)

IN_copd_2023_df = copd_2023_df[copd_2023_df['statedesc'].isin(state_13)]
IN_copd_2023_df = IN_copd_2023_df.sort_values('locationname',ascending=True)

Obesity

In [19]:
state_1 = ['Alabama']
AL_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_1)]
AL_obesity_2022_df = AL_obesity_2022_df.sort_values('locationname',ascending=True)

AL_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_1)]
AL_obesity_2023_df = AL_obesity_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_2)]
AZ_obesity_2022_df = AZ_obesity_2022_df.sort_values('locationname',ascending=True)

AZ_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_2)]
AZ_obesity_2023_df = AZ_obesity_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_3)]
AR_obesity_2022_df = AR_obesity_2022_df.sort_values('locationname',ascending=True)

AR_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_3)]
AR_obesity_2023_df = AR_obesity_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_4)]
CA_obesity_2022_df = CA_obesity_2022_df.sort_values('locationname',ascending=True)

CA_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_4)]
CA_obesity_2023_df = CA_obesity_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_5)]
CO_obesity_2022_df = CO_obesity_2022_df.sort_values('locationname',ascending=True)

CO_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_5)]
CO_obesity_2023_df = CO_obesity_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_6)]
CT_obesity_2022_df = CT_obesity_2022_df.sort_values('locationname',ascending=True)

CT_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_6)]
CT_obesity_2023_df = CT_obesity_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_7)]
DL_obesity_2022_df = DL_obesity_2022_df.sort_values('locationname',ascending=True)

DL_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_7)]
DL_obesity_2023_df = DL_obesity_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_8)]
DC_obesity_2022_df = DC_obesity_2022_df.sort_values('locationname',ascending=True)

DC_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_8)]
DC_obesity_2023_df = DC_obesity_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_9)]
GA_obesity_2022_df = GA_obesity_2022_df.sort_values('locationname',ascending=True)

GA_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_9)]
GA_obesity_2023_df = GA_obesity_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_10)]
HI_obesity_2022_df = HI_obesity_2022_df.sort_values('locationname',ascending=True)

HI_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_10)]
HI_obesity_2023_df = HI_obesity_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_11)]
ID_obesity_2022_df = ID_obesity_2022_df.sort_values('locationname',ascending=True)

ID_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_11)]
ID_obesity_2023_df = ID_obesity_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_12)]
IL_obesity_2022_df = IL_obesity_2022_df.sort_values('locationname',ascending=True)

IL_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_12)]
IL_obesity_2023_df = IL_obesity_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_obesity_2022_df = obesity_2022_df[obesity_2022_df['statedesc'].isin(state_13)]
IN_obesity_2022_df = IN_obesity_2022_df.sort_values('locationname',ascending=True)

IN_obesity_2023_df = obesity_2023_df[obesity_2023_df['statedesc'].isin(state_13)]
IN_obesity_2023_df = IN_obesity_2023_df.sort_values('locationname',ascending=True)

Diabetes

In [20]:
state_1 = ['Alabama']
AL_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_1)]
AL_diabetes_2022_df = AL_diabetes_2022_df.sort_values('locationname',ascending=True)

AL_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_1)]
AL_diabetes_2023_df = AL_diabetes_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_2)]
AZ_diabetes_2022_df = AZ_diabetes_2022_df.sort_values('locationname',ascending=True)

AZ_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_2)]
AZ_diabetes_2023_df = AZ_diabetes_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_3)]
AR_diabetes_2022_df = AR_diabetes_2022_df.sort_values('locationname',ascending=True)

AR_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_3)]
AR_diabetes_2023_df = AR_diabetes_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_4)]
CA_diabetes_2022_df = CA_diabetes_2022_df.sort_values('locationname',ascending=True)

CA_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_4)]
CA_diabetes_2023_df = CA_diabetes_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_5)]
CO_diabetes_2022_df = CO_diabetes_2022_df.sort_values('locationname',ascending=True)

CO_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_5)]
CO_diabetes_2023_df = CO_diabetes_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_6)]
CT_diabetes_2022_df = CT_diabetes_2022_df.sort_values('locationname',ascending=True)

CT_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_6)]
CT_diabetes_2023_df = CT_diabetes_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_7)]
DL_diabetes_2022_df = DL_diabetes_2022_df.sort_values('locationname',ascending=True)

DL_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_7)]
DL_diabetes_2023_df = DL_diabetes_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_8)]
DC_diabetes_2022_df = DC_diabetes_2022_df.sort_values('locationname',ascending=True)

DC_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_8)]
DC_diabetes_2023_df = DC_diabetes_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_9)]
GA_diabetes_2022_df = GA_diabetes_2022_df.sort_values('locationname',ascending=True)

GA_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_9)]
GA_diabetes_2023_df = GA_diabetes_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_10)]
HI_diabetes_2022_df = HI_diabetes_2022_df.sort_values('locationname',ascending=True)

HI_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_10)]
HI_diabetes_2023_df = HI_diabetes_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_11)]
ID_diabetes_2022_df = ID_diabetes_2022_df.sort_values('locationname',ascending=True)

ID_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_11)]
ID_diabetes_2023_df = ID_diabetes_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_12)]
IL_diabetes_2022_df = IL_diabetes_2022_df.sort_values('locationname',ascending=True)

IL_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_12)]
IL_diabetes_2023_df = IL_diabetes_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_diabetes_2022_df = diabetes_2022_df[diabetes_2022_df['statedesc'].isin(state_13)]
IN_diabetes_2022_df = IN_diabetes_2022_df.sort_values('locationname',ascending=True)

IN_diabetes_2023_df = diabetes_2023_df[diabetes_2023_df['statedesc'].isin(state_13)]
IN_diabetes_2023_df = IN_diabetes_2023_df.sort_values('locationname',ascending=True)

Health Insurance

In [21]:
state_1 = ['Alabama']
AL_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_1)]
AL_insurance_2022_df = AL_insurance_2022_df.sort_values('locationname',ascending=True)

AL_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_1)]
AL_insurance_2023_df = AL_insurance_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_2)]
AZ_insurance_2022_df = AZ_insurance_2022_df.sort_values('locationname',ascending=True)

AZ_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_2)]
AZ_insurance_2023_df = AZ_insurance_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_3)]
AR_insurance_2022_df = AR_insurance_2022_df.sort_values('locationname',ascending=True)

AR_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_3)]
AR_insurance_2023_df = AR_insurance_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_4)]
CA_insurance_2022_df = CA_insurance_2022_df.sort_values('locationname',ascending=True)

CA_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_4)]
CA_insurance_2023_df = CA_insurance_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_5)]
CO_insurance_2022_df = CO_insurance_2022_df.sort_values('locationname',ascending=True)

CO_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_5)]
CO_insurance_2023_df = CO_insurance_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_6)]
CT_insurance_2022_df = CT_insurance_2022_df.sort_values('locationname',ascending=True)

CT_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_6)]
CT_insurance_2023_df = CT_insurance_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_7)]
DL_insurance_2022_df = DL_insurance_2022_df.sort_values('locationname',ascending=True)

DL_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_7)]
DL_insurance_2023_df = DL_insurance_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_8)]
DC_insurance_2022_df = DC_insurance_2022_df.sort_values('locationname',ascending=True)

DC_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_8)]
DC_insurance_2023_df = DC_insurance_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_9)]
GA_insurance_2022_df = GA_insurance_2022_df.sort_values('locationname',ascending=True)

GA_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_9)]
GA_insurance_2023_df = GA_insurance_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_10)]
HI_insurance_2022_df = HI_insurance_2022_df.sort_values('locationname',ascending=True)

HI_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_10)]
HI_insurance_2023_df = HI_insurance_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_11)]
ID_insurance_2022_df = ID_insurance_2022_df.sort_values('locationname',ascending=True)

ID_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_11)]
ID_insurance_2023_df = ID_insurance_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_12)]
IL_insurance_2022_df = IL_insurance_2022_df.sort_values('locationname',ascending=True)

IL_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_12)]
IL_insurance_2023_df = IL_insurance_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_insurance_2022_df = insurance_2022_df[insurance_2022_df['statedesc'].isin(state_13)]
IN_insurance_2022_df = IN_insurance_2022_df.sort_values('locationname',ascending=True)

IN_insurance_2023_df = insurance_2023_df[insurance_2023_df['statedesc'].isin(state_13)]
IN_insurance_2023_df = IN_insurance_2023_df.sort_values('locationname',ascending=True)

Annual Checkup

In [22]:
state_1 = ['Alabama']
AL_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_1)]
AL_checkup_2022_df = AL_checkup_2022_df.sort_values('locationname',ascending=True)

AL_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_1)]
AL_checkup_2023_df = AL_checkup_2023_df.sort_values('locationname',ascending=True)

state_2 = ['Arizona']
AZ_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_2)]
AZ_checkup_2022_df = AZ_checkup_2022_df.sort_values('locationname',ascending=True)

AZ_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_2)]
AZ_checkup_2023_df = AZ_checkup_2023_df.sort_values('locationname',ascending=True)

state_3 = ['Arkansas']
AR_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_3)]
AR_checkup_2022_df = AR_checkup_2022_df.sort_values('locationname',ascending=True)

AR_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_3)]
AR_checkup_2023_df = AR_checkup_2023_df.sort_values('locationname',ascending=True)

state_4 = ['California']
CA_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_4)]
CA_checkup_2022_df = CA_checkup_2022_df.sort_values('locationname',ascending=True)

CA_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_4)]
CA_checkup_2023_df = CA_checkup_2023_df.sort_values('locationname',ascending=True)

state_5 = ['Colorado']
CO_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_5)]
CO_checkup_2022_df = CO_checkup_2022_df.sort_values('locationname',ascending=True)

CO_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_5)]
CO_checkup_2023_df = CO_checkup_2023_df.sort_values('locationname',ascending=True)

state_6 = ['Connecticut']
CT_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_6)]
CT_checkup_2022_df = CT_checkup_2022_df.sort_values('locationname',ascending=True)

CT_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_6)]
CT_checkup_2023_df = CT_checkup_2023_df.sort_values('locationname',ascending=True)

state_7 = ['Delaware']
DL_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_7)]
DL_checkup_2022_df = DL_checkup_2022_df.sort_values('locationname',ascending=True)

DL_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_7)]
DL_checkup_2023_df = DL_checkup_2023_df.sort_values('locationname',ascending=True)

state_8 = ['District of Columbia']
DC_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_8)]
DC_checkup_2022_df = DC_checkup_2022_df.sort_values('locationname',ascending=True)

DC_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_8)]
DC_checkup_2023_df = DC_checkup_2023_df.sort_values('locationname',ascending=True)

state_9 = ['Georgia']
GA_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_9)]
GA_checkup_2022_df = GA_checkup_2022_df.sort_values('locationname',ascending=True)

GA_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_9)]
GA_checkup_2023_df = GA_checkup_2023_df.sort_values('locationname',ascending=True)

state_10 = ['Hawaii']
HI_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_10)]
HI_checkup_2022_df = HI_checkup_2022_df.sort_values('locationname',ascending=True)

HI_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_10)]
HI_checkup_2023_df = HI_checkup_2023_df.sort_values('locationname',ascending=True)

state_11 = ['Idaho']
ID_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_11)]
ID_checkup_2022_df = ID_checkup_2022_df.sort_values('locationname',ascending=True)

ID_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_11)]
ID_checkup_2023_df = ID_checkup_2023_df.sort_values('locationname',ascending=True)

state_12 = ['Illinois']
IL_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_12)]
IL_checkup_2022_df = IL_checkup_2022_df.sort_values('locationname',ascending=True)

IL_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_12)]
IL_checkup_2023_df = IL_checkup_2023_df.sort_values('locationname',ascending=True)

state_13 = ['Indiana']
IN_checkup_2022_df = checkup_2022_df[checkup_2022_df['statedesc'].isin(state_13)]
IN_checkup_2022_df = IN_checkup_2022_df.sort_values('locationname',ascending=True)

IN_checkup_2023_df = checkup_2023_df[checkup_2023_df['statedesc'].isin(state_13)]
IN_checkup_2023_df = IN_checkup_2023_df.sort_values('locationname',ascending=True)

Merging the data by state and health condition

Cancer Merge

In [23]:
col = ['year_y','stateabbr_y','statedesc_y', 'datasource_y', 'category_y', 'measure_y',
       'data_value_unit_y', 'data_value_type_y','totalpopulation_y','geolocation_y', 'locationid_y', 'categoryid_y',
       'measureid_y', 'datavaluetypeid_y','categoryid_x','measureid_x','datavaluetypeid_x','locationid_x']

AL_cancer = pd.merge(AL_cancer_2022_df, AL_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_cancer = pd.merge(AZ_cancer_2022_df, AZ_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_cancer = pd.merge(AR_cancer_2022_df, AR_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_cancer = pd.merge(CA_cancer_2022_df, CA_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_cancer = pd.merge(CO_cancer_2022_df, CO_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_cancer = pd.merge(CT_cancer_2022_df, CT_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_cancer = pd.merge(DL_cancer_2022_df, DL_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_cancer = pd.merge(DC_cancer_2022_df, DC_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_cancer = pd.merge(GA_cancer_2022_df, GA_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_cancer = pd.merge(HI_cancer_2022_df, HI_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_cancer = pd.merge(ID_cancer_2022_df, ID_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_cancer = pd.merge(IL_cancer_2022_df, IL_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_cancer = pd.merge(IN_cancer_2022_df, IN_cancer_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

cancer_df = pd.concat([AL_cancer,AZ_cancer,AR_cancer,CA_cancer,CO_cancer,CT_cancer,DL_cancer,DC_cancer,GA_cancer,HI_cancer,
                       ID_cancer,IL_cancer,IN_cancer], axis=0)

Coronary Heart Disease Merge

In [24]:
AL_chd = pd.merge(AL_chd_2022_df, AL_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_chd = pd.merge(AZ_chd_2022_df, AZ_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_chd = pd.merge(AR_chd_2022_df, AR_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_chd = pd.merge(CA_chd_2022_df, CA_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_chd = pd.merge(CO_chd_2022_df, CO_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_chd = pd.merge(CT_chd_2022_df, CT_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_chd = pd.merge(DL_chd_2022_df, DL_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_chd = pd.merge(DC_chd_2022_df, DC_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_chd = pd.merge(GA_chd_2022_df, GA_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_chd = pd.merge(HI_chd_2022_df, HI_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_chd = pd.merge(ID_chd_2022_df, ID_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_chd = pd.merge(IL_chd_2022_df, IL_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_chd = pd.merge(IN_chd_2022_df, IN_chd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

chd_df = pd.concat([AL_chd,AZ_chd,AR_chd,CA_chd,CO_chd,CT_chd,DL_chd,DC_chd,GA_chd,HI_chd,ID_chd,IL_chd,IN_chd],axis=0)

COPD Merge

In [25]:
AL_copd = pd.merge(AL_copd_2022_df, AL_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_copd = pd.merge(AZ_copd_2022_df, AZ_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_copd = pd.merge(AR_copd_2022_df, AR_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_copd = pd.merge(CA_copd_2022_df, CA_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_copd = pd.merge(CO_copd_2022_df, CO_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_copd = pd.merge(CT_copd_2022_df, CT_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_copd = pd.merge(DL_copd_2022_df, DL_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_copd = pd.merge(DC_copd_2022_df, DC_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_copd = pd.merge(GA_copd_2022_df, GA_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_copd = pd.merge(HI_copd_2022_df, HI_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_copd = pd.merge(ID_copd_2022_df, ID_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_copd = pd.merge(IL_copd_2022_df, IL_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_copd = pd.merge(IN_copd_2022_df, IN_copd_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

copd_df = pd.concat([AL_copd,AZ_copd,AR_copd,CA_copd,CO_copd,CT_copd,DL_copd,DC_copd,GA_copd,HI_copd,ID_copd,IL_copd,IN_copd],axis=0)

Obesity Merge

In [26]:
AL_obesity = pd.merge(AL_obesity_2022_df, AL_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_obesity = pd.merge(AZ_obesity_2022_df, AZ_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_obesity = pd.merge(AR_obesity_2022_df, AR_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_obesity = pd.merge(CA_obesity_2022_df, CA_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_obesity = pd.merge(CO_obesity_2022_df, CO_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_obesity = pd.merge(CT_obesity_2022_df, CT_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_obesity = pd.merge(DL_obesity_2022_df, DL_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_obesity = pd.merge(DC_obesity_2022_df, DC_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_obesity = pd.merge(GA_obesity_2022_df, GA_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_obesity = pd.merge(HI_obesity_2022_df, HI_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_obesity = pd.merge(ID_obesity_2022_df, ID_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_obesity = pd.merge(IL_obesity_2022_df, IL_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_obesity = pd.merge(IN_obesity_2022_df, IN_obesity_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

obesity_df = pd.concat([AL_obesity,AZ_obesity,AR_obesity,CA_obesity,CO_obesity,CT_obesity,DL_obesity,DC_obesity,GA_obesity,
                       HI_obesity,ID_obesity,IL_obesity,IN_obesity],axis=0)

Diabetes Merge

In [27]:
AL_diabetes = pd.merge(AL_diabetes_2022_df, AL_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_diabetes = pd.merge(AZ_diabetes_2022_df, AZ_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_diabetes = pd.merge(AR_diabetes_2022_df, AR_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_diabetes = pd.merge(CA_diabetes_2022_df, CA_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_diabetes = pd.merge(CO_diabetes_2022_df, CO_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_diabetes = pd.merge(CT_diabetes_2022_df, CT_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_diabetes = pd.merge(DL_diabetes_2022_df, DL_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_diabetes = pd.merge(DC_diabetes_2022_df, DC_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_diabetes = pd.merge(GA_diabetes_2022_df, GA_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_diabetes = pd.merge(HI_diabetes_2022_df, HI_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_diabetes = pd.merge(ID_diabetes_2022_df, ID_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_diabetes = pd.merge(IL_diabetes_2022_df, IL_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_diabetes = pd.merge(IN_diabetes_2022_df, IN_diabetes_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

diabetes_df = pd.concat([AL_diabetes,AZ_diabetes,AR_diabetes,CA_diabetes,CO_diabetes,CT_diabetes,DL_diabetes,DC_diabetes,
                        GA_diabetes,HI_diabetes,ID_diabetes,IL_diabetes,IN_diabetes],axis=0)

Health Insurance Merge

In [28]:
AL_insurance = pd.merge(AL_insurance_2022_df, AL_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_insurance = pd.merge(AZ_insurance_2022_df, AZ_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_insurance = pd.merge(AR_insurance_2022_df, AR_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_insurance = pd.merge(CA_insurance_2022_df, CA_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_insurance = pd.merge(CO_insurance_2022_df, CO_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_insurance = pd.merge(CT_insurance_2022_df, CT_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_insurance = pd.merge(DL_insurance_2022_df, DL_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_insurance = pd.merge(DC_insurance_2022_df, DC_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_insurance = pd.merge(GA_insurance_2022_df, GA_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_insurance = pd.merge(HI_insurance_2022_df, HI_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_insurance = pd.merge(ID_insurance_2022_df, ID_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_insurance = pd.merge(IL_insurance_2022_df, IL_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_insurance = pd.merge(IN_insurance_2022_df, IN_insurance_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

insurance_df = pd.concat([AL_insurance,AZ_insurance,AR_insurance,CA_insurance,CO_insurance,CT_insurance,DL_insurance,DC_insurance,
                         GA_insurance,HI_insurance,ID_insurance,IL_insurance,IN_insurance],axis=0)

Annual Checkup Merge

In [29]:
AL_checkup = pd.merge(AL_checkup_2022_df, AL_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AZ_checkup = pd.merge(AZ_checkup_2022_df, AZ_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
AR_checkup = pd.merge(AR_checkup_2022_df, AR_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CA_checkup = pd.merge(CA_checkup_2022_df, CA_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CO_checkup = pd.merge(CO_checkup_2022_df, CO_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
CT_checkup = pd.merge(CT_checkup_2022_df, CT_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DL_checkup = pd.merge(DL_checkup_2022_df, DL_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
DC_checkup = pd.merge(DC_checkup_2022_df, DC_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
GA_checkup = pd.merge(GA_checkup_2022_df, GA_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
HI_checkup = pd.merge(HI_checkup_2022_df, HI_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
ID_checkup = pd.merge(ID_checkup_2022_df, ID_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IL_checkup = pd.merge(IL_checkup_2022_df, IL_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()
IN_checkup = pd.merge(IN_checkup_2022_df, IN_checkup_2023_df, on='locationname', how='inner').drop(col, axis=1).dropna()

checkup_df = pd.concat([AL_checkup,AZ_checkup,AR_checkup,CA_checkup,CO_checkup,CT_checkup,DL_checkup,DC_checkup,GA_checkup,
                       HI_checkup,ID_checkup,IL_checkup,IN_checkup],axis=0)

Computing for y (change in data value)

if data_value_x < data_value_y, y = 1
if data_value_x >= data_value_y, y = 0

CANCER Datasets

In [30]:
AL_cancer['y'] = pd.Series(dtype=int)

for index, row in AL_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_cancer.at[index,'y'] = 1
    else:
        AL_cancer.at[index,'y'] = 0

AZ_cancer['y'] = pd.Series(dtype=int)

for index, row in AZ_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_cancer.at[index,'y'] = 1
    else:
        AZ_cancer.at[index,'y'] = 0

AR_cancer['y'] = pd.Series(dtype=int)

for index, row in AR_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_cancer.at[index,'y'] = 1
    else:
        AR_cancer.at[index,'y'] = 0

CA_cancer['y'] = pd.Series(dtype=int)

for index, row in CA_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_cancer.at[index,'y'] = 1
    else:
        CA_cancer.at[index,'y'] = 0

CO_cancer['y'] = pd.Series(dtype=int)

for index, row in CO_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_cancer.at[index,'y'] = 1
    else:
        CO_cancer.at[index,'y'] = 0

CT_cancer['y'] = pd.Series(dtype=int)

for index, row in CT_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_cancer.at[index,'y'] = 1
    else:
        CT_cancer.at[index,'y'] = 0

DL_cancer['y'] = pd.Series(dtype=int)

for index, row in DL_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_cancer.at[index,'y'] = 1
    else:
        DL_cancer.at[index,'y'] = 0

DC_cancer['y'] = pd.Series(dtype=int)

for index, row in DC_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_cancer.at[index,'y'] = 1
    else:
        DC_cancer.at[index,'y'] = 0

GA_cancer['y'] = pd.Series(dtype=int)

for index, row in GA_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_cancer.at[index,'y'] = 1
    else:
        GA_cancer.at[index,'y'] = 0

HI_cancer['y'] = pd.Series(dtype=int)

for index, row in HI_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_cancer.at[index,'y'] = 1
    else:
        HI_cancer.at[index,'y'] = 0

ID_cancer['y'] = pd.Series(dtype=int)

for index, row in ID_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_cancer.at[index,'y'] = 1
    else:
        ID_cancer.at[index,'y'] = 0

IL_cancer['y'] = pd.Series(dtype=int)

for index, row in IL_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_cancer.at[index,'y'] = 1
    else:
        IL_cancer.at[index,'y'] = 0

IN_cancer['y'] = pd.Series(dtype=int)

for index, row in IN_cancer.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_cancer.at[index,'y'] = 1
    else:
        IN_cancer.at[index,'y'] = 0

cancer_df['y'] = pd.Series(dtype=int)

for index, row in cancer_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        cancer_df.at[index,'y'] = 1
    else:
        cancer_df.at[index,'y'] = 0


CORONARY HEART DISEASE Datasets

In [31]:
AL_chd['y'] = pd.Series(dtype=int)

for index, row in AL_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_chd.at[index,'y'] = 1
    else:
        AL_chd.at[index,'y'] = 0

AZ_chd['y'] = pd.Series(dtype=int)

for index, row in AZ_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_chd.at[index,'y'] = 1
    else:
        AZ_chd.at[index,'y'] = 0

AR_chd['y'] = pd.Series(dtype=int)

for index, row in AR_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_chd.at[index,'y'] = 1
    else:
        AR_chd.at[index,'y'] = 0

CA_chd['y'] = pd.Series(dtype=int)

for index, row in CA_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_chd.at[index,'y'] = 1
    else:
        CA_chd.at[index,'y'] = 0

CO_chd['y'] = pd.Series(dtype=int)

for index, row in CO_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_chd.at[index,'y'] = 1
    else:
        CO_chd.at[index,'y'] = 0

CT_chd['y'] = pd.Series(dtype=int)

for index, row in CT_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_chd.at[index,'y'] = 1
    else:
        CT_chd.at[index,'y'] = 0

DL_chd['y'] = pd.Series(dtype=int)

for index, row in DL_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_chd.at[index,'y'] = 1
    else:
        DL_chd.at[index,'y'] = 0

DC_chd['y'] = pd.Series(dtype=int)

for index, row in DC_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_chd.at[index,'y'] = 1
    else:
        DC_chd.at[index,'y'] = 0

GA_chd['y'] = pd.Series(dtype=int)

for index, row in GA_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_chd.at[index,'y'] = 1
    else:
        GA_chd.at[index,'y'] = 0

HI_chd['y'] = pd.Series(dtype=int)

for index, row in HI_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_chd.at[index,'y'] = 1
    else:
        HI_chd.at[index,'y'] = 0

ID_chd['y'] = pd.Series(dtype=int)

for index, row in ID_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_chd.at[index,'y'] = 1
    else:
        ID_chd.at[index,'y'] = 0

IL_chd['y'] = pd.Series(dtype=int)

for index, row in IL_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_chd.at[index,'y'] = 1
    else:
        IL_chd.at[index,'y'] = 0

IN_chd['y'] = pd.Series(dtype=int)

for index, row in IN_chd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_chd.at[index,'y'] = 1
    else:
        IN_chd.at[index,'y'] = 0

chd_df['y'] = pd.Series(dtype=int)

for index, row in chd_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        chd_df.at[index,'y'] = 1
    else:
        chd_df.at[index,'y'] = 0


COPD Datasets

In [32]:
AL_copd['y'] = pd.Series(dtype=int)

for index, row in AL_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_copd.at[index,'y'] = 1
    else:
        AL_copd.at[index,'y'] = 0

AZ_copd['y'] = pd.Series(dtype=int)

for index, row in AZ_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_copd.at[index,'y'] = 1
    else:
        AZ_copd.at[index,'y'] = 0

AR_copd['y'] = pd.Series(dtype=int)

for index, row in AR_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_copd.at[index,'y'] = 1
    else:
        AR_copd.at[index,'y'] = 0

CA_copd['y'] = pd.Series(dtype=int)

for index, row in CA_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_copd.at[index,'y'] = 1
    else:
        CA_copd.at[index,'y'] = 0

CO_copd['y'] = pd.Series(dtype=int)

for index, row in CO_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_copd.at[index,'y'] = 1
    else:
        CO_copd.at[index,'y'] = 0

CT_copd['y'] = pd.Series(dtype=int)

for index, row in CT_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_copd.at[index,'y'] = 1
    else:
        CT_copd.at[index,'y'] = 0

DL_copd['y'] = pd.Series(dtype=int)

for index, row in DL_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_copd.at[index,'y'] = 1
    else:
        DL_copd.at[index,'y'] = 0

DC_copd['y'] = pd.Series(dtype=int)

for index, row in DC_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_copd.at[index,'y'] = 1
    else:
        DC_copd.at[index,'y'] = 0

GA_copd['y'] = pd.Series(dtype=int)

for index, row in GA_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_copd.at[index,'y'] = 1
    else:
        GA_copd.at[index,'y'] = 0

HI_copd['y'] = pd.Series(dtype=int)

for index, row in HI_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_copd.at[index,'y'] = 1
    else:
        HI_copd.at[index,'y'] = 0

ID_copd['y'] = pd.Series(dtype=int)

for index, row in ID_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_copd.at[index,'y'] = 1
    else:
        ID_copd.at[index,'y'] = 0

IL_copd['y'] = pd.Series(dtype=int)

for index, row in IL_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_copd.at[index,'y'] = 1
    else:
        IL_copd.at[index,'y'] = 0

IN_copd['y'] = pd.Series(dtype=int)

for index, row in IN_copd.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_copd.at[index,'y'] = 1
    else:
        IN_copd.at[index,'y'] = 0

copd_df['y'] = pd.Series(dtype=int)

for index, row in copd_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        copd_df.at[index,'y'] = 1
    else:
        copd_df.at[index,'y'] = 0


OBESITY Datasets

In [33]:
AL_obesity['y'] = pd.Series(dtype=int)

for index, row in AL_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_obesity.at[index,'y'] = 1
    else:
        AL_obesity.at[index,'y'] = 0

AZ_obesity['y'] = pd.Series(dtype=int)

for index, row in AZ_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_obesity.at[index,'y'] = 1
    else:
        AZ_obesity.at[index,'y'] = 0

AR_obesity['y'] = pd.Series(dtype=int)

for index, row in AR_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_obesity.at[index,'y'] = 1
    else:
        AR_obesity.at[index,'y'] = 0

CA_obesity['y'] = pd.Series(dtype=int)

for index, row in CA_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_obesity.at[index,'y'] = 1
    else:
        CA_obesity.at[index,'y'] = 0

CO_obesity['y'] = pd.Series(dtype=int)

for index, row in CO_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_obesity.at[index,'y'] = 1
    else:
        CO_obesity.at[index,'y'] = 0

CT_obesity['y'] = pd.Series(dtype=int)

for index, row in CT_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_obesity.at[index,'y'] = 1
    else:
        CT_obesity.at[index,'y'] = 0

DL_obesity['y'] = pd.Series(dtype=int)

for index, row in DL_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_obesity.at[index,'y'] = 1
    else:
        DL_obesity.at[index,'y'] = 0

DC_obesity['y'] = pd.Series(dtype=int)

for index, row in DC_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_obesity.at[index,'y'] = 1
    else:
        DC_obesity.at[index,'y'] = 0

GA_obesity['y'] = pd.Series(dtype=int)

for index, row in GA_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_obesity.at[index,'y'] = 1
    else:
        GA_obesity.at[index,'y'] = 0

HI_obesity['y'] = pd.Series(dtype=int)

for index, row in HI_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_obesity.at[index,'y'] = 1
    else:
        HI_obesity.at[index,'y'] = 0

ID_obesity['y'] = pd.Series(dtype=int)

for index, row in ID_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_obesity.at[index,'y'] = 1
    else:
        ID_obesity.at[index,'y'] = 0

IL_obesity['y'] = pd.Series(dtype=int)

for index, row in IL_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_obesity.at[index,'y'] = 1
    else:
        IL_obesity.at[index,'y'] = 0

IN_obesity['y'] = pd.Series(dtype=int)

for index, row in IN_obesity.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_obesity.at[index,'y'] = 1
    else:
        IN_obesity.at[index,'y'] = 0

obesity_df['y'] = pd.Series(dtype=int)

for index, row in obesity_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        obesity_df.at[index,'y'] = 1
    else:
        obesity_df.at[index,'y'] = 0


DIABATES Datasets

In [34]:
AL_diabetes['y'] = pd.Series(dtype=int)

for index, row in AL_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_diabetes.at[index,'y'] = 1
    else:
        AL_diabetes.at[index,'y'] = 0

AZ_diabetes['y'] = pd.Series(dtype=int)

for index, row in AZ_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_diabetes.at[index,'y'] = 1
    else:
        AZ_diabetes.at[index,'y'] = 0

AR_diabetes['y'] = pd.Series(dtype=int)

for index, row in AR_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_diabetes.at[index,'y'] = 1
    else:
        AR_diabetes.at[index,'y'] = 0

CA_diabetes['y'] = pd.Series(dtype=int)

for index, row in CA_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_diabetes.at[index,'y'] = 1
    else:
        CA_diabetes.at[index,'y'] = 0

CO_diabetes['y'] = pd.Series(dtype=int)

for index, row in CO_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_diabetes.at[index,'y'] = 1
    else:
        CO_diabetes.at[index,'y'] = 0

CT_diabetes['y'] = pd.Series(dtype=int)

for index, row in CT_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_diabetes.at[index,'y'] = 1
    else:
        CT_diabetes.at[index,'y'] = 0

DL_diabetes['y'] = pd.Series(dtype=int)

for index, row in DL_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_diabetes.at[index,'y'] = 1
    else:
        DL_diabetes.at[index,'y'] = 0

DC_diabetes['y'] = pd.Series(dtype=int)

for index, row in DC_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_diabetes.at[index,'y'] = 1
    else:
        DC_diabetes.at[index,'y'] = 0

GA_diabetes['y'] = pd.Series(dtype=int)

for index, row in GA_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_diabetes.at[index,'y'] = 1
    else:
        GA_diabetes.at[index,'y'] = 0

HI_diabetes['y'] = pd.Series(dtype=int)

for index, row in HI_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_diabetes.at[index,'y'] = 1
    else:
        HI_diabetes.at[index,'y'] = 0

ID_diabetes['y'] = pd.Series(dtype=int)

for index, row in ID_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_diabetes.at[index,'y'] = 1
    else:
        ID_diabetes.at[index,'y'] = 0

IL_diabetes['y'] = pd.Series(dtype=int)

for index, row in IL_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_diabetes.at[index,'y'] = 1
    else:
        IL_diabetes.at[index,'y'] = 0

IN_diabetes['y'] = pd.Series(dtype=int)

for index, row in IN_diabetes.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_diabetes.at[index,'y'] = 1
    else:
        IN_diabetes.at[index,'y'] = 0

diabetes_df['y'] = pd.Series(dtype=int)

for index, row in diabetes_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        diabetes_df.at[index,'y'] = 1
    else:
        diabetes_df.at[index,'y'] = 0



HEALTH INSURANCE Datasets

In [35]:
AL_insurance['y'] = pd.Series(dtype=int)

for index, row in AL_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_insurance.at[index,'y'] = 1
    else:
        AL_insurance.at[index,'y'] = 0

AZ_insurance['y'] = pd.Series(dtype=int)

for index, row in AZ_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_insurance.at[index,'y'] = 1
    else:
        AZ_insurance.at[index,'y'] = 0

AR_insurance['y'] = pd.Series(dtype=int)

for index, row in AR_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_insurance.at[index,'y'] = 1
    else:
        AR_insurance.at[index,'y'] = 0

CA_insurance['y'] = pd.Series(dtype=int)

for index, row in CA_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_insurance.at[index,'y'] = 1
    else:
        CA_insurance.at[index,'y'] = 0

CO_insurance['y'] = pd.Series(dtype=int)

for index, row in CO_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_insurance.at[index,'y'] = 1
    else:
        CO_insurance.at[index,'y'] = 0

CT_insurance['y'] = pd.Series(dtype=int)

for index, row in CT_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_insurance.at[index,'y'] = 1
    else:
        CT_insurance.at[index,'y'] = 0

DL_insurance['y'] = pd.Series(dtype=int)

for index, row in DL_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_insurance.at[index,'y'] = 1
    else:
        DL_insurance.at[index,'y'] = 0

DC_insurance['y'] = pd.Series(dtype=int)

for index, row in DC_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_insurance.at[index,'y'] = 1
    else:
        DC_insurance.at[index,'y'] = 0

GA_insurance['y'] = pd.Series(dtype=int)

for index, row in GA_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_insurance.at[index,'y'] = 1
    else:
        GA_insurance.at[index,'y'] = 0

HI_insurance['y'] = pd.Series(dtype=int)

for index, row in HI_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_insurance.at[index,'y'] = 1
    else:
        HI_insurance.at[index,'y'] = 0

ID_insurance['y'] = pd.Series(dtype=int)

for index, row in ID_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_insurance.at[index,'y'] = 1
    else:
        ID_insurance.at[index,'y'] = 0

IL_insurance['y'] = pd.Series(dtype=int)

for index, row in IL_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_insurance.at[index,'y'] = 1
    else:
        IL_insurance.at[index,'y'] = 0

IN_insurance['y'] = pd.Series(dtype=int)

for index, row in IN_insurance.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_insurance.at[index,'y'] = 1
    else:
        IN_insurance.at[index,'y'] = 0

insurance_df['y'] = pd.Series(dtype=int)

for index, row in insurance_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        insurance_df.at[index,'y'] = 1
    else:
        insurance_df.at[index,'y'] = 0


ANNUAL CHECK-UP Datasets

In [36]:
AL_checkup['y'] = pd.Series(dtype=int)

for index, row in AL_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AL_checkup.at[index,'y'] = 1
    else:
        AL_checkup.at[index,'y'] = 0

AZ_checkup['y'] = pd.Series(dtype=int)

for index, row in AZ_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AZ_checkup.at[index,'y'] = 1
    else:
        AZ_checkup.at[index,'y'] = 0

AR_checkup['y'] = pd.Series(dtype=int)

for index, row in AR_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        AR_checkup.at[index,'y'] = 1
    else:
        AR_checkup.at[index,'y'] = 0

CA_checkup['y'] = pd.Series(dtype=int)

for index, row in CA_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CA_checkup.at[index,'y'] = 1
    else:
        CA_checkup.at[index,'y'] = 0

CO_checkup['y'] = pd.Series(dtype=int)

for index, row in CO_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CO_checkup.at[index,'y'] = 1
    else:
        CO_checkup.at[index,'y'] = 0

CT_checkup['y'] = pd.Series(dtype=int)

for index, row in CT_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        CT_checkup.at[index,'y'] = 1
    else:
        CT_checkup.at[index,'y'] = 0

DL_checkup['y'] = pd.Series(dtype=int)

for index, row in DL_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DL_checkup.at[index,'y'] = 1
    else:
        DL_checkup.at[index,'y'] = 0

DC_checkup['y'] = pd.Series(dtype=int)

for index, row in DC_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        DC_checkup.at[index,'y'] = 1
    else:
        DC_checkup.at[index,'y'] = 0

GA_checkup['y'] = pd.Series(dtype=int)

for index, row in GA_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        GA_checkup.at[index,'y'] = 1
    else:
        GA_checkup.at[index,'y'] = 0

HI_checkup['y'] = pd.Series(dtype=int)

for index, row in HI_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        HI_checkup.at[index,'y'] = 1
    else:
        HI_checkup.at[index,'y'] = 0

ID_checkup['y'] = pd.Series(dtype=int)

for index, row in ID_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        ID_checkup.at[index,'y'] = 1
    else:
        ID_checkup.at[index,'y'] = 0

IL_checkup['y'] = pd.Series(dtype=int)

for index, row in IL_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IL_checkup.at[index,'y'] = 1
    else:
        IL_checkup.at[index,'y'] = 0

IN_checkup['y'] = pd.Series(dtype=int)

for index, row in IN_checkup.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        IN_checkup.at[index,'y'] = 1
    else:
        IN_checkup.at[index,'y'] = 0

checkup_df['y'] = pd.Series(dtype=int)

for index, row in checkup_df.iterrows():
    if (row['data_value_x'] < row['data_value_y']):
        checkup_df.at[index,'y'] = 1
    else:
        checkup_df.at[index,'y'] = 0


Using the KNN Model

In [43]:
AL_cancer.to_json('new_databases/AL_cancer.json')
AZ_cancer.to_json('new_databases/AZ_cancer.json')
AR_cancer.to_json('new_databases/AR_cancer.json')
CA_cancer.to_json('new_databases/CA_cancer.json')
CO_cancer.to_json('new_databases/CO_cancer.json')
CT_cancer.to_json('new_databases/CT_cancer.json')
DL_cancer.to_json('new_databases/DL_cancer.json')
DC_cancer.to_json('new_databases/DC_cancer.json')
GA_cancer.to_json('new_databases/GA_cancer.json')
HI_cancer.to_json('new_databases/HI_cancer.json')
ID_cancer.to_json('new_databases/ID_cancer.json')
IL_cancer.to_json('new_databases/IL_cancer.json')
IN_cancer.to_json('new_databases/IN_cancer.json')

In [44]:
AL_chd.to_json('new_databases/AL_chd.json')
AZ_chd.to_json('new_databases/AZ_chd.json')
AR_chd.to_json('new_databases/AR_chd.json')
CA_chd.to_json('new_databases/CA_chd.json')
CO_chd.to_json('new_databases/CO_chd.json')
CT_chd.to_json('new_databases/CT_chd.json')
DL_chd.to_json('new_databases/DL_chd.json')
DC_chd.to_json('new_databases/DC_chd.json')
GA_chd.to_json('new_databases/GA_chd.json')
HI_chd.to_json('new_databases/HI_chd.json')
ID_chd.to_json('new_databases/ID_chd.json')
IL_chd.to_json('new_databases/IL_chd.json')
IN_chd.to_json('new_databases/IN_chd.json')

In [45]:
AL_copd.to_json('new_databases/AL_copd.json')
AZ_copd.to_json('new_databases/AZ_copd.json')
AR_copd.to_json('new_databases/AR_copd.json')
CA_copd.to_json('new_databases/CA_copd.json')
CO_copd.to_json('new_databases/CO_copd.json')
CT_copd.to_json('new_databases/CT_copd.json')
DL_copd.to_json('new_databases/DL_copd.json')
DC_copd.to_json('new_databases/DC_copd.json')
GA_copd.to_json('new_databases/GA_copd.json')
HI_copd.to_json('new_databases/HI_copd.json')
ID_copd.to_json('new_databases/ID_copd.json')
IL_copd.to_json('new_databases/IL_copd.json')
IN_copd.to_json('new_databases/IN_copd.json')

In [46]:
AL_obesity.to_json('new_databases/AL_obesity.json')
AZ_obesity.to_json('new_databases/AZ_obesity.json')
AR_obesity.to_json('new_databases/AR_obesity.json')
CA_obesity.to_json('new_databases/CA_obesity.json')
CO_obesity.to_json('new_databases/CO_obesity.json')
CT_obesity.to_json('new_databases/CT_obesity.json')
DL_obesity.to_json('new_databases/DL_obesity.json')
DC_obesity.to_json('new_databases/DC_obesity.json')
GA_obesity.to_json('new_databases/GA_obesity.json')
HI_obesity.to_json('new_databases/HI_obesity.json')
ID_obesity.to_json('new_databases/ID_obesity.json')
IL_obesity.to_json('new_databases/IL_obesity.json')
IN_obesity.to_json('new_databases/IN_obesity.json')

In [47]:
AL_diabetes.to_json('new_databases/AL_diabetes.json')
AZ_diabetes.to_json('new_databases/AZ_diabetes.json')
AR_diabetes.to_json('new_databases/AR_diabetes.json')
CA_diabetes.to_json('new_databases/CA_diabetes.json')
CO_diabetes.to_json('new_databases/CO_diabetes.json')
CT_diabetes.to_json('new_databases/CT_diabetes.json')
DL_diabetes.to_json('new_databases/DL_diabetes.json')
DC_diabetes.to_json('new_databases/DC_diabetes.json')
GA_diabetes.to_json('new_databases/GA_diabetes.json')
HI_diabetes.to_json('new_databases/HI_diabetes.json')
ID_diabetes.to_json('new_databases/ID_diabetes.json')
IL_diabetes.to_json('new_databases/IL_diabetes.json')
IN_diabetes.to_json('new_databases/IN_diabetes.json')

In [48]:
AL_insurance.to_json('new_databases/AL_insurance.json')
AZ_insurance.to_json('new_databases/AZ_insurance.json')
AR_insurance.to_json('new_databases/AR_insurance.json')
CA_insurance.to_json('new_databases/CA_insurance.json')
CO_insurance.to_json('new_databases/CO_insurance.json')
CT_insurance.to_json('new_databases/CT_insurance.json')
DL_insurance.to_json('new_databases/DL_insurance.json')
DC_insurance.to_json('new_databases/DC_insurance.json')
GA_insurance.to_json('new_databases/GA_insurance.json')
HI_insurance.to_json('new_databases/HI_insurance.json')
ID_insurance.to_json('new_databases/ID_insurance.json')
IL_insurance.to_json('new_databases/IL_insurance.json')
IN_insurance.to_json('new_databases/IN_insurance.json')

In [49]:
AL_checkup.to_json('new_databases/AL_checkup.json')
AZ_checkup.to_json('new_databases/AZ_checkup.json')
AR_checkup.to_json('new_databases/AR_checkup.json')
CA_checkup.to_json('new_databases/CA_checkup.json')
CO_checkup.to_json('new_databases/CO_checkup.json')
CT_checkup.to_json('new_databases/CT_checkup.json')
DL_checkup.to_json('new_databases/DL_checkup.json')
DC_checkup.to_json('new_databases/DC_checkup.json')
GA_checkup.to_json('new_databases/GA_checkup.json')
HI_checkup.to_json('new_databases/HI_checkup.json')
ID_checkup.to_json('new_databases/ID_checkup.json')
IL_checkup.to_json('new_databases/IL_checkup.json')
IN_checkup.to_json('new_databases/IN_checkup.json')

In [36]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, classification_report

knn = KNeighborsClassifier(n_neighbors=3)

Cancer

Alabama

In [37]:
X = AL_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.993006993006993
[[  0   0]
 [  1 142]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.99      1.00      1.00       142

    accuracy                           0.99       143
   macro avg       0.50      0.50      0.50       143
weighted avg       0.99      0.99      0.99       143



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Arizona

In [38]:
X = AZ_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[99]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        99

    accuracy                           1.00        99
   macro avg       1.00      1.00      1.00        99
weighted avg       1.00      1.00      1.00        99



Arkansas

In [39]:
X = AR_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[131]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00       131

    accuracy                           1.00       131
   macro avg       1.00      1.00      1.00       131
weighted avg       1.00      1.00      1.00       131



California

In [40]:
X = CA_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9863013698630136
[[  6   0]
 [  5 354]]
              precision    recall  f1-score   support

         0.0       1.00      0.55      0.71        11
         1.0       0.99      1.00      0.99       354

    accuracy                           0.99       365
   macro avg       0.99      0.77      0.85       365
weighted avg       0.99      0.99      0.98       365



Colorado

In [41]:
X = CO_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9902912621359223
[[16  0]
 [ 1 86]]
              precision    recall  f1-score   support

         0.0       1.00      0.94      0.97        17
         1.0       0.99      1.00      0.99        86

    accuracy                           0.99       103
   macro avg       0.99      0.97      0.98       103
weighted avg       0.99      0.99      0.99       103



Connecticut

In [42]:
X = CT_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8611111111111112
[[ 4  3]
 [ 2 27]]
              precision    recall  f1-score   support

         0.0       0.57      0.67      0.62         6
         1.0       0.93      0.90      0.92        30

    accuracy                           0.86        36
   macro avg       0.75      0.78      0.77        36
weighted avg       0.87      0.86      0.87        36



Delaware

In [43]:
X = DL_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[19]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        19

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



District of Columbia

In [44]:
X = DC_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9935483870967742
[[  0   0]
 [  1 154]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.99      1.00      1.00       154

    accuracy                           0.99       155
   macro avg       0.50      0.50      0.50       155
weighted avg       0.99      0.99      0.99       155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hawaii

In [None]:
X = HI_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[38]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        38

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



Idaho

In [None]:
X = ID_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[52]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        52

    accuracy                           1.00        52
   macro avg       1.00      1.00      1.00        52
weighted avg       1.00      1.00      1.00        52



Illinois

In [None]:
X = IL_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.991044776119403
[[  6   1]
 [  2 326]]
              precision    recall  f1-score   support

         0.0       0.86      0.75      0.80         8
         1.0       0.99      1.00      1.00       327

    accuracy                           0.99       335
   macro avg       0.93      0.87      0.90       335
weighted avg       0.99      0.99      0.99       335



Indiana

In [None]:
X = IN_cancer.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_cancer['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[66]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        66

    accuracy                           1.00        66
   macro avg       1.00      1.00      1.00        66
weighted avg       1.00      1.00      1.00        66



All Data

In [None]:
X = cancer_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = cancer_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9876462938881665
[[   1    0]
 [  19 1518]]
              precision    recall  f1-score   support

         0.0       1.00      0.05      0.10        20
         1.0       0.99      1.00      0.99      1518

    accuracy                           0.99      1538
   macro avg       0.99      0.53      0.54      1538
weighted avg       0.99      0.99      0.98      1538



Coronary Heart Disease Datasets

Alabama

In [None]:
X = AL_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[143]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       143

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



Arizona

In [None]:
X = AZ_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9090909090909091
[[87  7]
 [ 2  3]]
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95        89
         1.0       0.60      0.30      0.40        10

    accuracy                           0.91        99
   macro avg       0.76      0.64      0.68        99
weighted avg       0.89      0.91      0.90        99



Arkansas

In [None]:
X = AR_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9847328244274809
[[129   2]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       129
         1.0       0.00      0.00      0.00         2

    accuracy                           0.98       131
   macro avg       0.49      0.50      0.50       131
weighted avg       0.97      0.98      0.98       131



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


California

In [None]:
X = CA_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[364   0]
 [  0   1]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       364
         1.0       1.00      1.00      1.00         1

    accuracy                           1.00       365
   macro avg       1.00      1.00      1.00       365
weighted avg       1.00      1.00      1.00       365



Colorado

In [None]:
X = CO_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9805825242718447
[[101   2]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.98      1.00      0.99       101
         1.0       0.00      0.00      0.00         2

    accuracy                           0.98       103
   macro avg       0.49      0.50      0.50       103
weighted avg       0.96      0.98      0.97       103



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Connecticut

In [None]:
X = CT_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[36]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        36

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



Delaware

In [None]:
X = DL_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[19]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        19

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



District of Columbia

In [None]:
X = DC_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9935483870967742
[[154   1]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       154
         1.0       0.00      0.00      0.00         1

    accuracy                           0.99       155
   macro avg       0.50      0.50      0.50       155
weighted avg       0.99      0.99      0.99       155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hawaii

In [None]:
X = HI_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[38]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        38

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



Idaho

In [None]:
X = ID_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[52]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        52

    accuracy                           1.00        52
   macro avg       1.00      1.00      1.00        52
weighted avg       1.00      1.00      1.00        52



Illinois

In [None]:
X = IL_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[335]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       335

    accuracy                           1.00       335
   macro avg       1.00      1.00      1.00       335
weighted avg       1.00      1.00      1.00       335



Indiana

In [None]:
X = IN_chd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_chd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[64]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        64

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64



All Data

In [None]:
X = chd_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = chd_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8679245283018868
[[   9   40]
 [ 163 1325]]
              precision    recall  f1-score   support

         0.0       0.18      0.05      0.08       172
         1.0       0.89      0.97      0.93      1365

    accuracy                           0.87      1537
   macro avg       0.54      0.51      0.51      1537
weighted avg       0.81      0.87      0.83      1537



COPD Datasets

Alabama

In [None]:
X = AL_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.986013986013986
[[141   2]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      0.99       141
         1.0       0.00      0.00      0.00         2

    accuracy                           0.99       143
   macro avg       0.49      0.50      0.50       143
weighted avg       0.97      0.99      0.98       143



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Arizona

In [None]:
X = AZ_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9090909090909091
[[76  8]
 [ 1 14]]
              precision    recall  f1-score   support

         0.0       0.90      0.99      0.94        77
         1.0       0.93      0.64      0.76        22

    accuracy                           0.91        99
   macro avg       0.92      0.81      0.85        99
weighted avg       0.91      0.91      0.90        99



Arkansas

In [None]:
X = AR_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8854961832061069
[[95 14]
 [ 1 21]]
              precision    recall  f1-score   support

         0.0       0.87      0.99      0.93        96
         1.0       0.95      0.60      0.74        35

    accuracy                           0.89       131
   macro avg       0.91      0.79      0.83       131
weighted avg       0.89      0.89      0.88       131



California

In [None]:
X = CA_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9945205479452055
[[363   2]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       363
         1.0       0.00      0.00      0.00         2

    accuracy                           0.99       365
   macro avg       0.50      0.50      0.50       365
weighted avg       0.99      0.99      0.99       365



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Colorado

In [None]:
X = CO_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8932038834951457
[[ 3  0]
 [11 89]]
              precision    recall  f1-score   support

         0.0       1.00      0.21      0.35        14
         1.0       0.89      1.00      0.94        89

    accuracy                           0.89       103
   macro avg       0.95      0.61      0.65       103
weighted avg       0.90      0.89      0.86       103



Connecticut

In [None]:
X = CT_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8055555555555556
[[23  4]
 [ 3  6]]
              precision    recall  f1-score   support

         0.0       0.85      0.88      0.87        26
         1.0       0.67      0.60      0.63        10

    accuracy                           0.81        36
   macro avg       0.76      0.74      0.75        36
weighted avg       0.80      0.81      0.80        36



Delaware

In [None]:
X = DL_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[19]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        19

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



District of Columbia

In [None]:
X = DC_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9096774193548387
[[139  11]
 [  3   2]]
              precision    recall  f1-score   support

         0.0       0.93      0.98      0.95       142
         1.0       0.40      0.15      0.22        13

    accuracy                           0.91       155
   macro avg       0.66      0.57      0.59       155
weighted avg       0.88      0.91      0.89       155



Hawaii

In [None]:
X = HI_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.7368421052631579
[[18  7]
 [ 3 10]]
              precision    recall  f1-score   support

         0.0       0.72      0.86      0.78        21
         1.0       0.77      0.59      0.67        17

    accuracy                           0.74        38
   macro avg       0.74      0.72      0.72        38
weighted avg       0.74      0.74      0.73        38



Idaho

In [None]:
X = ID_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9615384615384616
[[ 0  0]
 [ 2 50]]
              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         2
         1.0       0.96      1.00      0.98        50

    accuracy                           0.96        52
   macro avg       0.48      0.50      0.49        52
weighted avg       0.92      0.96      0.94        52



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Illinois

In [None]:
X = IL_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9671641791044776
[[  1   2]
 [  9 323]]
              precision    recall  f1-score   support

         0.0       0.33      0.10      0.15        10
         1.0       0.97      0.99      0.98       325

    accuracy                           0.97       335
   macro avg       0.65      0.55      0.57       335
weighted avg       0.95      0.97      0.96       335



Indiana

In [None]:
X = IN_copd.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_copd['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.890625
[[ 3  2]
 [ 5 54]]
              precision    recall  f1-score   support

         0.0       0.60      0.38      0.46         8
         1.0       0.92      0.96      0.94        56

    accuracy                           0.89        64
   macro avg       0.76      0.67      0.70        64
weighted avg       0.88      0.89      0.88        64



All Data

In [None]:
X = copd_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = copd_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8679245283018868
[[   9   40]
 [ 163 1325]]
              precision    recall  f1-score   support

         0.0       0.18      0.05      0.08       172
         1.0       0.89      0.97      0.93      1365

    accuracy                           0.87      1537
   macro avg       0.54      0.51      0.51      1537
weighted avg       0.81      0.87      0.83      1537



Obesity

Alabama

In [None]:
X = AL_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9790209790209791
[[51  1]
 [ 2 89]]
              precision    recall  f1-score   support

         0.0       0.98      0.96      0.97        53
         1.0       0.98      0.99      0.98        90

    accuracy                           0.98       143
   macro avg       0.98      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



Arizona

In [None]:
X = AZ_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9696969696969697
[[11  1]
 [ 2 85]]
              precision    recall  f1-score   support

         0.0       0.92      0.85      0.88        13
         1.0       0.98      0.99      0.98        86

    accuracy                           0.97        99
   macro avg       0.95      0.92      0.93        99
weighted avg       0.97      0.97      0.97        99



Arkansas

In [None]:
X = AR_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9618320610687023
[[44  1]
 [ 4 82]]
              precision    recall  f1-score   support

         0.0       0.98      0.92      0.95        48
         1.0       0.95      0.99      0.97        83

    accuracy                           0.96       131
   macro avg       0.97      0.95      0.96       131
weighted avg       0.96      0.96      0.96       131



Arkansas

In [None]:
X = AR_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9618320610687023
[[44  1]
 [ 4 82]]
              precision    recall  f1-score   support

         0.0       0.98      0.92      0.95        48
         1.0       0.95      0.99      0.97        83

    accuracy                           0.96       131
   macro avg       0.97      0.95      0.96       131
weighted avg       0.96      0.96      0.96       131



California

In [None]:
X = CA_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9424657534246575
[[192  13]
 [  8 152]]
              precision    recall  f1-score   support

         0.0       0.94      0.96      0.95       200
         1.0       0.95      0.92      0.94       165

    accuracy                           0.94       365
   macro avg       0.94      0.94      0.94       365
weighted avg       0.94      0.94      0.94       365



Colorado

In [None]:
X = CO_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9611650485436893
[[24  1]
 [ 3 75]]
              precision    recall  f1-score   support

         0.0       0.96      0.89      0.92        27
         1.0       0.96      0.99      0.97        76

    accuracy                           0.96       103
   macro avg       0.96      0.94      0.95       103
weighted avg       0.96      0.96      0.96       103



Delaware

In [None]:
X = DL_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[ 7  0]
 [ 0 12]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         7
         1.0       1.00      1.00      1.00        12

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



District of Columbia

In [None]:
X = DC_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9354838709677419
[[64  2]
 [ 8 81]]
              precision    recall  f1-score   support

         0.0       0.97      0.89      0.93        72
         1.0       0.91      0.98      0.94        83

    accuracy                           0.94       155
   macro avg       0.94      0.93      0.93       155
weighted avg       0.94      0.94      0.94       155



Hawaii

In [None]:
X = HI_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[38]]
              precision    recall  f1-score   support

         1.0       1.00      1.00      1.00        38

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



Idaho

In [None]:
X = ID_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8846153846153846
[[23  6]
 [ 0 23]]
              precision    recall  f1-score   support

         0.0       0.79      1.00      0.88        23
         1.0       1.00      0.79      0.88        29

    accuracy                           0.88        52
   macro avg       0.90      0.90      0.88        52
weighted avg       0.91      0.88      0.88        52



Diabetes Datasets

Illinois

In [None]:
X = IL_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9880597014925373
[[ 62   1]
 [  3 269]]
              precision    recall  f1-score   support

         0.0       0.98      0.95      0.97        65
         1.0       0.99      1.00      0.99       270

    accuracy                           0.99       335
   macro avg       0.99      0.98      0.98       335
weighted avg       0.99      0.99      0.99       335



Indiana

In [None]:
X = IN_obesity.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_obesity['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9682539682539683
[[34  2]
 [ 0 27]]
              precision    recall  f1-score   support

         0.0       0.94      1.00      0.97        34
         1.0       1.00      0.93      0.96        29

    accuracy                           0.97        63
   macro avg       0.97      0.97      0.97        63
weighted avg       0.97      0.97      0.97        63



All Data

In [None]:
X = obesity_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = obesity_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.6197916666666666
[[153 236]
 [348 799]]
              precision    recall  f1-score   support

         0.0       0.39      0.31      0.34       501
         1.0       0.70      0.77      0.73      1035

    accuracy                           0.62      1536
   macro avg       0.54      0.54      0.54      1536
weighted avg       0.60      0.62      0.61      1536



Diabetes Datasets

Alabama

In [None]:
X = AL_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8881118881118881
[[50  2]
 [14 77]]
              precision    recall  f1-score   support

         0.0       0.96      0.78      0.86        64
         1.0       0.85      0.97      0.91        79

    accuracy                           0.89       143
   macro avg       0.90      0.88      0.88       143
weighted avg       0.90      0.89      0.89       143



Arizona

In [None]:
X = AZ_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8383838383838383
[[68  6]
 [10 15]]
              precision    recall  f1-score   support

         0.0       0.92      0.87      0.89        78
         1.0       0.60      0.71      0.65        21

    accuracy                           0.84        99
   macro avg       0.76      0.79      0.77        99
weighted avg       0.85      0.84      0.84        99



Arkansas

In [None]:
X = AR_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8625954198473282
[[74  8]
 [10 39]]
              precision    recall  f1-score   support

         0.0       0.90      0.88      0.89        84
         1.0       0.80      0.83      0.81        47

    accuracy                           0.86       131
   macro avg       0.85      0.86      0.85       131
weighted avg       0.86      0.86      0.86       131



California

In [None]:
X = CA_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9698630136986301
[[ 25   1]
 [ 10 329]]
              precision    recall  f1-score   support

         0.0       0.96      0.71      0.82        35
         1.0       0.97      1.00      0.98       330

    accuracy                           0.97       365
   macro avg       0.97      0.86      0.90       365
weighted avg       0.97      0.97      0.97       365



Colorado

In [None]:
X = CO_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.912621359223301
[[82  8]
 [ 1 12]]
              precision    recall  f1-score   support

         0.0       0.91      0.99      0.95        83
         1.0       0.92      0.60      0.73        20

    accuracy                           0.91       103
   macro avg       0.92      0.79      0.84       103
weighted avg       0.91      0.91      0.91       103



Connecticut

In [None]:
X = CT_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9444444444444444
[[ 2  0]
 [ 2 32]]
              precision    recall  f1-score   support

         0.0       1.00      0.50      0.67         4
         1.0       0.94      1.00      0.97        32

    accuracy                           0.94        36
   macro avg       0.97      0.75      0.82        36
weighted avg       0.95      0.94      0.94        36



Delaware

In [None]:
X = DL_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8421052631578947
[[9 0]
 [3 7]]
              precision    recall  f1-score   support

         0.0       1.00      0.75      0.86        12
         1.0       0.70      1.00      0.82         7

    accuracy                           0.84        19
   macro avg       0.85      0.88      0.84        19
weighted avg       0.89      0.84      0.84        19



District of Columbia

In [None]:
X = DC_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.896774193548387
[[66  9]
 [ 7 73]]
              precision    recall  f1-score   support

         0.0       0.88      0.90      0.89        73
         1.0       0.91      0.89      0.90        82

    accuracy                           0.90       155
   macro avg       0.90      0.90      0.90       155
weighted avg       0.90      0.90      0.90       155



Hawaii

In [None]:
X = HI_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9736842105263158
[[29  1]
 [ 0  8]]
              precision    recall  f1-score   support

         0.0       0.97      1.00      0.98        29
         1.0       1.00      0.89      0.94         9

    accuracy                           0.97        38
   macro avg       0.98      0.94      0.96        38
weighted avg       0.97      0.97      0.97        38



Idaho

In [None]:
X = ID_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8653846153846154
[[ 5  2]
 [ 5 40]]
              precision    recall  f1-score   support

         0.0       0.71      0.50      0.59        10
         1.0       0.89      0.95      0.92        42

    accuracy                           0.87        52
   macro avg       0.80      0.73      0.75        52
weighted avg       0.86      0.87      0.86        52



Illinois

In [None]:
X = IL_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9761194029850746
[[ 23   2]
 [  6 304]]
              precision    recall  f1-score   support

         0.0       0.92      0.79      0.85        29
         1.0       0.98      0.99      0.99       306

    accuracy                           0.98       335
   macro avg       0.95      0.89      0.92       335
weighted avg       0.98      0.98      0.98       335



Indiana

In [None]:
X = IN_diabetes.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_diabetes['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.90625
[[11  3]
 [ 3 47]]
              precision    recall  f1-score   support

         0.0       0.79      0.79      0.79        14
         1.0       0.94      0.94      0.94        50

    accuracy                           0.91        64
   macro avg       0.86      0.86      0.86        64
weighted avg       0.91      0.91      0.91        64



All Data

In [None]:
X = diabetes_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = diabetes_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.8165256994144438
[[  19   69]
 [ 213 1236]]
              precision    recall  f1-score   support

         0.0       0.22      0.08      0.12       232
         1.0       0.85      0.95      0.90      1305

    accuracy                           0.82      1537
   macro avg       0.53      0.51      0.51      1537
weighted avg       0.76      0.82      0.78      1537



Health Insurance Datasets

Alabama

In [None]:
X = AL_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[143]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       143

    accuracy                           1.00       143
   macro avg       1.00      1.00      1.00       143
weighted avg       1.00      1.00      1.00       143



Arizona

In [None]:
X = AZ_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9595959595959596
[[93  3]
 [ 1  2]]
              precision    recall  f1-score   support

         0.0       0.97      0.99      0.98        94
         1.0       0.67      0.40      0.50         5

    accuracy                           0.96        99
   macro avg       0.82      0.69      0.74        99
weighted avg       0.95      0.96      0.95        99



Arkansas

In [None]:
X = AR_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9923664122137404
[[130   1]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       130
         1.0       0.00      0.00      0.00         1

    accuracy                           0.99       131
   macro avg       0.50      0.50      0.50       131
weighted avg       0.98      0.99      0.99       131



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


California

In [None]:
X = CA_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9917808219178083
[[362   3]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       362
         1.0       0.00      0.00      0.00         3

    accuracy                           0.99       365
   macro avg       0.50      0.50      0.50       365
weighted avg       0.98      0.99      0.99       365



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Colorado

In [None]:
X = CO_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9805825242718447
[[101   1]
 [  1   0]]
              precision    recall  f1-score   support

         0.0       0.99      0.99      0.99       102
         1.0       0.00      0.00      0.00         1

    accuracy                           0.98       103
   macro avg       0.50      0.50      0.50       103
weighted avg       0.98      0.98      0.98       103



Connecticut

In [None]:
X = CT_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[36]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        36

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



Delaware

In [None]:
X = DL_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[19]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        19

    accuracy                           1.00        19
   macro avg       1.00      1.00      1.00        19
weighted avg       1.00      1.00      1.00        19



District of Columbia

In [None]:
X = DC_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9935483870967742
[[154   1]
 [  0   0]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       154
         1.0       0.00      0.00      0.00         1

    accuracy                           0.99       155
   macro avg       0.50      0.50      0.50       155
weighted avg       0.99      0.99      0.99       155



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Hawaii

In [None]:
X = HI_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[38]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        38

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



Idaho

In [None]:
X = ID_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[52]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        52

    accuracy                           1.00        52
   macro avg       1.00      1.00      1.00        52
weighted avg       1.00      1.00      1.00        52



Illinois

In [None]:
X = IL_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[335]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00       335

    accuracy                           1.00       335
   macro avg       1.00      1.00      1.00       335
weighted avg       1.00      1.00      1.00       335



Indiana

In [None]:
X = IN_insurance.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_insurance['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[64]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        64

    accuracy                           1.00        64
   macro avg       1.00      1.00      1.00        64
weighted avg       1.00      1.00      1.00        64



All Data

In [None]:
X = insurance_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = insurance_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9993489583333334
[[1535    1]
 [   0    0]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00      1535
         1.0       0.00      0.00      0.00         1

    accuracy                           1.00      1536
   macro avg       0.50      0.50      0.50      1536
weighted avg       1.00      1.00      1.00      1536



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Annual Check-up Datasets

Alabama

In [None]:
X = AL_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AL_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9440559440559441
[[62  7]
 [ 1 73]]
              precision    recall  f1-score   support

         0.0       0.90      0.98      0.94        63
         1.0       0.99      0.91      0.95        80

    accuracy                           0.94       143
   macro avg       0.94      0.95      0.94       143
weighted avg       0.95      0.94      0.94       143



Arizona

In [None]:
X = AZ_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AZ_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.98989898989899
[[91  0]
 [ 1  7]]
              precision    recall  f1-score   support

         0.0       1.00      0.99      0.99        92
         1.0       0.88      1.00      0.93         7

    accuracy                           0.99        99
   macro avg       0.94      0.99      0.96        99
weighted avg       0.99      0.99      0.99        99



Arkansas

In [None]:
X = AR_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = AR_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9465648854961832
[[64  5]
 [ 2 60]]
              precision    recall  f1-score   support

         0.0       0.93      0.97      0.95        66
         1.0       0.97      0.92      0.94        65

    accuracy                           0.95       131
   macro avg       0.95      0.95      0.95       131
weighted avg       0.95      0.95      0.95       131



California

In [None]:
X = CA_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CA_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9917808219178083
[[359   2]
 [  1   3]]
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00       360
         1.0       0.75      0.60      0.67         5

    accuracy                           0.99       365
   macro avg       0.87      0.80      0.83       365
weighted avg       0.99      0.99      0.99       365



Colorado

In [None]:
X = CO_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CO_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9805825242718447
[[64  0]
 [ 2 37]]
              precision    recall  f1-score   support

         0.0       1.00      0.97      0.98        66
         1.0       0.95      1.00      0.97        37

    accuracy                           0.98       103
   macro avg       0.97      0.98      0.98       103
weighted avg       0.98      0.98      0.98       103



Connecticut

In [None]:
X = CT_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = CT_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[24  0]
 [ 0 12]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        24
         1.0       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



Delaware

In [None]:
X = DL_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DL_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.7894736842105263
[[13  4]
 [ 0  2]]
              precision    recall  f1-score   support

         0.0       0.76      1.00      0.87        13
         1.0       1.00      0.33      0.50         6

    accuracy                           0.79        19
   macro avg       0.88      0.67      0.68        19
weighted avg       0.84      0.79      0.75        19



District of Columbia

In [None]:
X = DC_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = DC_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

ValueError: With n_samples=1, test_size=0.25 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.

Georgia

In [None]:
X = GA_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = GA_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9548387096774194
[[66  6]
 [ 1 82]]
              precision    recall  f1-score   support

         0.0       0.92      0.99      0.95        67
         1.0       0.99      0.93      0.96        88

    accuracy                           0.95       155
   macro avg       0.95      0.96      0.95       155
weighted avg       0.96      0.95      0.95       155



Hawaii

In [None]:
X = HI_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = HI_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[38]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00        38

    accuracy                           1.00        38
   macro avg       1.00      1.00      1.00        38
weighted avg       1.00      1.00      1.00        38



Idaho

In [None]:
X = ID_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = ID_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

1.0
[[ 4  0]
 [ 0 48]]
              precision    recall  f1-score   support

         0.0       1.00      1.00      1.00         4
         1.0       1.00      1.00      1.00        48

    accuracy                           1.00        52
   macro avg       1.00      1.00      1.00        52
weighted avg       1.00      1.00      1.00        52



Illinois

In [None]:
X = IL_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IL_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9880597014925373
[[ 19   1]
 [  3 312]]
              precision    recall  f1-score   support

         0.0       0.95      0.86      0.90        22
         1.0       0.99      1.00      0.99       313

    accuracy                           0.99       335
   macro avg       0.97      0.93      0.95       335
weighted avg       0.99      0.99      0.99       335



Indiana

In [None]:
X = IN_checkup.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = IN_checkup['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.9365079365079365
[[23  2]
 [ 2 36]]
              precision    recall  f1-score   support

         0.0       0.92      0.92      0.92        25
         1.0       0.95      0.95      0.95        38

    accuracy                           0.94        63
   macro avg       0.93      0.93      0.93        63
weighted avg       0.94      0.94      0.94        63



All Data

In [None]:
X = checkup_df.copy()
X.drop(['stateabbr_x','statedesc_x','locationname','datasource_x','category_x','measure_x','data_value_unit_x',
        'data_value_type_x','low_confidence_limit_x','high_confidence_limit_x','totalpopulation_x','geolocation_x','short_question_text_x',
        'low_confidence_limit_y','high_confidence_limit_y','short_question_text_y','y'],axis=1, inplace=True)

y = checkup_df['y'].values

X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=78)

scaler = StandardScaler()
X_scaler = scaler.fit(X_train)
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

knn.fit(X_train_scaled,y_train)

prediction = knn.predict(X_test_scaled)

print(accuracy_score(y_test,prediction))
print(confusion_matrix(prediction,y_test))
print(classification_report(y_test,prediction))

0.6927083333333334
[[ 96 173]
 [299 968]]
              precision    recall  f1-score   support

         0.0       0.36      0.24      0.29       395
         1.0       0.76      0.85      0.80      1141

    accuracy                           0.69      1536
   macro avg       0.56      0.55      0.55      1536
weighted avg       0.66      0.69      0.67      1536

