In [1]:
import math
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

# Cleaning

1. Reduce the dataset by drawing a smaller random sample of the rows.
2. Min-max scale the features so that columns with different ranges (e.g. BMI) are comparable.
3. Add an intercept (bias) column to the features.
4. Split the data into training and test sets.

In [2]:
# Load the BRFSS 2015 diabetes health-indicators CSV and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='../archive/diabetes_012_health_indicators_BRFSS2015.csv',
)
df.head(n=5)

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,0.0,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,...,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,0.0,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,...,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,0.0,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,...,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,0.0,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,0.0,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,...,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0


In [3]:
# The CSV is read with float columns (e.g. 40.0, 1.0 in the preview above);
# cast every column to int, since none is meant to be float or categorical.
df = df.astype(dtype=int)

In [12]:
# Reduce the dataset by drawing a reproducible random sample of the rows.
# The recorded shape, (50736, 22), corresponds to a 20% sample, so sample_size
# holds the fraction that was actually used. Both settings are now passed to
# the sampling call instead of being shadowed by hard-coded literals.
sample_size = 0.2
random_state = 1

df_small = df.sample(frac=sample_size, random_state=random_state)
df_small.shape

(50736, 22)

In [14]:
# Feature matrix: every column of the sample except the target label.
df_feat = df_small.drop(columns=["Diabetes_012"])
df_feat.head(n=5)

Unnamed: 0,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
235899,1,0,1,23,1,0,0,1,1,1,...,1,0,2,0,0,0,0,13,6,6
74852,0,0,1,22,1,0,1,1,1,1,...,1,0,2,0,5,0,1,9,6,8
8205,1,1,1,26,1,0,0,1,1,1,...,1,0,2,0,0,0,0,11,4,6
127632,1,0,1,39,0,0,0,1,1,1,...,1,0,4,0,0,0,1,11,6,5
32021,0,0,1,22,1,0,0,0,1,1,...,1,0,1,0,0,0,0,10,6,8


In [18]:
# Target vector: the Diabetes_012 label column of the sample.
df_y = df_small["Diabetes_012"]
df_y

235899    0
74852     2
8205      0
127632    2
32021     0
         ..
52177     0
94042     2
188075    0
16561     0
38201     0
Name: Diabetes_012, Length: 50736, dtype: int64

In [19]:
# Min-max scale every feature column, then round to 3 decimal places.
# NOTE(review): the scaler is fit on all sampled rows, including those that
# later land in the test split -- consider fitting on the training rows only.
scaler = MinMaxScaler()
scaler.fit(df_feat)
X_scale = np.round(scaler.transform(df_feat), 3)

In [20]:
# Append a column of ones so a model can learn an intercept/bias term.
# Only the first df_feat.shape[1] columns (the scaled features) are kept before
# stacking, so re-running this cell does not keep appending extra bias columns.
n_features = df_feat.shape[1]
bias = np.ones((X_scale.shape[0], 1), dtype=X_scale.dtype)
X_scale = np.hstack((X_scale[:, :n_features], bias))
X_scale

array([[1.   , 0.   , 1.   , ..., 1.   , 0.714, 1.   ],
       [0.   , 0.   , 1.   , ..., 1.   , 1.   , 1.   ],
       [1.   , 1.   , 1.   , ..., 0.6  , 0.714, 1.   ],
       ...,
       [1.   , 0.   , 1.   , ..., 0.8  , 0.   , 1.   ],
       [1.   , 1.   , 1.   , ..., 0.8  , 1.   , 1.   ],
       [0.   , 0.   , 1.   , ..., 1.   , 1.   , 1.   ]])

In [23]:
# Split the scaled features and labels into training (70%) and test (30%) sets.
y = df_y.to_numpy()

# Seed the shuffle with the notebook's random_state (set in the sampling cell)
# so the split is identical on every Restart & Run All; without it,
# train_test_split draws a different split on each run.
# NOTE(review): consider stratify=y if the label classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X_scale, y, test_size=0.3, random_state=random_state
)

# we would then train the model on the training set
X_train, y_train

(array([[1.   , 1.   , 1.   , ..., 0.6  , 0.429, 1.   ],
        [0.   , 0.   , 1.   , ..., 0.6  , 0.857, 1.   ],
        [1.   , 1.   , 1.   , ..., 0.4  , 0.429, 1.   ],
        ...,
        [0.   , 0.   , 1.   , ..., 1.   , 0.571, 1.   ],
        [0.   , 0.   , 1.   , ..., 1.   , 0.714, 1.   ],
        [1.   , 1.   , 1.   , ..., 0.8  , 0.714, 1.   ]]),
 array([0, 0, 2, ..., 0, 0, 0]))