## Imports

In [1]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

from raw_data import save_all_files, concat_df

## Load the data

Loads the required datasets from google cloud. Then selects only relevant columns and saves it into /data/_year_.pkl.

In [4]:
save_all_files()

Get a coffee ;) 
Downloading and saving 5 big files will take between 10 and 20 minutes.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You successfully downloaded 0 of the 5 files.


## Concatenate data from different years to one dataframe

In [16]:
df = concat_df()
df

Reading file: 2011.pkl
Reading file: 2012.pkl
Reading file: 2013.pkl
Reading file: 2014.pkl
Reading file: 2015.pkl


Unnamed: 0,DIABETE3,SEX,_AGEG5YR,EDUCA,_BMI5,_BMI5CAT,GENHLTH,PHYSHLTH,_TOTINDA,EXERANY2,...,SMOKDAY2,_RFSMOK3,DRNKANY5,ALCDAY5,AVEDRNK2,DRNK3GE5,_RFBING5,CVDSTRK3,CVDINFR4,CVDCRHD4
0,3.0,2.0,9.0,4.0,1855.0,2.0,4.0,88.0,2.0,2.0,...,1.0,2.0,1.0,103.0,2.0,88.0,1.0,2.0,2.0,2.0
1,3.0,1.0,3.0,3.0,3529.0,4.0,4.0,12.0,2.0,2.0,...,1.0,2.0,9.0,,,,9.0,2.0,2.0,2.0
2,3.0,2.0,14.0,6.0,1695.0,1.0,2.0,88.0,1.0,1.0,...,3.0,1.0,1.0,101.0,1.0,88.0,1.0,2.0,2.0,2.0
3,3.0,2.0,14.0,4.0,2579.0,3.0,3.0,88.0,9.0,,...,,1.0,9.0,,,,9.0,2.0,2.0,2.0
4,4.0,2.0,7.0,3.0,4346.0,4.0,5.0,25.0,1.0,1.0,...,3.0,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380042,1.0,2.0,11.0,2.0,1842.0,1.0,4.0,88.0,2.0,2.0,...,,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0
2380043,3.0,2.0,2.0,5.0,2834.0,3.0,1.0,88.0,1.0,1.0,...,,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0
2380044,3.0,2.0,11.0,4.0,4110.0,4.0,4.0,88.0,9.0,,...,3.0,1.0,1.0,202.0,2.0,88.0,1.0,2.0,2.0,2.0
2380045,3.0,1.0,7.0,5.0,2315.0,2.0,3.0,88.0,2.0,2.0,...,,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0


# MODIFY THIS

## Data preparation

- [ ] feature selection
- [ ] data cleaning

In [26]:
# here the data cleaning and data preparation takes place
target_name = "DIABETE3"
feature_names = df.columns.values.tolist()
feature_names.remove(target_name)

# removed all nan values for decision tree to function
# unproblematic features: subset of feature_names with those removed with a lot of nan values
feature_list = [
    "CVDSTRK3",
    "PHYSHLTH",
    "SEX",
    "GENHLTH",
    "CVDINFR4",
    "CVDCRHD4",
    "ASTHMA3",
    "_TOTINDA",
    "_AGEG5YR",
    "_LTASTH1",
    "_RFBMI5",
    "_RFSMOK3",
    "DRNKANY5",
    "_RFBING5"
]

features = {
    "SEX": "SEX",
    "_AGEG5YR": "age",
    "EDUCA": "education",
    "_BMI5": "bmi",
    "_BMI5CAT": "bmicat",
    "GENHLTH": "estimated_health",
    "PHYSHLTH": "bad_physhealth_days",
    "_TOTINDA": "had_phys_activity",
    "EXERANY2": "EXERANY2",
    "SMOKE100": "smoked_over_100",
    "SMOKDAY2": "smokes_daily",
    "_RFSMOK3": "is_smoker",
    "DRNKANY5": "alcohol_past_30",
    "ALCDAY5": "ALCDAY5",
    "AVEDRNK2": "AVEDRNK2",
    "DRNK3GE5": "DRNK3GE5",
    "_RFBING5": "_RFBING5",
    "CVDSTRK3": "had_stroke",
    "CVDINFR4": "had_heart_attack",
    "CVDCRHD4": "had_angina"
}

# only include rows with diabetes = yes and no and select relevant columns
df_filtered = df.loc[df["DIABETE3"].isin([1, 3]), list(features.keys()) + [target_name]]

# rename label 1 = yes (has diabetes) | 3 = no (no diabetes)
df_filtered["DIABETE3"] = df_filtered["DIABETE3"].map({1: "yes", 3: "no"})

df_filtered

Unnamed: 0,SEX,_AGEG5YR,EDUCA,_BMI5,_BMI5CAT,GENHLTH,PHYSHLTH,_TOTINDA,EXERANY2,SMOKE100,...,_RFSMOK3,DRNKANY5,ALCDAY5,AVEDRNK2,DRNK3GE5,_RFBING5,CVDSTRK3,CVDINFR4,CVDCRHD4,DIABETE3
0,2.0,9.0,4.0,1855.0,2.0,4.0,88.0,2.0,2.0,1.0,...,2.0,1.0,103.0,2.0,88.0,1.0,2.0,2.0,2.0,no
1,1.0,3.0,3.0,3529.0,4.0,4.0,12.0,2.0,2.0,1.0,...,2.0,9.0,,,,9.0,2.0,2.0,2.0,no
2,2.0,14.0,6.0,1695.0,1.0,2.0,88.0,1.0,1.0,1.0,...,1.0,1.0,101.0,1.0,88.0,1.0,2.0,2.0,2.0,no
3,2.0,14.0,4.0,2579.0,3.0,3.0,88.0,9.0,,2.0,...,1.0,9.0,,,,9.0,2.0,2.0,2.0,no
5,1.0,9.0,4.0,2957.0,3.0,2.0,88.0,1.0,1.0,1.0,...,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0,yes
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380042,2.0,11.0,2.0,1842.0,1.0,4.0,88.0,2.0,2.0,2.0,...,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0,yes
2380043,2.0,2.0,5.0,2834.0,3.0,1.0,88.0,1.0,1.0,2.0,...,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0,no
2380044,2.0,11.0,4.0,4110.0,4.0,4.0,88.0,9.0,,1.0,...,1.0,1.0,202.0,2.0,88.0,1.0,2.0,2.0,2.0,no
2380045,1.0,7.0,5.0,2315.0,2.0,3.0,88.0,2.0,2.0,2.0,...,1.0,2.0,888.0,,,1.0,2.0,2.0,2.0,no


### Drop missing and bad input values

In [30]:
for col in df_filtered.columns:
    print("\n")
    print(f"Column {col} has the following values:")
    print(df_filtered[col].value_counts())
    print(f"... and {df_filtered[col].isnull().values.sum()} nan values:")
    print()



Column SEX has the following values:
2.0    1360627
1.0     953002
Name: SEX, dtype: int64
... and 0 nan values:



Column _AGEG5YR has the following values:
9.0     257834
8.0     247012
10.0    237913
7.0     221684
13.0    191833
11.0    189200
6.0     171691
5.0     148460
12.0    143326
4.0     131129
3.0     123806
1.0     121744
2.0     103568
14.0     24429
Name: _AGEG5YR, dtype: int64
... and 0 nan values:



Column EDUCA has the following values:
6.0    815467
4.0    667037
5.0    625532
3.0    128848
2.0     62456
9.0      8167
1.0      3108
Name: EDUCA, dtype: int64
... and 3014 nan values:



Column _BMI5 has the following values:
2663.0    25687
2746.0    20161
2744.0    19765
2441.0    17844
2712.0    15550
          ...  
5980.0        1
5179.0        1
6746.0        1
3733.0        1
5017.0        1
Name: _BMI5, Length: 4930, dtype: int64
... and 141448 nan values:



Column _BMI5CAT has the following values:
3.0    787368
2.0    728535
4.0    618729
1.0     37549
Na

In [33]:
# drop na drops around 79% of the rows...
df = df_filtered.dropna()
df

Unnamed: 0,SEX,_AGEG5YR,EDUCA,_BMI5,_BMI5CAT,GENHLTH,PHYSHLTH,_TOTINDA,EXERANY2,SMOKE100,...,_RFSMOK3,DRNKANY5,ALCDAY5,AVEDRNK2,DRNK3GE5,_RFBING5,CVDSTRK3,CVDINFR4,CVDCRHD4,DIABETE3
0,2.0,9.0,4.0,1855.0,2.0,4.0,88.0,2.0,2.0,1.0,...,2.0,1.0,103.0,2.0,88.0,1.0,2.0,2.0,2.0,no
2,2.0,14.0,6.0,1695.0,1.0,2.0,88.0,1.0,1.0,1.0,...,1.0,1.0,101.0,1.0,88.0,1.0,2.0,2.0,2.0,no
11,2.0,13.0,6.0,2141.0,2.0,2.0,88.0,2.0,2.0,1.0,...,1.0,1.0,107.0,2.0,88.0,1.0,2.0,2.0,2.0,no
31,1.0,8.0,6.0,2585.0,3.0,2.0,2.0,1.0,1.0,1.0,...,1.0,1.0,210.0,2.0,88.0,1.0,2.0,2.0,2.0,no
32,1.0,10.0,4.0,2510.0,3.0,2.0,88.0,1.0,1.0,1.0,...,1.0,1.0,215.0,4.0,88.0,1.0,2.0,2.0,2.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380002,1.0,3.0,3.0,2647.0,3.0,2.0,88.0,1.0,1.0,1.0,...,1.0,1.0,201.0,24.0,1.0,2.0,2.0,2.0,2.0,no
2380022,2.0,11.0,4.0,2780.0,3.0,3.0,88.0,2.0,2.0,1.0,...,1.0,1.0,204.0,2.0,88.0,1.0,2.0,2.0,2.0,no
2380025,1.0,1.0,5.0,3797.0,4.0,1.0,88.0,1.0,1.0,1.0,...,2.0,1.0,203.0,8.0,3.0,2.0,2.0,2.0,2.0,no
2380031,1.0,9.0,2.0,2403.0,2.0,2.0,88.0,2.0,2.0,1.0,...,2.0,1.0,103.0,18.0,3.0,2.0,2.0,2.0,2.0,no


# DON'T TOUCH FROM HERE

## Model training
- [ ] model selection
- [ ] train-test split (maybe before data prep)
- [ ] hyperparameter tuning
- [ ] more stuff

In [7]:
X_train, X_test, y_train, y_test = train_test_split(df_simplified[feature_list], df_simplified[target_name], test_size=0.33, random_state=42)

model = DecisionTreeClassifier()
model = model.fit(X_train, y_train)

KeyError: "['ASTHMA3', '_LTASTH1', '_RFBMI5'] not in index"

In [None]:
y_predict = model.predict(X_test)
score = accuracy_score(y_test, y_predict)
matrix = confusion_matrix(y_test, y_predict)
matrix

array([[ 16713,    148,  82662,    523,     72,     18],
       [   251,     29,   6776,     19,      2,      1],
       [ 27186,   1022, 633435,   1338,    229,     75],
       [  1288,     29,  11775,     65,      8,      1],
       [   125,      3,    806,     17,     10,      1],
       [    36,      1,    219,      6,      3,    135]], dtype=int64)

In [None]:
confusion_matrix?

[1;31mSignature:[0m
[0mconfusion_matrix[0m[1;33m([0m[1;33m
[0m    [0my_true[0m[1;33m,[0m[1;33m
[0m    [0my_pred[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnormalize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute confusion matrix to evaluate the accuracy of a classification.

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` and
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read more in the :ref:`User Guide <confusion_matrix>`.

Parameters
----------
y_tr

## Use Model to predict user's diabetes probability

### Get user input

In [None]:
# please input data: ...

# use data for diabetes proba
user_row = df_simplified.loc[[11], feature_list].reset_index(drop=True) # got a random index just as an example
probas = model.predict_proba(user_row)

label_meanings = (
    "diabetes",
    "diabetes during pregnancy",
    "pre-diabetes or borderline diabetes",
    "don't know",
    "refused",
    "BLANK"
)

for l, p in list(zip(label_meanings, probas.tolist()[0])):
    print(f"With a probability of {round(p*100, 2)}% you (will) have result: {l}.")

With a probability of 5.15% you (will) have result: diabetes.
With a probability of 0.45% you (will) have result: diabetes during pregnancy.
With a probability of 93.06% you (will) have result: pre-diabetes or borderline diabetes.
With a probability of 1.34% you (will) have result: don't know.
With a probability of 0.0% you (will) have result: refused.
With a probability of 0.0% you (will) have result: BLANK.


### Make recommendations: What factors have the most significant impact on the user's diabetes probability?