## Imports

In [2]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import pandas as pd

from raw_data import save_all_files, concat_df

## Load the data

Loads the required datasets from google cloud. Then selects only relevant columns and saves it into /data/_year_.pkl.

In [3]:
save_all_files()

Get a coffee ;) 
Downloading and saving 5 big files will take between 10 and 20 minutes.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You already have this file's corresponding .pkl file. No need to download the .csv file again.
You successfully downloaded 0 of the 5 files.


## Concatenate data from different years to one dataframe

In [4]:
df = concat_df()
df

Reading file: 2011.pkl
Reading file: 2012.pkl
Reading file: 2013.pkl
Reading file: 2014.pkl
Reading file: 2015.pkl


Unnamed: 0,DIABETE3,SMOKE100,PREGNANT,_BMI5,SMOKDAY2,CVDSTRK3,PHYSHLTH,SEX,GENHLTH,CVDINFR4,...,INSULIN,ASATTACK,_TOTINDA,_AGEG5YR,_LTASTH1,_BMI5CAT,_RFBMI5,_RFSMOK3,DRNKANY5,_RFBING5
0,3.0,1.0,,1855.0,1.0,2.0,88.0,2.0,4.0,2.0,...,,,2.0,9.0,1.0,2.0,1.0,2.0,1.0,1.0
1,3.0,1.0,,3529.0,1.0,2.0,12.0,1.0,4.0,2.0,...,,,2.0,3.0,1.0,4.0,2.0,2.0,9.0,9.0
2,3.0,1.0,2.0,1695.0,3.0,2.0,88.0,2.0,2.0,2.0,...,,,1.0,14.0,1.0,1.0,1.0,1.0,1.0,1.0
3,3.0,2.0,2.0,2579.0,,2.0,88.0,2.0,3.0,2.0,...,,,9.0,14.0,1.0,3.0,2.0,1.0,9.0,9.0
4,4.0,1.0,,4346.0,3.0,2.0,25.0,2.0,5.0,2.0,...,,,1.0,7.0,1.0,4.0,2.0,1.0,2.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380042,1.0,2.0,,1842.0,,2.0,88.0,2.0,4.0,2.0,...,,,2.0,11.0,1.0,1.0,1.0,1.0,2.0,1.0
2380043,3.0,2.0,2.0,2834.0,,2.0,88.0,2.0,1.0,2.0,...,,,1.0,2.0,1.0,3.0,2.0,1.0,2.0,1.0
2380044,3.0,1.0,,4110.0,3.0,2.0,88.0,2.0,4.0,2.0,...,,,9.0,11.0,1.0,4.0,2.0,1.0,1.0,1.0
2380045,3.0,2.0,,2315.0,,2.0,88.0,1.0,3.0,2.0,...,,,2.0,7.0,1.0,2.0,1.0,1.0,2.0,1.0


## Data preparation

- [ ] feature selection
- [ ] data cleaning

In [5]:
# here the data cleaning and data preparation takes place
df = df
target_name = "DIABETE3"
feature_names = df.columns.values.tolist()
feature_names.remove(target_name)

# removed all nan values for decision tree to function
# unproblematic features: subset of feature_names with those removed with a lot of nan values
feature_list = [
    "CVDSTRK3",
    "PHYSHLTH",
    "SEX",
    "GENHLTH",
    "CVDINFR4",
    "CVDCRHD4",
    "ASTHMA3",
    "_TOTINDA",
    "_AGEG5YR",
    "_LTASTH1",
    "_RFBMI5",
    "_RFSMOK3",
    "DRNKANY5",
    "_RFBING5"
]

# dataframe with just a few unproblematic columns. Removed all rows with at least one nan-Value
df_simplified = df[feature_list + [target_name]].dropna()
df_simplified

Unnamed: 0,CVDSTRK3,PHYSHLTH,SEX,GENHLTH,CVDINFR4,CVDCRHD4,ASTHMA3,_TOTINDA,_AGEG5YR,_LTASTH1,_RFBMI5,_RFSMOK3,DRNKANY5,_RFBING5,DIABETE3
0,2.0,88.0,2.0,4.0,2.0,2.0,2.0,2.0,9.0,1.0,1.0,2.0,1.0,1.0,3.0
1,2.0,12.0,1.0,4.0,2.0,2.0,2.0,2.0,3.0,1.0,2.0,2.0,9.0,9.0,3.0
2,2.0,88.0,2.0,2.0,2.0,2.0,2.0,1.0,14.0,1.0,1.0,1.0,1.0,1.0,3.0
3,2.0,88.0,2.0,3.0,2.0,2.0,2.0,9.0,14.0,1.0,2.0,1.0,9.0,9.0,3.0
4,2.0,25.0,2.0,5.0,2.0,2.0,2.0,1.0,7.0,1.0,2.0,1.0,2.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2380042,2.0,88.0,2.0,4.0,2.0,2.0,2.0,2.0,11.0,1.0,1.0,1.0,2.0,1.0,1.0
2380043,2.0,88.0,2.0,1.0,2.0,2.0,2.0,1.0,2.0,1.0,2.0,1.0,2.0,1.0,3.0
2380044,2.0,88.0,2.0,4.0,2.0,2.0,2.0,9.0,11.0,1.0,2.0,1.0,1.0,1.0,3.0
2380045,2.0,88.0,1.0,3.0,2.0,2.0,2.0,2.0,7.0,1.0,1.0,1.0,2.0,1.0,3.0


## Model training
- [ ] model selection
- [ ] train-test split (maybe before data prep)
- [ ] hyperparameter tuning
- [ ] more stuff

In [6]:
X_train, X_test, y_train, y_test = train_test_split(df_simplified[feature_list], df_simplified[target_name], test_size=0.33, random_state=42)

model = DecisionTreeClassifier()
model = model.fit(X_train, y_train)

In [7]:
y_predict = model.predict(X_test)
score = accuracy_score(y_test, y_predict)
matrix = confusion_matrix(y_test, y_predict)
matrix

array([[ 16713,    148,  82662,    523,     72,     18],
       [   251,     29,   6776,     19,      2,      1],
       [ 27186,   1022, 633435,   1338,    229,     75],
       [  1288,     29,  11775,     65,      8,      1],
       [   125,      3,    806,     17,     10,      1],
       [    36,      1,    219,      6,      3,    135]], dtype=int64)

In [8]:
confusion_matrix?

[1;31mSignature:[0m
[0mconfusion_matrix[0m[1;33m([0m[1;33m
[0m    [0my_true[0m[1;33m,[0m[1;33m
[0m    [0my_pred[0m[1;33m,[0m[1;33m
[0m    [1;33m*[0m[1;33m,[0m[1;33m
[0m    [0mlabels[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0msample_weight[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m    [0mnormalize[0m[1;33m=[0m[1;32mNone[0m[1;33m,[0m[1;33m
[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m
Compute confusion matrix to evaluate the accuracy of a classification.

By definition a confusion matrix :math:`C` is such that :math:`C_{i, j}`
is equal to the number of observations known to be in group :math:`i` and
predicted to be in group :math:`j`.

Thus in binary classification, the count of true negatives is
:math:`C_{0,0}`, false negatives is :math:`C_{1,0}`, true positives is
:math:`C_{1,1}` and false positives is :math:`C_{0,1}`.

Read more in the :ref:`User Guide <confusion_matrix>`.

Parameters
----------
y_tr

## Use Model to predict user's diabetes probability

### Get user input

In [9]:
# please input data: ...

# use data for diabetes proba
user_row = df_simplified.loc[[11], feature_list].reset_index(drop=True) # got a random index just as an example
probas = model.predict_proba(user_row)

label_meanings = (
    "diabetes",
    "diabetes during pregnancy",
    "pre-diabetes or borderline diabetes",
    "don't know",
    "refused",
    "BLANK"
)

for l, p in list(zip(label_meanings, probas.tolist()[0])):
    print(f"With a probability of {round(p*100, 2)}% you (will) have result: {l}.")

With a probability of 5.15% you (will) have result: diabetes.
With a probability of 0.45% you (will) have result: diabetes during pregnancy.
With a probability of 93.06% you (will) have result: pre-diabetes or borderline diabetes.
With a probability of 1.34% you (will) have result: don't know.
With a probability of 0.0% you (will) have result: refused.
With a probability of 0.0% you (will) have result: BLANK.


### Make recommendations: What factors have the most significant impact on the user's diabetes probability?