# dp_dhs_ncd_demo


```
!pip install imblearn
!pip install hyperopt
!pip install pandas_ml
```

## Notes

- Definition of diabetes:
    - Recommendation for diagnosis of diabetes [WHO](https://www.who.int/diabetes/publications/Definition%20and%20diagnosis%20of%20diabetes_new.pdf)
- Hyperglycemia and hypertenison:
    - Hypertension if meets any of the following three criteria:
        - SBP ≥ 140 mmHg or DBP ≥ 90 mmHg
        - Doctor/nurse diagnosed high blood pressure
        - Taking blood pressure-lowering medication
    - Hyperglycemia if meets any of the following three criteria:
        - FBG ≥ 126 mg/dl (7.0 mmol/l)
        - Doctor/nurse diagnosed diabetes
        - Taking diabetes medication
- Key outcome variables for hypertension:
    - sh250: Taking prescribed medicine to lower blood pressure
    - sh249: Told by a doctor to have high blood pressure
    - fsysto: Valid Systolic blood presure
    - fdysto: Valid Dystolic blood pressure
- Key outcome variables for hyperglycemia:
    - sh280: Time respondent ate something
    - sh281: Time respondent drank something
    - sh283d: Date for glucose testing Day
    - sh283m: Date for glucose testing Month
    - sh283y: Date for glucose testing Year
    - sh283t: Time for glucose testing
    - sh284: Glucose testing (mg/dl)
    - sh284a: Plasma blood glucose (mmol/L)
    - sh284g: Blood glucose (mmol/dl)
    - sh284b: Plasma blood glucose (mg/dl)
- TO DO LIST:
    - DONE: Subset based on the diabetes variable
    - DONE: Remove based on the manually selected
    - DONE: Remove uniform variables
    - DONE: Remove identical variables
    - DONE: Remove variables with any missing values (threshold)
    - DONE: Check categorical and continous variables and convert to numerical
    - DONE: Assign if variable is categorical or not in the metadata_file
    - DONE: Create functions that creates outcome variables/datasets for hypertension and hyperglycemia
    - Create a function that summarize variables
    - Perform one-hot encoding of variables
    - Apply XGBoost approach with all variables
    - Make 95% Feature importance threshold approach
    - Re-run prediction
    - Fine-tuning the model

## Packages

In [40]:
import os
import re
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import itertools
import collections
import logging
import numpy as np
import pandas as pd
import pandas_ml as pd_ml
import seaborn as sns
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier

from datetime import timedelta
from datetime import datetime as dt

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Custom functions

In [52]:
def subset_by_var1(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var].notna(),:]
    print(df.shape)
    return df


def subset_by_var2(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var] != -9,:]
    print(df.shape)
    return df


def remove_metadata(input_df, metadata):
    df = input_df.copy(deep=True)
    var_select = [var1 for var1,var2 in zip(metadata['variable'],metadata['model_keep']) if var2 == 0]
    print(df.shape)
    df = df.drop(var_select, axis=1)
    print(df.shape)
    return df 
 
def remove_uniform(input_df):
    df = input_df.copy(deep=True)
    var_list = []
    for key, var in enumerate(df):
        tmp = df[var].dropna()
        if len(tmp.unique()) == 1:
            var_list.append(var)
    print(df.shape)
    print(var_list)
    df = df.drop(var_list, axis=1)
    print(df.shape)
    return df
    
def remove_identical(input_df):
    df = input_df.copy(deep=True)
    var_pairs_check = []
    var_pairs = []
    for pair in itertools.combinations(list(df.columns),2):
        var_pairs.append(pair)
        var_pairs_check.append(all(df[pair[0]] == df[pair[1]]))
    selected_pairs = [pair for pair, check in zip(var_pairs, var_pairs_check) if check]
    selected_var = [var2 for var1,var2 in selected_pairs]
    print(df.shape)
    df = df.drop(list(set(selected_var)), axis=1)
    print(df.shape)
    return df


def remove_missing(input_df, threshold):
    df = input_df.copy(deep=True)
    drop_vars = []
    for key, var in enumerate(df):
        null_percent = round(df[var].isnull().mean() * 100,2)
        if null_percent >= threshold:
            drop_vars.append(var)
    print(df.shape)
    df = df.drop(drop_vars, axis=1)
    print(df.shape)
    return df


def check_variables(input_df):
    df = input_df.copy(deep=True)
    var_types = [var_n for var_t,var_n  in zip(df.dtypes, df.columns) if var_t == 'O']
    return var_types

def summarize_variables(input_df, metadata_df):
    metadata = metadata_df.copy(deep=True)
    metadata['num_values'] = 'NaN'
    for key, var in enumerate(metadata.values):
        metadata.loc[key,'num_values'] = len(input_df[var[0]].unique())
    return metadata


def determine_hypertension(input_df):
    df = input_df.copy(deep=True)
    df['hypertension'] = -9
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh250'] == 1. or df.loc[key,'sh249'] == 1. or df.loc[key,'fsysto'] >= 140. or df.loc[key,'fdysto'] >= 90.):
            df.at[key, 'hypertension'] = 1
        elif (df.loc[key,'sh250'] == 0 or df.loc[key,'sh249'] == 0 or df.loc[key,'fsysto'] < 140 or df.loc[key,'fdysto'] < 90):
            df.at[key, 'hypertension'] = 0
        else:
            df.at[key,'hypertension'] = -9
    return df  

def determine_hyperglycaemia(input_df):
    df = input_df.copy(deep=True)
    df['sh280'] = np.where(df['sh280'].isna(), 9999, df['sh280'])
    df['sh280'] = df['sh280'].astype(int).astype(str)
    df['meal'] = 1
    for key,time in enumerate(df['sh280']):
        try:
            df.at[key,'meal'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'meal'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh281'] = np.where(df['sh281'].isna(), 9999, df['sh281'])
    df['sh281'] = df['sh281'].astype(int).astype(str)
    df['drink'] = 1
    for key,time in enumerate(df['sh281']):
        try:
            df.at[key,'drink'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'drink'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh283t'] = np.where(df['sh283t'].isna(), 9999, df['sh283t'])
    df['sh283t'] = df['sh283t'].astype(int).astype(str)
    df['measure_time'] = 1
    for key,time in enumerate(df['sh283t']):
        try:
            df.at[key,'measure_time'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'measure_time'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    df['meal'] = pd.to_datetime(df['meal'], format='%Y-%m-%d %H:%M:%S')
    df['drink'] = pd.to_datetime(df['drink'], format='%Y-%m-%d %H:%M:%S')
    df['measure_time'] = pd.to_datetime(df['measure_time'], format='%Y-%m-%d %H:%M:%S')
    df['meal_delta'] = df['meal'] - df['measure_time']
    df['meal_delta'] = df['meal_delta']/np.timedelta64(1,'h')
    df['meal_fasting'] = np.where(df['meal_delta'] > 4, 1, 0)
    df['drink_delta'] = df['drink'] - df['measure_time']
    df['drink_delta'] = df['drink_delta']/np.timedelta64(1,'h')
    df['drink_fasting'] = np.where(df['drink_delta'] > 4, 1, 0)
    df['hyperglycaemia'] = -9
    # sh258: Ever told by a doctor/nurse to have diabetes
    # sh259: Taking medication for the diabetes
    # df.loc[key,'drink_fasting'] == 1. and df.loc[key,'meal_fasting'] == 1. and 
    # df.loc[key,'drink_fasting'] == 0. and df.loc[key,'meal_fasting'] == 0. and 
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh284b'] >= 126 or df.loc[key,'sh258'] == 1 or df.loc[key,'sh259'] == 1):
            df.at[key, 'hyperglycaemia'] = 1
        elif (df.loc[key,'sh284b'] < 126 or df.loc[key,'sh258'] == 0 or df.loc[key,'sh259'] == 0):
            df.at[key, 'hyperglycaemia'] = 0
        else:
            df.at[key, 'hyperglycaemia'] = -9
    return df 

## Preprocessing data

In [42]:
data = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/data_dd_dhs_ncd.csv'
metadata = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd.csv'
metadata_m = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd_m.csv'

### Reading data

In [43]:
ncd_data = pd.read_csv(data)
ncd_metadata = pd.read_csv(metadata_m)

### Creating hyperglycaemia target variable

In [44]:
ncd_data = determine_hyperglycaemia(input_df=ncd_data)
print(ncd_data['hyperglycaemia'].value_counts())

-9    74896
 0     6632
 1     2203
Name: hyperglycaemia, dtype: int64


In [45]:
ncd_metadata = summarize_variables(input_df=ncd_data, metadata_df=ncd_metadata)
ncd_metadata.head()
ncd_metadata.to_csv(metadata_m, index=False, index_label=False)

In [46]:
ncd_data.head()

Unnamed: 0.1,Unnamed: 0,hhid,hvidx,hv000,hv001,hv002,hv003,hv004,hv005,hv006,...,shed2,shed3,meal,drink,measure_time,meal_delta,meal_fasting,drink_delta,drink_fasting,hyperglycaemia
0,1,1 3,1,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
1,2,1 3,2,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
2,3,1 3,3,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
3,4,1 3,4,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
4,5,1 3,5,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9


In [47]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


### Subset data by target variable missingness

In [48]:
data = subset_by_var2(input_df=ncd_data, var='hyperglycaemia')
'hyperglycaemia' in data.columns

(83731, 391)
(8835, 391)


True

### Subset data by metadata file (variables to keep column)

In [49]:
data = remove_metadata(input_df=data, metadata=ncd_metadata)
'hyperglycaemia' in data.columns

(8835, 391)
(8835, 314)


True

### Remove uniformed variables (no variation)

In [50]:
data = remove_uniform(input_df=data)
'hyperglycaemia' in data.columns

(8835, 314)
['hv015', 'hv020', 'hv027', 'hv042', 'hv120', 'hv121', 'hv122', 'hv124', 'sh21', 'ha62', 'hb62']
(8835, 303)


True

### Remove identical variables 

In [51]:
#data = remove_identical(input_df=data)
#'hyperglycaemia' in data.columns

### Removing missing by threshold 

In [53]:
data = remove_missing(input_df=data, threshold=1)
'hyperglycaemia' in data.columns

(8835, 303)
(8835, 108)


True

### Checking variables

In [54]:
check_variables(input_df=data)
data.head()

Unnamed: 0.1,Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,...,shed1,shed3,meal,drink,measure_time,meal_delta,meal_fasting,drink_delta,drink_fasting,hyperglycaemia
12,13,6,0,0,6,2,1,10,1,1,...,1.0,1.0,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,0.0,0,0.0,0,1
16,17,6,0,0,6,2,1,10,1,1,...,0.0,0.0,1900-01-01 20:00:00,1900-01-01 20:00:00,1900-01-01 06:08:00,13.866667,1,13.866667,1,0
17,18,6,0,0,6,2,1,10,1,1,...,1.0,1.0,1900-01-01 20:00:00,1900-01-01 20:00:00,1900-01-01 06:12:00,13.8,1,13.8,1,0
26,27,5,1,1,5,5,1,10,1,1,...,1.0,1.0,1900-01-01 21:00:00,1900-01-01 21:00:00,1900-01-01 06:18:00,14.7,1,14.7,1,0
39,40,2,0,0,2,2,0,10,1,1,...,0.0,0.0,1900-01-01 21:00:00,1900-01-01 21:00:00,1900-01-01 06:26:00,14.566667,1,14.566667,1,0


### Removing meta variables

In [55]:
data = data.drop(['Unnamed: 0', 'meal', 'drink', 'measure_time', 'meal_delta', 'meal_fasting', 'drink_delta', 'drink_fasting'],axis=1)
data.head()

Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,hv025,...,sh231,sh234c,sh237,sh239,sh241,shwh,shcmc,shed1,shed3,hyperglycaemia
12,6,0,0,6,2,1,10,1,1,2,...,1.0,35.0,1.0,13.0,3.0,3.0,917.0,1.0,1.0,1
16,6,0,0,6,2,1,10,1,1,2,...,2.0,58.0,0.0,0.0,1.0,0.0,642.0,0.0,0.0,0
17,6,0,0,6,2,1,10,1,1,2,...,1.0,60.0,1.0,13.0,1.0,3.0,614.0,1.0,1.0,0
26,5,1,1,5,5,1,10,1,1,2,...,1.0,36.0,1.0,52.0,1.0,0.0,907.0,1.0,1.0,0
39,2,0,0,2,2,0,10,1,1,2,...,1.0,65.0,0.0,13.0,1.0,3.0,556.0,0.0,0.0,0


## Building the model

## Spliting predictors and target variable

## Splitting data: train, validation, test

In [56]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


In [65]:
y = data['hyperglycaemia']
X = data.drop(['hyperglycaemia'], axis=1)

In [67]:
for var in X.columns:
    print(var)
    print(str(ncd_metadata[ncd_metadata['variable'] == var].values[0][4]))
    if str(ncd_metadata[ncd_metadata['variable'] == var].values[0][4]) == 'categorical':
        X[var] =  X[var].astype('str').astype('category')

hv009
numeric
hv010
numeric
hv011
numeric
hv012
numeric
hv013
numeric
hv014
numeric
hv016
categorical
hv017
numeric
hv024
categorical
hv025
categorical
hv026
categorical
hv035
numeric
hv040
numeric
hv041
numeric
hv201
categorical
hv204
numeric
hv205
categorical
hv206
categorical
hv207
categorical
hv208
categorical
hv209
categorical
hv210
categorical
hv211
categorical
hv213
categorical
hv214
categorical
hv215
categorical
hv216
numeric
hv217
categorical
hv218
categorical
hv219
categorical
hv220
categorical
hv221
categorical
hv226
categorical
hv230a
categorical
hv234a
categorical
hv237
categorical
hv237a
categorical
hv237b
categorical
hv237c
categorical
hv237d
categorical
hv237e
categorical
hv237f
categorical
hv237g
categorical
hv237x
categorical
hv237z
categorical
hv241
categorical
hv243a
categorical
hv244
categorical
hv246
categorical
hv246b
categorical
hv246g
numeric
hv246h
numeric
hv246i
numeric
hv247
categorical
hv252
numeric
hv270
categorical
hv271
numeric
shdistrict
categorical
shu

In [69]:
X = pd.get_dummies(X)
X.head()

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=84, test_size=0.2)

## Fitting the default model

In [72]:
model_hyperglycaemia = XGBClassifier(objective="binary:logistic", max_depth=9, eval_metric='error', min_child_weight=5, subsample=0.7, colsample=0.7)
model_hyperglycaemia.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.7,
       colsample_bylevel=1, colsample_bytree=1, eval_metric='error',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [73]:
y_predicted = model_hyperglycaemia.predict_proba(X_test)
y_predicted = y_predicted[:,1]
y_predicted = np.where(y_predicted > 0.5, 1, 0)
print(type(y_predicted))
print(type(y_test))
cm = pd_ml.ConfusionMatrix(y_pred=y_predicted, y_true=y_test)
print(cm)

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>
Predicted  False  True  __all__
Actual                         
False         26     4       30
True           8     0        8
__all__       34     4       38


In [74]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1327
           1       0.89      0.46      0.61       440

   micro avg       0.85      0.85      0.85      1767
   macro avg       0.87      0.72      0.76      1767
weighted avg       0.86      0.85      0.83      1767



In [75]:
feature_importance = pd.DataFrame.from_dict({'variable': X_train.columns, 'importance': model_hyperglycaemia.feature_importances_})
print(feature_importance.shape)

(622, 2)


In [76]:
feature_importance = feature_importance.merge(ncd_metadata, how='left', on='variable')
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.shape)
feature_importance

(622, 6)


Unnamed: 0,variable,importance,description,model_keep,num_values,type
16,hv271,0.091121,Wealth index factor score (5 decimals),1.0,16353.0,numeric
23,shcmc,0.086465,CMC of birth of member,1.0,611.0,date
8,hv040,0.069172,Cluster altitude in meters,1.0,72.0,numeric
14,hv246i,0.037912,Owns Chickens/ ducks,1.0,60.0,numeric
19,hv108,0.030595,Education completed in single years,1.0,21.0,numeric
18,hv105,0.025607,Age of household members,1.0,98.0,numeric
4,hv013,0.020286,Number of de facto members,1.0,26.0,numeric
3,hv012,0.019621,Number of de jure members,1.0,24.0,numeric
15,hv252,0.018623,Frequency household members smoke inside the h...,1.0,6.0,numeric
20,idxh4,0.016960,Index to Household Schedule,1.0,31.0,numeric
