# dp_dhs_ncd_hypertension

```
!pip install imblearn
!pip install hyperopt
!pip install pandas_ml
```

## Notes

- Definition of diabetes:
    - Recommendation for diagnosis of hypertension [link](https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4707193/)
- Hypertension if meets any of the following three criteria:
    - SBP ≥ 140 mmHg or DBP ≥ 90 mmHg
    - Doctor/nurse diagnosed high blood pressure
    - Taking blood pressure-lowering medication

## Packages

In [1]:
import os
import re
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import itertools
import collections
import logging
import numpy as np
import pandas as pd
import pandas_ml as pd_ml
import seaborn as sns
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier

from datetime import timedelta
from datetime import datetime as dt

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Custom functions

In [17]:
def subset_by_var1(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var].notna(),:]
    print(df.shape)
    return df


def subset_by_var2(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var] != -9,:]
    print(df.shape)
    return df


def remove_metadata(input_df, metadata):
    df = input_df.copy(deep=True)
    var_select = [var1 for var1,var2 in zip(metadata['variable'],metadata['model_keep']) if var2 == 0]
    print(df.shape)
    df = df.drop(var_select, axis=1)
    print(df.shape)
    return df 
 
def remove_uniform(input_df):
    df = input_df.copy(deep=True)
    var_list = []
    for key, var in enumerate(df):
        tmp = df[var].dropna()
        if len(tmp.unique()) == 1:
            var_list.append(var)
    print(df.shape)
    print(var_list)
    df = df.drop(var_list, axis=1)
    print(df.shape)
    return df
    
def remove_identical(input_df):
    df = input_df.copy(deep=True)
    var_pairs_check = []
    var_pairs = []
    for pair in itertools.combinations(list(df.columns),2):
        var_pairs.append(pair)
        var_pairs_check.append(all(df[pair[0]] == df[pair[1]]))
    selected_pairs = [pair for pair, check in zip(var_pairs, var_pairs_check) if check]
    selected_var = [var2 for var1,var2 in selected_pairs]
    print(df.shape)
    df = df.drop(list(set(selected_var)), axis=1)
    print(df.shape)
    return df


def remove_missing(input_df, threshold):
    df = input_df.copy(deep=True)
    drop_vars = []
    for key, var in enumerate(df):
        null_percent = round(df[var].isnull().mean() * 100,2)
        if null_percent >= threshold:
            drop_vars.append(var)
    print(df.shape)
    df = df.drop(drop_vars, axis=1)
    print(df.shape)
    return df


def check_variables(input_df):
    df = input_df.copy(deep=True)
    var_types = [var_n for var_t,var_n  in zip(df.dtypes, df.columns) if var_t == 'O']
    return var_types

def summarize_variables(input_df, metadata_df):
    metadata = metadata_df.copy(deep=True)
    metadata['num_values'] = 'NaN'
    for key, var in enumerate(metadata.values):
        metadata.loc[key,'num_values'] = len(input_df[var[0]].unique())
    return metadata


def determine_hypertension(input_df):
    df = input_df.copy(deep=True)
    df['hypertension'] = -9
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh250'] == 1. or df.loc[key,'sh249'] == 1. or df.loc[key,'fsysto'] >= 140. or df.loc[key,'fdysto'] >= 90.):
            df.at[key, 'hypertension'] = 1
        elif (df.loc[key,'sh250'] == 0 or df.loc[key,'sh249'] == 0 or df.loc[key,'fsysto'] < 140 or df.loc[key,'fdysto'] < 90):
            df.at[key, 'hypertension'] = 0
        else:
            df.at[key,'hypertension'] = -9
    return df  

def determine_hyperglycaemia(input_df):
    df = input_df.copy(deep=True)
    df['sh280'] = np.where(df['sh280'].isna(), 9999, df['sh280'])
    df['sh280'] = df['sh280'].astype(int).astype(str)
    df['meal'] = 1
    for key,time in enumerate(df['sh280']):
        try:
            df.at[key,'meal'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'meal'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh281'] = np.where(df['sh281'].isna(), 9999, df['sh281'])
    df['sh281'] = df['sh281'].astype(int).astype(str)
    df['drink'] = 1
    for key,time in enumerate(df['sh281']):
        try:
            df.at[key,'drink'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'drink'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh283t'] = np.where(df['sh283t'].isna(), 9999, df['sh283t'])
    df['sh283t'] = df['sh283t'].astype(int).astype(str)
    df['measure_time'] = 1
    for key,time in enumerate(df['sh283t']):
        try:
            df.at[key,'measure_time'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'measure_time'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    df['meal'] = pd.to_datetime(df['meal'], format='%Y-%m-%d %H:%M:%S')
    df['drink'] = pd.to_datetime(df['drink'], format='%Y-%m-%d %H:%M:%S')
    df['measure_time'] = pd.to_datetime(df['measure_time'], format='%Y-%m-%d %H:%M:%S')
    df['meal_delta'] = df['meal'] - df['measure_time']
    df['meal_delta'] = df['meal_delta']/np.timedelta64(1,'h')
    df['meal_fasting'] = np.where(df['meal_delta'] > 4, 1, 0)
    df['drink_delta'] = df['drink'] - df['measure_time']
    df['drink_delta'] = df['drink_delta']/np.timedelta64(1,'h')
    df['drink_fasting'] = np.where(df['drink_delta'] > 4, 1, 0)
    df['hyperglycaemia'] = -9
    # sh258: Ever told by a doctor/nurse to have diabetes
    # sh259: Taking medication for the diabetes
    # df.loc[key,'drink_fasting'] == 1. and df.loc[key,'meal_fasting'] == 1. and 
    # df.loc[key,'drink_fasting'] == 0. and df.loc[key,'meal_fasting'] == 0. and 
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh284b'] >= 126 or df.loc[key,'sh258'] == 1 or df.loc[key,'sh259'] == 1):
            df.at[key, 'hyperglycaemia'] = 1
        elif (df.loc[key,'sh284b'] < 126 or df.loc[key,'sh258'] == 0 or df.loc[key,'sh259'] == 0):
            df.at[key, 'hyperglycaemia'] = 0
        else:
            df.at[key, 'hyperglycaemia'] = -9
    return df 


def to_categorical(input_df, input_meta):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if str(input_meta[input_meta['variable'] == var].values[0][4]) == 'categorical':
            df[var] =  df[var].astype('str').astype('category')
    return df

def threshold_cut(input_df, threshold):
    df = input_df.copy(deep=True)
    df = df.sort_values(by='importance', ascending=False)
    df['cumsum'] = np.cumsum(df['importance'])
    df = df[df['cumsum'] >= threshold]
    return df

## Preprocessing data

In [18]:
data = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/data_dd_dhs_ncd.csv'
metadata = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd.csv'
metadata_m = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd_m.csv'

### Reading data

In [19]:
ncd_data = pd.read_csv(data)
ncd_metadata = pd.read_csv(metadata_m)

### Creating hyperglycaemia target variable

In [20]:
ncd_data = determine_hypertension(input_df=ncd_data)
print(ncd_data['hypertension'].value_counts())

-9    75739
 0     5561
 1     2431
Name: hypertension, dtype: int64


In [21]:
ncd_metadata = summarize_variables(input_df=ncd_data, metadata_df=ncd_metadata)
ncd_metadata.head()
ncd_metadata.to_csv(metadata_m, index=False, index_label=False)

In [22]:
ncd_data.head()

Unnamed: 0.1,Unnamed: 0,hhid,hvidx,hv000,hv001,hv002,hv003,hv004,hv005,hv006,...,shbm,shri,shcmc,shflg,shsmk,shresi,shed1,shed2,shed3,hypertension
0,1,1 3,1,BD6,1,3,2,1,504672,8,...,,,,,,,,,,-9
1,2,1 3,2,BD6,1,3,2,1,504672,8,...,,,,,,,,,,-9
2,3,1 3,3,BD6,1,3,2,1,504672,8,...,,,,,,,,,,-9
3,4,1 3,4,BD6,1,3,2,1,504672,8,...,,,,,,,,,,-9
4,5,1 3,5,BD6,1,3,2,1,504672,8,...,,,,,,,,,,-9


In [23]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


### Subset data by target variable missingness

In [24]:
data = subset_by_var2(input_df=ncd_data, var='hypertension')
'hypertension' in data.columns

(83731, 384)
(7992, 384)


True

### Subset data by metadata file (variables to keep column)

In [25]:
data = remove_metadata(input_df=data, metadata=ncd_metadata)
'hypertension' in data.columns

(7992, 384)
(7992, 307)


True

### Remove uniformed variables (no variation)

In [26]:
data = remove_uniform(input_df=data)
'hypertension' in data.columns

(7992, 307)
['hv015', 'hv020', 'hv027', 'hv042', 'hv120', 'hv121', 'hv122', 'hv124', 'sh21', 'ha62', 'hb62']
(7992, 296)


True

### Remove identical variables 

In [27]:
#data = remove_identical(input_df=data)
#'hyperglycaemia' in data.columns

### Removing missing by threshold 

In [28]:
data = remove_missing(input_df=data, threshold=1)
'hypertension' in data.columns

(7992, 296)
(7992, 102)


True

### Checking variables

In [29]:
check_variables(input_df=data)
data.head()

Unnamed: 0.1,Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,...,sh234c,sh237,sh239,sh241,sh257,shwh,shcmc,shed1,shed3,hypertension
16,17,6,0,0,6,2,1,10,1,1,...,58.0,0.0,0.0,1.0,1.0,0.0,642.0,0.0,0.0,0
17,18,6,0,0,6,2,1,10,1,1,...,60.0,1.0,13.0,1.0,1.0,3.0,614.0,1.0,1.0,1
26,27,5,1,1,5,5,1,10,1,1,...,36.0,1.0,52.0,1.0,1.0,0.0,907.0,1.0,1.0,0
39,40,2,0,0,2,2,0,10,1,1,...,65.0,0.0,13.0,1.0,1.0,3.0,556.0,0.0,0.0,1
40,41,2,0,0,2,2,0,10,1,1,...,55.0,0.0,0.0,1.0,1.0,0.0,669.0,0.0,0.0,0


### Removing meta variables

In [30]:
data = data.drop(['Unnamed: 0'],axis=1)
data.head()

Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,hv025,...,sh234c,sh237,sh239,sh241,sh257,shwh,shcmc,shed1,shed3,hypertension
16,6,0,0,6,2,1,10,1,1,2,...,58.0,0.0,0.0,1.0,1.0,0.0,642.0,0.0,0.0,0
17,6,0,0,6,2,1,10,1,1,2,...,60.0,1.0,13.0,1.0,1.0,3.0,614.0,1.0,1.0,1
26,5,1,1,5,5,1,10,1,1,2,...,36.0,1.0,52.0,1.0,1.0,0.0,907.0,1.0,1.0,0
39,2,0,0,2,2,0,10,1,1,2,...,65.0,0.0,13.0,1.0,1.0,3.0,556.0,0.0,0.0,1
40,2,0,0,2,2,0,10,1,1,2,...,55.0,0.0,0.0,1.0,1.0,0.0,669.0,0.0,0.0,0


## Building the model

## Spliting predictors and target variable

## Splitting data: train, validation, test

In [31]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


In [32]:
y = data['hypertension']
X = data.drop(['hypertension'], axis=1)
X = to_categorical(input_df=X, input_meta=ncd_metadata)
X = pd.get_dummies(X)
X.head()

Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv017,hv035,hv040,hv041,...,shed1_0.0,shed1_1.0,shed1_2.0,shed1_3.0,shed3_0.0,shed3_1.0,shed3_2.0,shed3_3.0,shed3_4.0,shed3_5.0
16,6,0,0,6,2,1,1,1,10,4,...,1,0,0,0,1,0,0,0,0,0
17,6,0,0,6,2,1,1,1,10,4,...,0,1,0,0,0,1,0,0,0,0
26,5,1,1,5,5,1,1,1,10,2,...,0,1,0,0,0,1,0,0,0,0
39,2,0,0,2,2,0,1,0,10,2,...,1,0,0,0,1,0,0,0,0,0
40,2,0,0,2,2,0,1,0,10,2,...,1,0,0,0,1,0,0,0,0,0


In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=84, test_size=0.2)

## Fitting the default model

In [34]:
model_hypertension = XGBClassifier(objective="binary:logistic", max_depth=9, eval_metric='error', min_child_weight=5, subsample=0.7, colsample=0.7)
model_hypertension.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.7,
       colsample_bylevel=1, colsample_bytree=1, eval_metric='error',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [36]:
from sklearn.metrics import confusion_matrix
y_predicted = model_hypertension.predict_proba(X_test)
y_predicted = y_predicted[:,1]
y_predicted = np.where(y_predicted > 0.5, 1, 0)
cm1= pd_ml.ConfusionMatrix(y_pred=y_predicted, y_true=y_test,)
cm2 = confusion_matrix(y_pred=y_predicted, y_true=y_test,)
print(cm1)
print(cm2)

Predicted  False  True  __all__
Actual                         
False         18     4       22
True           6     3        9
__all__       24     7       31
[[974 133]
 [315 177]]


In [37]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.76      0.88      0.81      1107
           1       0.57      0.36      0.44       492

   micro avg       0.72      0.72      0.72      1599
   macro avg       0.66      0.62      0.63      1599
weighted avg       0.70      0.72      0.70      1599



In [39]:
feature_importance = pd.DataFrame.from_dict({'variable': X_train.columns, 'importance': model_hypertension.feature_importances_})
print(feature_importance.shape)

(624, 2)


In [40]:
feature_importance = feature_importance.merge(ncd_metadata, how='left', on='variable')
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.shape)
feature_importance

(624, 6)


Unnamed: 0,variable,importance,description,model_keep,num_values,type
16,hv271,0.100930,Wealth index factor score (5 decimals),1.0,16353.0,numeric
23,shcmc,0.082603,CMC of birth of member,1.0,611.0,date
8,hv040,0.079681,Cluster altitude in meters,1.0,72.0,numeric
14,hv246i,0.039841,Owns Chickens/ ducks,1.0,60.0,numeric
18,hv105,0.029748,Age of household members,1.0,98.0,numeric
19,hv108,0.026826,Education completed in single years,1.0,21.0,numeric
0,hv009,0.020983,Number of household members,1.0,25.0,numeric
4,hv013,0.018858,Number of de facto members,1.0,26.0,numeric
3,hv012,0.018592,Number of de jure members,1.0,24.0,numeric
11,hv216,0.016202,Number of rooms used for sleeping,1.0,12.0,numeric


In [41]:
feature_importance.to_csv("/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/feature_importnace_dhs_hypertension.csv")

### Interesting aspect of variable owning chicken/duck versus hyperglycemia

In [43]:
print(data[['hv246i','hypertension']].groupby(by=['hypertension']).mean())
print(data[['hv246i','hypertension']].groupby(by=['hypertension']).std())
print(data[['hv246i','hypertension']].groupby(by=['hypertension']).max())
print(data[['hv246i','hypertension']].groupby(by=['hypertension']).min())

                hv246i
hypertension          
0             5.141341
1             4.462773
                hv246i
hypertension          
0             7.698674
1             7.633756
              hv246i
hypertension        
0                 95
1                 95
              hv246i
hypertension        
0                  0
1                  0


## Building a model on a subset of the most informative variables

In [44]:
print(feature_importance.shape)
subset = threshold_cut(input_df=feature_importance, threshold=0.95)
print(subset.shape)

(624, 6)
(442, 7)


In [45]:
print(X.shape)
X = X[subset['variable'].unique()]
print(X.shape)


(7992, 624)
(7992, 442)


In [46]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=84, test_size=0.2)

## Fitting the default model
model_hyperglycaemia = XGBClassifier(objective="binary:logistic", max_depth=9, eval_metric='error', min_child_weight=5, subsample=0.7, colsample=0.7)
model_hyperglycaemia.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.7,
       colsample_bylevel=1, colsample_bytree=1, eval_metric='error',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [47]:
from sklearn.metrics import confusion_matrix
y_predicted = model_hyperglycaemia.predict_proba(X_test)
y_predicted = y_predicted[:,1]
y_predicted = np.where(y_predicted > 0.5, 1, 0)
cm1= pd_ml.ConfusionMatrix(y_pred=y_predicted, y_true=y_test,)
cm2 = confusion_matrix(y_pred=y_predicted, y_true=y_test,)
print(cm1)
print(cm2)

Predicted  False  True  __all__
Actual                         
False         20     2       22
True           8     1        9
__all__       28     3       31
[[990 117]
 [365 127]]


In [48]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.73      0.89      0.80      1107
           1       0.52      0.26      0.35       492

   micro avg       0.70      0.70      0.70      1599
   macro avg       0.63      0.58      0.57      1599
weighted avg       0.67      0.70      0.66      1599



In [49]:
feature_importance = pd.DataFrame.from_dict({'variable': X_train.columns, 'importance': model_hyperglycaemia.feature_importances_})
print(feature_importance.shape)

(442, 2)


In [50]:
feature_importance = feature_importance.merge(ncd_metadata, how='left', on='variable')
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.shape)
feature_importance

(442, 6)


Unnamed: 0,variable,importance,description,model_keep,num_values,type
388,hv244_1,0.047406,,,,
348,sh230,0.046659,Index to household schedule,1.0,26.0,numeric
343,shed3_0.0,0.029862,,,,
423,hv210_1,0.029115,,,,
390,hv246_1,0.028369,,,,
362,hv208_1,0.027249,,,,
95,hv101_1,0.024263,,,,
301,sh110g_1,0.023143,,,,
263,hv247_1,0.023143,,,,
307,hv025_2,0.022770,,,,
