# dp_dhs_ncd_diabetes

```
!pip install imblearn
!pip install hyperopt
!pip install pandas_ml
```

## Notes

- Definition of diabetes:
    - Recommendation for diagnosis of diabetes [WHO](https://www.who.int/diabetes/publications/Definition%20and%20diagnosis%20of%20diabetes_new.pdf)
- Hyperglycemia if meets any of the following three criteria:
    - FBG ≥ 126 mg/dl (7.0 mmol/l)
    - Doctor/nurse diagnosed diabetes
    - Taking diabetes medication

## Packages

In [34]:
import os
import re
os.environ['KMP_DUPLICATE_LIB_OK']='True'
import itertools
import collections
import logging
import numpy as np
import pandas as pd
import pandas_ml as pd_ml
import seaborn as sns
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import RandomOverSampler 
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegressionCV
from xgboost import XGBClassifier

from datetime import timedelta
from datetime import datetime as dt

import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import mean_squared_error

from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

## Custom functions

In [149]:
def subset_by_var1(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var].notna(),:]
    print(df.shape)
    return df


def subset_by_var2(input_df, var):
    df = input_df.copy(deep=True)
    print(df.shape)
    df = df.loc[df[var] != -9,:]
    print(df.shape)
    return df


def remove_metadata(input_df, metadata):
    df = input_df.copy(deep=True)
    var_select = [var1 for var1,var2 in zip(metadata['variable'],metadata['model_keep']) if var2 == 0]
    print(df.shape)
    df = df.drop(var_select, axis=1)
    print(df.shape)
    return df 
 
def remove_uniform(input_df):
    df = input_df.copy(deep=True)
    var_list = []
    for key, var in enumerate(df):
        tmp = df[var].dropna()
        if len(tmp.unique()) == 1:
            var_list.append(var)
    print(df.shape)
    print(var_list)
    df = df.drop(var_list, axis=1)
    print(df.shape)
    return df
    
def remove_identical(input_df):
    df = input_df.copy(deep=True)
    var_pairs_check = []
    var_pairs = []
    for pair in itertools.combinations(list(df.columns),2):
        var_pairs.append(pair)
        var_pairs_check.append(all(df[pair[0]] == df[pair[1]]))
    selected_pairs = [pair for pair, check in zip(var_pairs, var_pairs_check) if check]
    selected_var = [var2 for var1,var2 in selected_pairs]
    print(df.shape)
    df = df.drop(list(set(selected_var)), axis=1)
    print(df.shape)
    return df


def remove_missing(input_df, threshold):
    df = input_df.copy(deep=True)
    drop_vars = []
    for key, var in enumerate(df):
        null_percent = round(df[var].isnull().mean() * 100,2)
        if null_percent >= threshold:
            drop_vars.append(var)
    print(df.shape)
    df = df.drop(drop_vars, axis=1)
    print(df.shape)
    return df


def check_variables(input_df):
    df = input_df.copy(deep=True)
    var_types = [var_n for var_t,var_n  in zip(df.dtypes, df.columns) if var_t == 'O']
    return var_types

def summarize_variables(input_df, metadata_df):
    metadata = metadata_df.copy(deep=True)
    metadata['num_values'] = 'NaN'
    for key, var in enumerate(metadata.values):
        metadata.loc[key,'num_values'] = len(input_df[var[0]].unique())
    return metadata


def determine_hypertension(input_df):
    df = input_df.copy(deep=True)
    df['hypertension'] = -9
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh250'] == 1. or df.loc[key,'sh249'] == 1. or df.loc[key,'fsysto'] >= 140. or df.loc[key,'fdysto'] >= 90.):
            df.at[key, 'hypertension'] = 1
        elif (df.loc[key,'sh250'] == 0 or df.loc[key,'sh249'] == 0 or df.loc[key,'fsysto'] < 140 or df.loc[key,'fdysto'] < 90):
            df.at[key, 'hypertension'] = 0
        else:
            df.at[key,'hypertension'] = -9
    return df  

def determine_hyperglycaemia(input_df):
    df = input_df.copy(deep=True)
    df['sh280'] = np.where(df['sh280'].isna(), 9999, df['sh280'])
    df['sh280'] = df['sh280'].astype(int).astype(str)
    df['meal'] = 1
    for key,time in enumerate(df['sh280']):
        try:
            df.at[key,'meal'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'meal'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh281'] = np.where(df['sh281'].isna(), 9999, df['sh281'])
    df['sh281'] = df['sh281'].astype(int).astype(str)
    df['drink'] = 1
    for key,time in enumerate(df['sh281']):
        try:
            df.at[key,'drink'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'drink'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    
    df['sh283t'] = np.where(df['sh283t'].isna(), 9999, df['sh283t'])
    df['sh283t'] = df['sh283t'].astype(int).astype(str)
    df['measure_time'] = 1
    for key,time in enumerate(df['sh283t']):
        try:
            df.at[key,'measure_time'] = dt.strptime(time,'%H%M')
        except ValueError:
            df.at[key,'measure_time'] = dt.strptime('2020-01-01:00:00','%Y-%m-%d:%H:%M')
    df['meal'] = pd.to_datetime(df['meal'], format='%Y-%m-%d %H:%M:%S')
    df['drink'] = pd.to_datetime(df['drink'], format='%Y-%m-%d %H:%M:%S')
    df['measure_time'] = pd.to_datetime(df['measure_time'], format='%Y-%m-%d %H:%M:%S')
    df['meal_delta'] = df['meal'] - df['measure_time']
    df['meal_delta'] = df['meal_delta']/np.timedelta64(1,'h')
    df['meal_fasting'] = np.where(df['meal_delta'] > 4, 1, 0)
    df['drink_delta'] = df['drink'] - df['measure_time']
    df['drink_delta'] = df['drink_delta']/np.timedelta64(1,'h')
    df['drink_fasting'] = np.where(df['drink_delta'] > 4, 1, 0)
    df['hyperglycaemia'] = -9
    # sh258: Ever told by a doctor/nurse to have diabetes
    # sh259: Taking medication for the diabetes
    # df.loc[key,'drink_fasting'] == 1. and df.loc[key,'meal_fasting'] == 1. and 
    # df.loc[key,'drink_fasting'] == 0. and df.loc[key,'meal_fasting'] == 0. and 
    for key, value in enumerate(df.values):
        if (df.loc[key,'sh284b'] >= 126 or df.loc[key,'sh258'] == 1 or df.loc[key,'sh259'] == 1):
            df.at[key, 'hyperglycaemia'] = 1
        elif (df.loc[key,'sh284b'] < 126 or df.loc[key,'sh258'] == 0 or df.loc[key,'sh259'] == 0):
            df.at[key, 'hyperglycaemia'] = 0
        else:
            df.at[key, 'hyperglycaemia'] = -9
    return df 


def to_categorical(input_df, input_meta):
    df = input_df.copy(deep=True)
    for var in df.columns:
        if str(input_meta[input_meta['variable'] == var].values[0][4]) == 'categorical':
            df[var] =  df[var].astype('str').astype('category')
    return df

def threshold_cut(input_df, threshold):
    df = input_df.copy(deep=True)
    df = df.sort_values(by='importance', ascending=False)
    df['cumsum'] = np.cumsum(df['importance'])
    df = df[df['cumsum'] >= threshold]
    return df

def get_svrs_metadata(path, pattern):
    out_files = []
    for path, subdirs, files in os.walk(path):
        out_files = out_files + [os.path.join(path,file) for file in files if bool(re.search(pattern=pattern,string=file))]
    return out_files

def search_pattern(input_df, var, pattern):
    df = input_df.copy(deep=True)
    subset = [bool(re.search(pattern=pattern, string=var)) for var in df[var]]
    return df[subset]

## Preprocessing data

In [171]:
data = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/data_dd_dhs_ncd.csv'
metadata = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd.csv'
metadata_m = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/ncd_data/metadata_dd_dhs_ncd_m.csv'
svrs_metadata = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/bbs/data/'
svrs2017 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/data/bbs/svrs/SVRS_17/metadata_bbs_SRVS_17.csv'
svrs2017 = '/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/data/bbs/svrs/SVRS_17/tafsl-2h.dta'

## Checking DHS data

In [172]:
ncd_data = pd.read_csv(data)
ncd_metadata = pd.read_csv(metadata_m)
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


- Education (completed in years):
    1. DHS: **hv108** - Education completed in single years
    2. SVRS2017: **q_16** - Level of Education
- Age:
    1. DHS: **hv105** - Age of household members
    2. SVRS2017: **q_10** - Age
- Number of household members:
    1. DHS: **hv009** - Number of household members
    2. SVRS2017:  
- Urban/rural
    1. DHS: 
    2. SVRS2017:
- Zila
    1. DHS:
    2. SVRS2017:

In [173]:
print(ncd_data['hv009'].median())
search_pattern(input_df=ncd_metadata, var='description', pattern='Toilet|toilet')

5.0


Unnamed: 0,variable,description,model_keep,num_values,type
42,hv205,Type of toilet facility,1,13,categorical
59,hv225,Share toilet with other households,1,3,categorical
89,hv238,Number of households sharing toilet,1,12,categorical


## Checking SVRS data

In [174]:
itr = pd.read_stata(svrs2017, iterator=True)
data = pd.read_stata(svrs2017)
print(itr.variable_labels())
print(data['q_6'].value_counts())
data.head()

{'psu_no': ' Primary Sampling Unit', 'zila': 'Zila Code', 'upza': 'Upazila Code', 'union': 'Union Code', 'mauza': 'Mauza Code', 'rmo': 'Rural Urban Code', 'hh_no': 'Household Number', 'q1_1n': 'Number of Building', 'q1_1a': 'Area of Building', 'q1_2n': 'Number of Semi Pucca', 'q1_2a': 'Area of Semi Pucca', 'q1_3n': 'Number of CIS/Wooden', 'q1_3a': 'Area of CIS/Wooden', 'q1_4n': 'Number of Mud', 'q1_4a': 'Area of Mud', 'q1_5n': 'Number of Bamboo', 'q1_5a': 'Area of Bamboo', 'q1_6n': 'Number of Others', 'q1_6a': 'Area of Others', 'q2_1': 'Sources of Drinking Water', 'q2_2': 'Sources of Other Uses', 'q_3': 'Ownership of Sources of Water', 'q_4': ' Sources of light', 'q_5': ' Sources of Fuel', 'q_6': ' Toilet Facilities', 'q_7': ' Level of Economic Solvency', 'dwater': 'RECODE of q2_1', 'divn': 'RECODE of psu_no', 'resi': 'RECODE of rmo', 'q21': 'RECODE of q2_1', 'q4': 'RECODE of q_4', 'q5': 'RECODE of q_5', 'q6': 'RECODE of q_6', 'q7': 'RECODE of q_7'}
1.0    126437
2.0    100272
3.0     

Unnamed: 0,psu_no,zila,upza,union,mauza,rmo,hh_no,q1_1n,q1_1a,q1_2n,...,q_6,q_7,dwater,divn,resi,q21,q4,q5,q6,q7
0,1.0,4.0,9.0,7.0,250.0,2.0,1.0,0.0,0.0,3.0,...,1.0,4.0,Tubewell,Barisal,Urban,Tubewell,Electricity,Jute/Wood/bamboo,Sanitary with water Shield,Economic solvent
1,1.0,4.0,9.0,7.0,250.0,2.0,2.0,0.0,0.0,0.0,...,1.0,3.0,Tubewell,Barisal,Urban,Tubewell,Electricity,Others,Sanitary with water Shield,Balanced inc/exp
2,1.0,4.0,9.0,7.0,250.0,2.0,3.0,0.0,0.0,0.0,...,1.0,3.0,Tubewell,Barisal,Urban,Tubewell,Electricity,Straw leaf,Sanitary with water Shield,Balanced inc/exp
3,1.0,4.0,9.0,7.0,250.0,2.0,4.0,0.0,0.0,2.0,...,1.0,3.0,Tubewell,Barisal,Urban,Tubewell,Electricity,Others,Sanitary with water Shield,Balanced inc/exp
4,1.0,4.0,9.0,7.0,250.0,2.0,5.0,0.0,0.0,2.0,...,9.0,3.0,Tubewell,Barisal,Urban,Tubewell,Electricity,Jute/Wood/bamboo,Open,Balanced inc/exp


### Reading data

In [4]:
ncd_data = pd.read_csv(data)
ncd_metadata = pd.read_csv(metadata_m)

### Creating hyperglycaemia target variable

In [5]:
ncd_data = determine_hyperglycaemia(input_df=ncd_data)
print(ncd_data['hyperglycaemia'].value_counts())

-9    74896
 0     6632
 1     2203
Name: hyperglycaemia, dtype: int64


In [6]:
ncd_metadata = summarize_variables(input_df=ncd_data, metadata_df=ncd_metadata)
ncd_metadata.head()
ncd_metadata.to_csv(metadata_m, index=False, index_label=False)

In [7]:
ncd_data.head()

Unnamed: 0.1,Unnamed: 0,hhid,hvidx,hv000,hv001,hv002,hv003,hv004,hv005,hv006,...,shed2,shed3,meal,drink,measure_time,meal_delta,meal_fasting,drink_delta,drink_fasting,hyperglycaemia
0,1,1 3,1,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
1,2,1 3,2,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
2,3,1 3,3,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
3,4,1 3,4,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9
4,5,1 3,5,BD6,1,3,2,1,504672,8,...,,,2020-01-01,2020-01-01,2020-01-01,0.0,0,0.0,0,-9


In [8]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


### Subset data by target variable missingness

In [9]:
data = subset_by_var2(input_df=ncd_data, var='hyperglycaemia')
'hyperglycaemia' in data.columns

(83731, 391)
(8835, 391)


True

### Subset data by metadata file (variables to keep column)

In [10]:
data = remove_metadata(input_df=data, metadata=ncd_metadata)
'hyperglycaemia' in data.columns

(8835, 391)
(8835, 314)


True

### Remove uniformed variables (no variation)

In [11]:
data = remove_uniform(input_df=data)
'hyperglycaemia' in data.columns

(8835, 314)
['hv015', 'hv020', 'hv027', 'hv042', 'hv120', 'hv121', 'hv122', 'hv124', 'sh21', 'ha62', 'hb62']
(8835, 303)


True

### Remove identical variables 

In [12]:
#data = remove_identical(input_df=data)
#'hyperglycaemia' in data.columns

### Removing missing by threshold 

In [13]:
data = remove_missing(input_df=data, threshold=1)
'hyperglycaemia' in data.columns

(8835, 303)
(8835, 108)


True

### Checking variables

In [14]:
check_variables(input_df=data)
data.head()

Unnamed: 0.1,Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,...,shed1,shed3,meal,drink,measure_time,meal_delta,meal_fasting,drink_delta,drink_fasting,hyperglycaemia
12,13,6,0,0,6,2,1,10,1,1,...,1.0,1.0,2020-01-01 00:00:00,2020-01-01 00:00:00,2020-01-01 00:00:00,0.0,0,0.0,0,1
16,17,6,0,0,6,2,1,10,1,1,...,0.0,0.0,1900-01-01 20:00:00,1900-01-01 20:00:00,1900-01-01 06:08:00,13.866667,1,13.866667,1,0
17,18,6,0,0,6,2,1,10,1,1,...,1.0,1.0,1900-01-01 20:00:00,1900-01-01 20:00:00,1900-01-01 06:12:00,13.8,1,13.8,1,0
26,27,5,1,1,5,5,1,10,1,1,...,1.0,1.0,1900-01-01 21:00:00,1900-01-01 21:00:00,1900-01-01 06:18:00,14.7,1,14.7,1,0
39,40,2,0,0,2,2,0,10,1,1,...,0.0,0.0,1900-01-01 21:00:00,1900-01-01 21:00:00,1900-01-01 06:26:00,14.566667,1,14.566667,1,0


### Removing meta variables

In [15]:
data = data.drop(['Unnamed: 0', 'meal', 'drink', 'measure_time', 'meal_delta', 'meal_fasting', 'drink_delta', 'drink_fasting'],axis=1)
data.head()

Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv016,hv017,hv024,hv025,...,sh231,sh234c,sh237,sh239,sh241,shwh,shcmc,shed1,shed3,hyperglycaemia
12,6,0,0,6,2,1,10,1,1,2,...,1.0,35.0,1.0,13.0,3.0,3.0,917.0,1.0,1.0,1
16,6,0,0,6,2,1,10,1,1,2,...,2.0,58.0,0.0,0.0,1.0,0.0,642.0,0.0,0.0,0
17,6,0,0,6,2,1,10,1,1,2,...,1.0,60.0,1.0,13.0,1.0,3.0,614.0,1.0,1.0,0
26,5,1,1,5,5,1,10,1,1,2,...,1.0,36.0,1.0,52.0,1.0,0.0,907.0,1.0,1.0,0
39,2,0,0,2,2,0,10,1,1,2,...,1.0,65.0,0.0,13.0,1.0,3.0,556.0,0.0,0.0,0


## Building the model

## Spliting predictors and target variable

## Splitting data: train, validation, test

In [16]:
ncd_metadata.head()

Unnamed: 0,variable,description,model_keep,num_values,type
0,hhid,Case Identification,0,17141,numeric
1,hvidx,Line number,0,31,categorical
2,hv000,Country code and phase,0,1,categorical
3,hv001,Cluster number,0,600,categorical
4,hv002,Household number,0,184,categorical


In [17]:
y = data['hyperglycaemia']
X = data.drop(['hyperglycaemia'], axis=1)
X = to_categorical(input_df=X, input_meta=ncd_metadata)
X = pd.get_dummies(X)
X.head()

Unnamed: 0,hv009,hv010,hv011,hv012,hv013,hv014,hv017,hv035,hv040,hv041,...,shed1_0.0,shed1_1.0,shed1_2.0,shed1_3.0,shed3_0.0,shed3_1.0,shed3_2.0,shed3_3.0,shed3_4.0,shed3_5.0
12,6,0,0,6,2,1,1,1,10,4,...,0,1,0,0,0,1,0,0,0,0
16,6,0,0,6,2,1,1,1,10,4,...,1,0,0,0,1,0,0,0,0,0
17,6,0,0,6,2,1,1,1,10,4,...,0,1,0,0,0,1,0,0,0,0
26,5,1,1,5,5,1,1,1,10,2,...,0,1,0,0,0,1,0,0,0,0
39,2,0,0,2,2,0,1,0,10,2,...,1,0,0,0,1,0,0,0,0,0


In [18]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=84, test_size=0.2)

## Fitting the default model

In [19]:
model_hyperglycaemia = XGBClassifier(objective="binary:logistic", max_depth=9, eval_metric='error', min_child_weight=5, subsample=0.7, colsample=0.7)
model_hyperglycaemia.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.7,
       colsample_bylevel=1, colsample_bytree=1, eval_metric='error',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [20]:
from sklearn.metrics import confusion_matrix
y_predicted = model_hyperglycaemia.predict_proba(X_test)
y_predicted = y_predicted[:,1]
y_predicted = np.where(y_predicted > 0.5, 1, 0)
cm1= pd_ml.ConfusionMatrix(y_pred=y_predicted, y_true=y_test,)
cm2 = confusion_matrix(y_pred=y_predicted, y_true=y_test,)
print(cm1)
print(cm2)

Predicted  False  True  __all__
Actual                         
False         26     4       30
True           8     0        8
__all__       34     4       38
[[1302   25]
 [ 237  203]]


In [21]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.85      0.98      0.91      1327
           1       0.89      0.46      0.61       440

   micro avg       0.85      0.85      0.85      1767
   macro avg       0.87      0.72      0.76      1767
weighted avg       0.86      0.85      0.83      1767



In [22]:
feature_importance = pd.DataFrame.from_dict({'variable': X_train.columns, 'importance': model_hyperglycaemia.feature_importances_})
print(feature_importance.shape)

(622, 2)


## Checking available variables in SVRS

- Extract 95% of high influencing variables from DHS prediction
- Go through metadata file from SVRS file and list all variables
- Pick 20 variables or so to predict diabetes
- 

In [66]:
def read_metadata(files):
    metadata = {}
    for file in files:
        metadata[file] = pd.read_csv(file)
    return metadata
        

In [113]:
data.columns

Index(['psu_no', 'zila', 'upza', 'union', 'mauza', 'rmo', 'hh_no', 'tot_pop',
       'q_8', 'q_10', 'q_11', 'q_12', 'q_13', 'q_14', 'q_15', 'q_16', 'q_17',
       'q_18', 'q_19', 'q_20', 'q_21', 'q_22', 'hhsize', 'agecat1', 'agecat2',
       'resi', 'religion', 'divn', 'sex', 'agecatLT'],
      dtype='object')

In [82]:
feature_importance = feature_importance.merge(ncd_metadata, how='left', on='variable')
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.shape)
feature_importance.sort_values(by='importance', ascending=False)

(622, 14)


Unnamed: 0,variable,importance,description_x,model_keep_x,num_values_x,type_x,description_y,model_keep_y,num_values_y,type_y,description,model_keep,num_values,type
0,hv271,0.091121,Wealth index factor score (5 decimals),1.0,16353.0,numeric,Wealth index factor score (5 decimals),1.0,16353.0,numeric,Wealth index factor score (5 decimals),1.0,16353.0,numeric
1,shcmc,0.086465,CMC of birth of member,1.0,611.0,date,CMC of birth of member,1.0,611.0,date,CMC of birth of member,1.0,611.0,date
2,hv040,0.069172,Cluster altitude in meters,1.0,72.0,numeric,Cluster altitude in meters,1.0,72.0,numeric,Cluster altitude in meters,1.0,72.0,numeric
3,hv246i,0.037912,Owns Chickens/ ducks,1.0,60.0,numeric,Owns Chickens/ ducks,1.0,60.0,numeric,Owns Chickens/ ducks,1.0,60.0,numeric
4,hv108,0.030595,Education completed in single years,1.0,21.0,numeric,Education completed in single years,1.0,21.0,numeric,Education completed in single years,1.0,21.0,numeric
5,hv105,0.025607,Age of household members,1.0,98.0,numeric,Age of household members,1.0,98.0,numeric,Age of household members,1.0,98.0,numeric
6,hv013,0.020286,Number of de facto members,1.0,26.0,numeric,Number of de facto members,1.0,26.0,numeric,Number of de facto members,1.0,26.0,numeric
7,hv012,0.019621,Number of de jure members,1.0,24.0,numeric,Number of de jure members,1.0,24.0,numeric,Number of de jure members,1.0,24.0,numeric
8,hv252,0.018623,Frequency household members smoke inside the h...,1.0,6.0,numeric,Frequency household members smoke inside the h...,1.0,6.0,numeric,Frequency household members smoke inside the h...,1.0,6.0,numeric
9,idxh4,0.016960,Index to Household Schedule,1.0,31.0,numeric,Index to Household Schedule,1.0,31.0,numeric,Index to Household Schedule,1.0,31.0,numeric


In [24]:
feature_importance.to_csv("/Users/edinhamzic/Symphony/wb_bangladesh/Bangladesh/output/dhs/feature_importnace_dhs_hyperglycemia.csv")

### Interesting aspect of variable owning chicken/duck versus hyperglycemia

In [25]:
print(data[['hv246i','hyperglycaemia']].groupby(by=['hyperglycaemia']).mean())
print(data[['hv246i','hyperglycaemia']].groupby(by=['hyperglycaemia']).std())
print(data[['hv246i','hyperglycaemia']].groupby(by=['hyperglycaemia']).max())
print(data[['hv246i','hyperglycaemia']].groupby(by=['hyperglycaemia']).min())

                  hv246i
hyperglycaemia          
0               5.036339
1               4.068997
                  hv246i
hyperglycaemia          
0               7.761393
1               6.816976
                hv246i
hyperglycaemia        
0                   95
1                   95
                hv246i
hyperglycaemia        
0                    0
1                    0


## Building a model on a subset of the most informative variables

In [26]:
print(feature_importance.shape)
subset = threshold_cut(input_df=feature_importance, threshold=0.95)
print(subset.shape)

(622, 6)
(455, 7)


In [28]:
subset.head(100)

Unnamed: 0,variable,importance,description,model_keep,num_values,type,cumsum
110,hv214_12,0.000998,,,,,0.950117
578,sh237_1.0,0.000998,,,,,0.951114
53,hv016_8,0.000998,,,,,0.952112
70,hv201_21,0.000998,,,,,0.953110
259,hv237d_0,0.000998,,,,,0.954107
179,hv220_52,0.000998,,,,,0.955105
180,hv220_53,0.000998,,,,,0.956102
545,hv106_1,0.000998,,,,,0.957100
608,shwh_3.0,0.000998,,,,,0.958098
318,hv270_2,0.000998,,,,,0.959095


In [105]:
print(X.shape)
X = X[subset['variable'].unique()]
print(X.shape)


(8835, 622)
(8835, 455)


In [106]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=84, test_size=0.2)
X_train, X_validation, y_train, y_validation = train_test_split(X_train, y_train, random_state=84, test_size=0.2)

## Fitting the default model
model_hyperglycaemia = XGBClassifier(objective="binary:logistic", max_depth=9, eval_metric='error', min_child_weight=5, subsample=0.7, colsample=0.7)
model_hyperglycaemia.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample=0.7,
       colsample_bylevel=1, colsample_bytree=1, eval_metric='error',
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=9,
       min_child_weight=5, missing=None, n_estimators=100, n_jobs=1,
       nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=0.7)

In [107]:
from sklearn.metrics import confusion_matrix
y_predicted = model_hyperglycaemia.predict_proba(X_test)
y_predicted = y_predicted[:,1]
y_predicted = np.where(y_predicted > 0.5, 1, 0)
cm1= pd_ml.ConfusionMatrix(y_pred=y_predicted, y_true=y_test,)
cm2 = confusion_matrix(y_pred=y_predicted, y_true=y_test,)
print(cm1)
print(cm2)

Predicted  False  True  __all__
Actual                         
False         25     5       30
True           8     0        8
__all__       33     5       38
[[1282   45]
 [ 267  173]]


In [108]:
print(classification_report(y_test, y_predicted))

              precision    recall  f1-score   support

           0       0.83      0.97      0.89      1327
           1       0.79      0.39      0.53       440

   micro avg       0.82      0.82      0.82      1767
   macro avg       0.81      0.68      0.71      1767
weighted avg       0.82      0.82      0.80      1767



In [109]:
feature_importance = pd.DataFrame.from_dict({'variable': X_train.columns, 'importance': model_hyperglycaemia.feature_importances_})
print(feature_importance.shape)

(455, 2)


In [110]:
feature_importance = feature_importance.merge(ncd_metadata, how='left', on='variable')
feature_importance = feature_importance.sort_values(by='importance', ascending=False)
print(feature_importance.shape)
feature_importance

(455, 6)


Unnamed: 0,variable,importance,description,model_keep,num_values,type
170,sh230,0.059061,Index to household schedule,1.0,26.0,numeric
226,hv244_1,0.040858,,,,
173,hv247_1,0.028317,,,,
73,sh13_1.0,0.026699,,,,
227,hv246_1,0.024272,,,,
163,hv026_3,0.021845,,,,
296,sh104b_0,0.021036,,,,
280,hv118_1,0.021036,,,,
43,sh110g_1,0.020227,,,,
107,hv208_1,0.019013,,,,
