In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from itertools import combinations
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold

seed = 667

In [2]:
# ignore convergence warnings
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning) 
warnings.simplefilter('ignore', RuntimeWarning) 
warnings.simplefilter('ignore', UserWarning) 

In [3]:
df = pd.read_csv('dementia.tsv', sep='\t')
df = df.rename(columns={'Subject ID': 'ID', 'MR Delay': 'Delay'})

In [4]:
min_max_scaler = preprocessing.MinMaxScaler()
df['eTIV'] = min_max_scaler.fit_transform(df[['eTIV']])

In [5]:
df

Unnamed: 0,ID,Group,Visit,Delay,Sex,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Demented,Group_num,CDR_int
0,1,Nondemented,1,0,1,87,14,2.0,27.0,0.0,0.981069,0.696,0,0,0
1,1,Nondemented,2,457,1,88,14,2.0,30.0,0.0,1.000000,0.681,0,0,0
2,4,Nondemented,1,0,0,88,18,3.0,28.0,0.0,0.121381,0.710,0,0,0
3,4,Nondemented,2,538,0,90,18,3.0,27.0,0.0,0.104677,0.718,0,0,0
4,5,Nondemented,1,0,1,80,12,4.0,28.0,0.0,0.649220,0.712,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,185,Demented,2,842,1,82,16,1.0,28.0,0.5,0.653675,0.694,1,1,1
350,185,Demented,3,2297,1,86,16,1.0,26.0,0.5,0.648107,0.675,1,1,1
351,186,Nondemented,1,0,0,61,13,2.0,30.0,0.0,0.237194,0.801,0,0,0
352,186,Nondemented,2,763,0,63,13,2.0,30.0,0.0,0.246102,0.796,0,0,0


In [6]:
# Split data into train and test sets
np.random.seed(667)
train_mask = np.random.rand(df.shape[0]) > .2

test_df = df[~train_mask]
train_df = df[train_mask]
test_df = sm.add_constant(test_df)

print('Training data:')
print('total observations: ' + str(train_df.shape[0]))
print(df.head())
print('Testing data:')
print('total observations: ' + str(test_df.shape[0]))
print(test_df.head())

Training data:
total observations: 289
   ID        Group  Visit  Delay  Sex  Age  EDUC  SES  MMSE  CDR      eTIV  \
0   1  Nondemented      1      0    1   87    14  2.0  27.0  0.0  0.981069   
1   1  Nondemented      2    457    1   88    14  2.0  30.0  0.0  1.000000   
2   4  Nondemented      1      0    0   88    18  3.0  28.0  0.0  0.121381   
3   4  Nondemented      2    538    0   90    18  3.0  27.0  0.0  0.104677   
4   5  Nondemented      1      0    1   80    12  4.0  28.0  0.0  0.649220   

    nWBV  Demented  Group_num  CDR_int  
0  0.696         0          0        0  
1  0.681         0          0        0  
2  0.710         0          0        0  
3  0.718         0          0        0  
4  0.712         0          0        0  
Testing data:
total observations: 65
    const  ID        Group  Visit  Delay  Sex  Age  EDUC  SES  MMSE  CDR  \
1     1.0   1  Nondemented      2    457    1   88    14  2.0  30.0  0.0   
10    1.0   9     Demented      2    576    1   69    12 

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [8]:
# For brute-force search, we need to get all sublists from the list of columns except the empty sublist
def sublists(columns):
    sublists = []
    
    for i in range(0, len(columns)+1):
        temp = [list(x) for x in combinations(columns, i)]
        if len(temp)>0:
            sublists.extend(temp)
            
    sublists.remove([])
            
    return sublists 

# Here we create a brute-force search function which runs 5-fold cross-validation for each sublist
# We can define which objective function to use for optimization
def brute_search(data, response, r_vars, covariates, ident, penalty, obj_fn):
    resp = data[response]
    preds = data[covariates]
    preds = sm.add_constant(preds)
    preds_r = data[r_vars]
    search_space = sublists(preds.columns.tolist())
    
    bestobj = -np.inf
    best_cols = None
    
    for cols in search_space:
        objs = []
        for train_index, test_index in skf.split(preds[cols], resp):
            res = sm.BinomialBayesMixedGLM(resp.iloc[train_index], preds.iloc[train_index][covariates], preds_r.iloc[train_index][random_vars], 
                                           ident).fit_vb()
#             tmpobj = res.llf - penalty*len(cols)
            test_preds = res.predict(preds.iloc[test_index][cols])
            objs.append(obj_fn(resp.iloc[test_index], test_preds))
        tmpobj = sum(objs) / len(objs)
#         print('{0}: {1}'.format(cols, tmpobj))
#         print(objs)
        if tmpobj > bestobj:
            bestobj = tmpobj
            best_cols = cols
            
    res = sm.BinomialBayesMixedGLM(resp.iloc[test_index], preds.iloc[test_index][best_cols], preds_r.iloc[test_index][random_vars], 
                                  ident).fit_vb()
    return bestobj, best_cols, res

# GLMM for `Demented` response
## `Subject` random variable

In [9]:
covariates = ['Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE', 'Visit']
random_vars = ['ID']

In [10]:
res = sm.BinomialBayesMixedGLM(train_df.Demented, sm.add_constant(train_df[covariates]), train_df[random_vars], [0]).fit_vb()
print(res.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M     5.8380   0.1527                      
Age      M     0.0604   0.0020                      
Sex      M     1.6856   0.2154                      
EDUC     M     0.0396   0.0102                      
SES      M     0.2571   0.0555                      
eTIV     M    -1.1705   0.3078                      
nWBV     M     2.2929   0.2081                      
MMSE     M    -0.5307   0.0054                      
Visit    M    -0.0310   0.0717                      
VC_1     V    -0.9963   0.9963 0.369   0.050   2.708
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [11]:
preds = res.predict(sm.add_constant(test_df[covariates + random_vars]))

In [12]:
roc_auc_score(test_df.Demented, preds)

0.5282651072124755

In [13]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.5846153846153846

### Model selection

In [14]:
def demented_obj_fn(gt, preds):
    return accuracy_score(gt, [0 if i < 0.5 else 1 for i in preds])

In [15]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(train_df, 'Demented', random_vars, covariates, [0], 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
Age      M     0.0798   0.0047                      
Sex      M     1.7051   0.5317                      
EDUC     M    -0.0786   0.0236                      
SES      M     0.6693   0.1470                      
eTIV     M     0.0719   0.6908                      
nWBV     M     1.8251   0.4753                      
MMSE     M    -0.3952   0.0128                      
Visit    M    -0.1524   0.2019                      
VC_1     V    -0.9724   0.9735 0.378   0.054   2.650
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations
llt: -52.155481602929235:
auc: 0.8323586744639376:
acc: 0.6923076923076923:



## `Visit` random variable

In [16]:
covariates = ['Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']
random_vars = ['Visit']

In [17]:
res = sm.BinomialBayesMixedGLM(train_df.Demented, sm.add_constant(train_df[covariates]), train_df[random_vars], [0]).fit_vb()
print(res.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M     6.0884   0.1499                      
Age      M     0.0486   0.0019                      
Sex      M     1.6152   0.2130                      
EDUC     M     0.0560   0.0100                      
SES      M     0.2557   0.0546                      
eTIV     M    -1.0623   0.3030                      
nWBV     M     2.4465   0.2044                      
MMSE     M    -0.4940   0.0053                      
VC_1     V    -0.8680   0.8895 0.420   0.071   2.487
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [18]:
preds = res.predict(sm.add_constant(test_df[covariates + random_vars]))

In [19]:
roc_auc_score(test_df.Demented, preds)

0.8421052631578948

In [20]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.7538461538461538

### Model Selection

In [21]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(train_df, 'Demented', random_vars, covariates, [0], 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
            Binomial Mixed GLM Results
     Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
---------------------------------------------------
Age     M     0.0708   0.0042                      
Sex     M     0.9484   0.4892                      
EDUC    M    -0.0384   0.0210                      
SES     M     0.3380   0.1319                      
eTIV    M     0.2542   0.6272                      
nWBV    M     2.0193   0.4306                      
MMSE    M    -0.2898   0.0116                      
VC_1    V    -0.6929   0.7871 0.500   0.104   2.414
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations
llt: -31.411615872931076:
auc: 0.857699805068226:
acc: 0.7846153846153846:



## `Subject` and `Visit` random variable

In [22]:
covariates = ['Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']
random_vars = ['Visit', 'ID']

In [23]:
res = sm.BinomialBayesMixedGLM(train_df.Demented, sm.add_constant(train_df[covariates]), train_df[random_vars], [0, 1]).fit_vb()
print(res.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M     5.8439   0.1526                      
Age      M     0.0600   0.0020                      
Sex      M     1.6826   0.2153                      
EDUC     M     0.0399   0.0102                      
SES      M     0.2580   0.0554                      
eTIV     M    -1.1743   0.3077                      
nWBV     M     2.2960   0.2080                      
MMSE     M    -0.5310   0.0054                      
VC_1     V    -0.8679   0.8894 0.420   0.071   2.487
VC_2     V    -0.9963   0.9963 0.369   0.050   2.708
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [24]:
preds = res.predict(sm.add_constant(test_df[covariates + random_vars]))

In [25]:
roc_auc_score(test_df.Demented, preds)

0.530214424951267

In [26]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.5846153846153846

### Model Selection

In [27]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(train_df, 'Demented', random_vars, covariates, [0, 1], 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
            Binomial Mixed GLM Results
     Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
---------------------------------------------------
Age     M     0.0763   0.0046                      
Sex     M     1.7431   0.5297                      
EDUC    M    -0.0784   0.0236                      
SES     M     0.6741   0.1463                      
eTIV    M     0.0627   0.6880                      
nWBV    M     1.8101   0.4735                      
MMSE    M    -0.3910   0.0128                      
VC_1    V    -0.6540   0.7688 0.520   0.112   2.419
VC_2    V    -0.9730   0.9740 0.378   0.054   2.651
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations
llt: -50.35526186075053:
auc: 0.8382066276803118:
acc: 0.6923076923076923:



## `Visit` and `Sex` random variable

In [28]:
covariates = ['Age', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']
random_vars = ['Visit', 'Sex']

In [30]:
res = sm.BinomialBayesMixedGLM(train_df.Demented, sm.add_constant(train_df[covariates]), train_df[random_vars], [0, 1]).fit_vb()
print(res.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M     6.0976   0.1494                      
Age      M     0.0481   0.0019                      
EDUC     M     0.0567   0.0099                      
SES      M     0.2584   0.0544                      
eTIV     M    -0.9605   0.3022                      
nWBV     M     2.4169   0.2037                      
MMSE     M    -0.4930   0.0053                      
VC_1     V    -0.8694   0.8904 0.419   0.071   2.488
VC_2     V     0.4910   0.5011 1.634   0.600   4.452
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [31]:
preds = res.predict(sm.add_constant(test_df[covariates + random_vars]))

In [32]:
roc_auc_score(test_df.Demented, preds)

0.8430799220272903

In [33]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.7384615384615385

### Model Selection

In [35]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(train_df, 'Demented', random_vars, covariates, [0, 1], 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
            Binomial Mixed GLM Results
     Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
---------------------------------------------------
Age     M     0.0704   0.0041                      
EDUC    M    -0.0388   0.0207                      
SES     M     0.2679   0.1296                      
eTIV    M     0.7260   0.6219                      
nWBV    M     2.0384   0.4245                      
MMSE    M    -0.2810   0.0114                      
VC_1    V    -0.6842   0.7829 0.504   0.105   2.415
VC_2    V    -0.1545   0.6096 0.857   0.253   2.900
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations
llt: -32.09742998705076:
auc: 0.8323586744639376:
acc: 0.7846153846153846:

