In [200]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import mord as m

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

seed = 667

In [212]:
df = pd.read_csv('dementia.tsv', sep='\t')

In [213]:
df

Unnamed: 0,Subject ID,Group,Visit,Sex,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Demented,Group_num,CDR_int
0,1,Nondemented,1,1,87,14,2.0,27.0,0.0,1987,0.696,0,0,0
1,1,Nondemented,2,1,88,14,2.0,30.0,0.0,2004,0.681,0,0,0
2,4,Nondemented,1,0,88,18,3.0,28.0,0.0,1215,0.710,0,0,0
3,4,Nondemented,2,0,90,18,3.0,27.0,0.0,1200,0.718,0,0,0
4,5,Nondemented,1,1,80,12,4.0,28.0,0.0,1689,0.712,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,185,Demented,2,1,82,16,1.0,28.0,0.5,1693,0.694,1,1,1
350,185,Demented,3,1,86,16,1.0,26.0,0.5,1688,0.675,1,1,1
351,186,Nondemented,1,0,61,13,2.0,30.0,0.0,1319,0.801,0,0,0
352,186,Nondemented,2,0,63,13,2.0,30.0,0.0,1327,0.796,0,0,0


In [214]:
df = df.drop(columns=['Subject ID'])

In [215]:
df

Unnamed: 0,Group,Visit,Sex,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Demented,Group_num,CDR_int
0,Nondemented,1,1,87,14,2.0,27.0,0.0,1987,0.696,0,0,0
1,Nondemented,2,1,88,14,2.0,30.0,0.0,2004,0.681,0,0,0
2,Nondemented,1,0,88,18,3.0,28.0,0.0,1215,0.710,0,0,0
3,Nondemented,2,0,90,18,3.0,27.0,0.0,1200,0.718,0,0,0
4,Nondemented,1,1,80,12,4.0,28.0,0.0,1689,0.712,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,Demented,2,1,82,16,1.0,28.0,0.5,1693,0.694,1,1,1
350,Demented,3,1,86,16,1.0,26.0,0.5,1688,0.675,1,1,1
351,Nondemented,1,0,61,13,2.0,30.0,0.0,1319,0.801,0,0,0
352,Nondemented,2,0,63,13,2.0,30.0,0.0,1327,0.796,0,0,0


Let's also define a framework to do brute-force model selection with 5-fold cross validation

In [216]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [217]:
# For brute-force search, we need to get all sublists from the list of columns except the empty sublist
def sublists(columns):
    sublists = []
    
    for i in range(len(columns) + 1): 
        for j in range(i + 1, len(columns) + 1): 
            sublist = columns[i:j]  
            sublists.append(sublist) 
            
    return sublists 

# Here we create a brute-force search function which runs 5-fold cross-validation for each sublist
# We can define which objective function to use for optimization
def brute_search(model, data, response, penalty, obj_fn):
    resp = data[response]
    preds = data.loc[:, data.columns != response]
    preds = sm.add_constant(preds)
    search_space = sublists(preds.columns.tolist())
    
    bestobj = -np.inf
    best_cols = None
    
    for cols in search_space:
        objs = []
        for train_index, test_index in skf.split(preds[cols], resp):
            res = model(resp.iloc[train_index], preds.iloc[train_index][cols]).fit(maxiter=100, disp=False)
#             tmpobj = res.llf - penalty*len(cols)
            test_preds = res.predict(preds.iloc[test_index][cols])
            objs.append(obj_fn(resp.iloc[test_index], test_preds))
        tmpobj = sum(objs) / len(objs)
#         print('{0}: {1}'.format(cols, tmpobj))
#         print(objs)
        if tmpobj > bestobj:
            bestobj = tmpobj
            best_cols = cols
            
    res = model(resp, preds[best_cols]).fit(maxiter=100)
    return bestobj, best_cols, res

In [218]:
# Split data into train and test sets
np.random.seed(667)
train_mask = np.random.rand(df.shape[0]) < .2

test_df = df[~train_mask]
train_df = df[train_mask]
test_df = sm.add_constant(test_df)

print('Training data:')
print('total observations: ' + str(train_df.shape[0]))
print(df.head())
print('Testing data:')
print('total observations: ' + str(test_df.shape[0]))
print(test_df.head())

Training data:
total observations: 65
         Group  Visit  Sex  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV  Demented  \
0  Nondemented      1    1   87    14  2.0  27.0  0.0  1987  0.696         0   
1  Nondemented      2    1   88    14  2.0  30.0  0.0  2004  0.681         0   
2  Nondemented      1    0   88    18  3.0  28.0  0.0  1215  0.710         0   
3  Nondemented      2    0   90    18  3.0  27.0  0.0  1200  0.718         0   
4  Nondemented      1    1   80    12  4.0  28.0  0.0  1689  0.712         0   

   Group_num  CDR_int  
0          0        0  
1          0        0  
2          0        0  
3          0        0  
4          0        0  
Testing data:
total observations: 289
   const        Group  Visit  Sex  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV  \
0    1.0  Nondemented      1    1   87    14  2.0  27.0  0.0  1987  0.696   
2    1.0  Nondemented      1    0   88    18  3.0  28.0  0.0  1215  0.710   
3    1.0  Nondemented      2    0   90    18  3.0  27.0  0.0  1200  

## Logistic Regression on `Demented` with all covariates

In [181]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']

In [182]:
res = sm.Logit(df.Demented, sm.add_constant(df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.349345
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                  354
Model:                          Logit   Df Residuals:                      345
Method:                           MLE   Df Model:                            8
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.4860
Time:                        14:09:40   Log-Likelihood:                -123.67
converged:                       True   LL-Null:                       -240.60
Covariance Type:            nonrobust   LLR p-value:                 4.503e-46
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         63.8337      9.555      6.680      0.000      45.105      82.562
Visit          0.1695      0.

In [184]:
res = sm.Logit(train_df.Demented, sm.add_constant(train_df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.253658
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                   65
Model:                          Logit   Df Residuals:                       56
Method:                           MLE   Df Model:                            8
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.6263
Time:                        14:09:44   Log-Likelihood:                -16.488
converged:                       True   LL-Null:                       -44.119
Covariance Type:            nonrobust   LLR p-value:                 3.924e-09
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        127.9413     44.689      2.863      0.004      40.352     215.530
Visit         -0.1147      0

In [185]:
preds = res.predict(sm.add_constant(test_df[covariates]))
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print(loglike_test)

-159.6626093153062


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [186]:
roc_auc_score(test_df.Demented, preds)

0.8771645021645021

In [187]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.7993079584775087

### Model Selection with brute-force search

In [188]:
def demented_obj_fn(gt, preds):
    return accuracy_score(gt, [0 if i < 0.5 else 1 for i in preds])

In [189]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(sm.Logit, train_df[covariates + ['Demented']], 'Demented', 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('aic: {0}'.format(bf_res.aic))
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
Optimization terminated successfully.
         Current function value: 0.253658
         Iterations 10
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                   65
Model:                          Logit   Df Residuals:                       56
Method:                           MLE   Df Model:                            8
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.6263
Time:                        14:09:48   Log-Likelihood:                -16.488
converged:                       True   LL-Null:                       -44.119
Covariance Type:            nonrobust   LLR p-value:                 3.924e-09
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const        127.9413     44.689      2.863      0.004      40.352     215.530
Visit         -0

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Multinomial Logistic Regression on `Group` with all covariates

In [192]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']

In [193]:
model = sm.MNLogit(df['Group'], df[covariates])
res = model.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.633116
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                  Group   No. Observations:                  354
Model:                        MNLogit   Df Residuals:                      338
Method:                           MLE   Df Model:                           14
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.3249
Time:                        14:10:00   Log-Likelihood:                -224.12
converged:                       True   LL-Null:                       -331.98
Covariance Type:            nonrobust   LLR p-value:                 3.330e-38
   Group=Demented       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
Visit                -0.4706      0.256     -1.835      0.067      -0.973       0.032
Age     

In [194]:
res = sm.MNLogit(train_df.Group_num, sm.add_constant(train_df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.526265
         Iterations 9
                          MNLogit Regression Results                          
Dep. Variable:              Group_num   No. Observations:                   65
Model:                        MNLogit   Df Residuals:                       47
Method:                           MLE   Df Model:                           16
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.4543
Time:                        14:10:01   Log-Likelihood:                -34.207
converged:                       True   LL-Null:                       -62.687
Covariance Type:            nonrobust   LLR p-value:                 1.688e-06
Group_num=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          82.7635     33.209      2.492      0.013      17.675     147.852
Visit          -0.8681    

In [195]:
preds = res.predict(sm.add_constant(test_df[covariates]))

In [196]:
roc_auc_score(test_df.Demented, preds.max(axis=1))

0.6350846123573397

In [197]:
accuracy_score(test_df.Demented, preds.idxmax(axis=1))

0.7612456747404844

### Model Selection with brute-force search

In [198]:
def group_obj_fn(gt, preds):
    return accuracy_score(gt, preds.idxmax(axis=1))

In [199]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(sm.MNLogit, train_df[covariates + ['Group_num']], 'Group_num', 1, group_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds.idxmax(axis=1)) + (1-test_df.Demented)*np.log(1-preds.idxmax(axis=1))).sum()
print('aic: {0}'.format(bf_res.aic))
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented,  preds.max(axis=1))))
print('acc: {0}:'.format(accuracy_score(test_df.Demented,  preds.idxmax(axis=1))))
print()

Brute-force
Optimization terminated successfully.
         Current function value: 0.709326
         Iterations 7
                          MNLogit Regression Results                          
Dep. Variable:              Group_num   No. Observations:                   65
Model:                        MNLogit   Df Residuals:                       61
Method:                           MLE   Df Model:                            2
Date:                Sun, 06 Dec 2020   Pseudo R-squ.:                  0.2645
Time:                        14:10:04   Log-Likelihood:                -46.106
converged:                       True   LL-Null:                       -62.687
Covariance Type:            nonrobust   LLR p-value:                 6.296e-08
Group_num=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
nWBV           26.8738      7.720      3.481      0.000      11.743      42.005
MMSE          

  result = getattr(ufunc, method)(*inputs, **kwargs)


## Ordinal Logistic Regression on `CDR` with all covariates

In [205]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE']

In [219]:
model = m.LogisticIT()

In [220]:
model.fit(train_df[covariates], train_df['CDR_int'])

LogisticIT()

In [222]:
preds = model.predict(test_df[covariates])

In [223]:
accuracy_score(test_df.CDR_int, preds)

0.7197231833910035

In [225]:
accuracy_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])

0.8062283737024222

In [226]:
roc_auc_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])

0.7778433687524596