In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import mord as m

from itertools import combinations
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

seed = 667

In [2]:
# ignore convergence warnings
import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning) 
warnings.simplefilter('ignore', RuntimeWarning) 
warnings.simplefilter('ignore', UserWarning) 

In [3]:
df = pd.read_csv('dementia.tsv', sep='\t')

In [4]:
df

Unnamed: 0,Subject ID,Group,Visit,MR Delay,Sex,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Demented,Group_num,CDR_int
0,1,Nondemented,1,0,1,87,14,2.0,27.0,0.0,1987,0.696,0,0,0
1,1,Nondemented,2,457,1,88,14,2.0,30.0,0.0,2004,0.681,0,0,0
2,4,Nondemented,1,0,0,88,18,3.0,28.0,0.0,1215,0.710,0,0,0
3,4,Nondemented,2,538,0,90,18,3.0,27.0,0.0,1200,0.718,0,0,0
4,5,Nondemented,1,0,1,80,12,4.0,28.0,0.0,1689,0.712,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,185,Demented,2,842,1,82,16,1.0,28.0,0.5,1693,0.694,1,1,1
350,185,Demented,3,2297,1,86,16,1.0,26.0,0.5,1688,0.675,1,1,1
351,186,Nondemented,1,0,0,61,13,2.0,30.0,0.0,1319,0.801,0,0,0
352,186,Nondemented,2,763,0,63,13,2.0,30.0,0.0,1327,0.796,0,0,0


In [5]:
df = df.drop(columns=['Subject ID'])

In [6]:
df

Unnamed: 0,Group,Visit,MR Delay,Sex,Age,EDUC,SES,MMSE,CDR,eTIV,nWBV,Demented,Group_num,CDR_int
0,Nondemented,1,0,1,87,14,2.0,27.0,0.0,1987,0.696,0,0,0
1,Nondemented,2,457,1,88,14,2.0,30.0,0.0,2004,0.681,0,0,0
2,Nondemented,1,0,0,88,18,3.0,28.0,0.0,1215,0.710,0,0,0
3,Nondemented,2,538,0,90,18,3.0,27.0,0.0,1200,0.718,0,0,0
4,Nondemented,1,0,1,80,12,4.0,28.0,0.0,1689,0.712,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
349,Demented,2,842,1,82,16,1.0,28.0,0.5,1693,0.694,1,1,1
350,Demented,3,2297,1,86,16,1.0,26.0,0.5,1688,0.675,1,1,1
351,Nondemented,1,0,0,61,13,2.0,30.0,0.0,1319,0.801,0,0,0
352,Nondemented,2,763,0,63,13,2.0,30.0,0.0,1327,0.796,0,0,0


Let's also define a framework to do brute-force model selection with 5-fold cross validation

In [7]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=seed)

In [8]:
# For brute-force search, we need to get all sublists from the list of columns except the empty sublist
def sublists(columns):
    sublists = []
    
    for i in range(0, len(columns)+1):
        temp = [list(x) for x in combinations(columns, i)]
        if len(temp)>0:
            sublists.extend(temp)
            
    sublists.remove([])
            
    return sublists 

# Here we create a brute-force search function which runs 5-fold cross-validation for each sublist
# We can define which objective function to use for optimization
def brute_search(model, data, response, penalty, obj_fn):
    resp = data[response]
    preds = data.loc[:, data.columns != response]
    preds = sm.add_constant(preds)
    search_space = sublists(preds.columns.tolist())
    
    bestobj = -np.inf
    best_cols = None
    
    for cols in search_space:
        objs = []
        for train_index, test_index in skf.split(preds[cols], resp):
            res = model(resp.iloc[train_index], preds.iloc[train_index][cols]).fit(maxiter=100, disp=False)
#             tmpobj = res.llf - penalty*len(cols)
            test_preds = res.predict(preds.iloc[test_index][cols])
            objs.append(obj_fn(resp.iloc[test_index], test_preds))
        tmpobj = sum(objs) / len(objs)
#         print('{0}: {1}'.format(cols, tmpobj))
#         print(objs)
        if tmpobj > bestobj:
            bestobj = tmpobj
            best_cols = cols
            
    res = model(resp, preds[best_cols]).fit(maxiter=100)
    return bestobj, best_cols, res

In [9]:
# Split data into train and test sets
np.random.seed(667)
train_mask = np.random.rand(df.shape[0]) > .2

test_df = df[~train_mask]
train_df = df[train_mask]
test_df = sm.add_constant(test_df)

print('Training data:')
print('total observations: ' + str(train_df.shape[0]))
print(df.head())
print('Testing data:')
print('total observations: ' + str(test_df.shape[0]))
print(test_df.head())

Training data:
total observations: 289
         Group  Visit  MR Delay  Sex  Age  EDUC  SES  MMSE  CDR  eTIV   nWBV  \
0  Nondemented      1         0    1   87    14  2.0  27.0  0.0  1987  0.696   
1  Nondemented      2       457    1   88    14  2.0  30.0  0.0  2004  0.681   
2  Nondemented      1         0    0   88    18  3.0  28.0  0.0  1215  0.710   
3  Nondemented      2       538    0   90    18  3.0  27.0  0.0  1200  0.718   
4  Nondemented      1         0    1   80    12  4.0  28.0  0.0  1689  0.712   

   Demented  Group_num  CDR_int  
0         0          0        0  
1         0          0        0  
2         0          0        0  
3         0          0        0  
4         0          0        0  
Testing data:
total observations: 65
    const        Group  Visit  MR Delay  Sex  Age  EDUC  SES  MMSE  CDR  eTIV  \
1     1.0  Nondemented      2       457    1   88    14  2.0  30.0  0.0  2004   
10    1.0     Demented      2       576    1   69    12  2.0  24.0  0.5  1480

## Logistic Regression on `Demented` with all covariates

In [10]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE', 'MR Delay']

In [11]:
res = sm.Logit(df.Demented, sm.add_constant(df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.346050
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                  354
Model:                          Logit   Df Residuals:                      344
Method:                           MLE   Df Model:                            9
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4909
Time:                        17:49:30   Log-Likelihood:                -122.50
converged:                       True   LL-Null:                       -240.60
Covariance Type:            nonrobust   LLR p-value:                 8.133e-46
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         65.7569      9.821      6.695      0.000      46.508      85.006
Visit         -0.5293      0.

In [12]:
res = sm.Logit(train_df.Demented, sm.add_constant(train_df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.354222
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                  289
Model:                          Logit   Df Residuals:                      279
Method:                           MLE   Df Model:                            9
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4790
Time:                        17:49:30   Log-Likelihood:                -102.37
converged:                       True   LL-Null:                       -196.48
Covariance Type:            nonrobust   LLR p-value:                 9.699e-36
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         62.8729     10.566      5.950      0.000      42.164      83.582
Visit         -0.4952      0.

In [13]:
preds = res.predict(sm.add_constant(test_df[covariates]))
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print(loglike_test)

-20.52862368719089


In [14]:
roc_auc_score(test_df.Demented, preds)

0.9278752436647174

In [15]:
accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])

0.8769230769230769

### Model Selection with brute-force search

In [16]:
def demented_obj_fn(gt, preds):
    return accuracy_score(gt, [0 if i < 0.5 else 1 for i in preds])

In [17]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(sm.Logit, train_df[covariates + ['Demented']], 'Demented', 1, demented_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds) + (1-test_df.Demented)*np.log(1-preds)).sum()
print('aic: {0}'.format(bf_res.aic))
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, preds)))
print('acc: {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 0.5 else 1 for i in preds])))
print()

Brute-force
Optimization terminated successfully.
         Current function value: 0.381432
         Iterations 8
                           Logit Regression Results                           
Dep. Variable:               Demented   No. Observations:                  289
Model:                          Logit   Df Residuals:                      282
Method:                           MLE   Df Model:                            6
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4390
Time:                        17:49:49   Log-Likelihood:                -110.23
converged:                       True   LL-Null:                       -196.48
Covariance Type:            nonrobust   LLR p-value:                 1.330e-34
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         34.8669      5.618      6.207      0.000      23.857      45.877
Visit         -0.

## Multinomial Logistic Regression on `Group` with all covariates

In [18]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE', 'MR Delay']

In [19]:
model = sm.MNLogit(df['Group'], sm.add_constant(df[covariates]))
res = model.fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.518842
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:                  Group   No. Observations:                  354
Model:                        MNLogit   Df Residuals:                      334
Method:                           MLE   Df Model:                           18
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4467
Time:                        17:49:49   Log-Likelihood:                -183.67
converged:                       True   LL-Null:                       -331.98
Covariance Type:            nonrobust   LLR p-value:                 2.384e-52
   Group=Demented       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------
const                36.4864     12.632      2.888      0.004      11.728      61.245
Visit   

In [20]:
res = sm.MNLogit(train_df.Group_num, sm.add_constant(train_df[covariates])).fit()
print(res.summary())

Optimization terminated successfully.
         Current function value: 0.499118
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:              Group_num   No. Observations:                  289
Model:                        MNLogit   Df Residuals:                      269
Method:                           MLE   Df Model:                           18
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4641
Time:                        17:49:49   Log-Likelihood:                -144.25
converged:                       True   LL-Null:                       -269.15
Covariance Type:            nonrobust   LLR p-value:                 8.902e-43
Group_num=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          76.7553     12.596      6.094      0.000      52.067     101.443
Visit          -0.4649    

In [21]:
preds = res.predict(sm.add_constant(test_df[covariates]))

In [22]:
roc_auc_score(test_df.Demented, preds.max(axis=1))

0.5458089668615984

In [23]:
accuracy_score(test_df.Demented,  [0 if i < 1 else 1 for i in preds.idxmax(axis=1)])

0.8153846153846154

In [24]:
accuracy_score(test_df.Group_num,  preds.idxmax(axis=1))

0.7230769230769231

### Model Selection with brute-force search

In [25]:
def group_obj_fn(gt, preds):
    return accuracy_score(gt, preds.idxmax(axis=1))

In [26]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search(sm.MNLogit, train_df[covariates + ['Group_num']], 'Group_num', 1, group_obj_fn)
print(bf_res.summary())
preds = bf_res.predict(test_df[bf_cols])
loglike_test = (test_df.Demented*np.log(preds.idxmax(axis=1)) + (1-test_df.Demented)*np.log(1-preds.idxmax(axis=1))).sum()
print('aic: {0}'.format(bf_res.aic))
print('llt: {0}:'.format(loglike_test))
print('auc: {0}:'.format(roc_auc_score(test_df.Demented,  preds.max(axis=1))))
print('acc (converted): {0}:'.format(accuracy_score(test_df.Demented,  [0 if i < 1 else 1 for i in preds.idxmax(axis=1)])))
print('acc (no conversion): {0}:'.format(accuracy_score(test_df.Group_num,  preds.idxmax(axis=1))))
print()

Brute-force
Optimization terminated successfully.
         Current function value: 0.556847
         Iterations 8
                          MNLogit Regression Results                          
Dep. Variable:              Group_num   No. Observations:                  289
Model:                        MNLogit   Df Residuals:                      275
Method:                           MLE   Df Model:                           12
Date:                Mon, 07 Dec 2020   Pseudo R-squ.:                  0.4021
Time:                        17:50:29   Log-Likelihood:                -160.93
converged:                       True   LL-Null:                       -269.15
Covariance Type:            nonrobust   LLR p-value:                 1.295e-39
Group_num=1       coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const          67.1113     10.695      6.275      0.000      46.150      88.073
Visit         

## Ordinal Logistic Regression on `CDR` with all covariates

In [27]:
covariates = ['Visit', 'Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'nWBV', 'MMSE', 'MR Delay']

In [28]:
model = m.LogisticIT().fit(sm.add_constant(train_df[covariates]), train_df['CDR_int'])

In [29]:
preds = model.predict(sm.add_constant(test_df[covariates]))

In [30]:
accuracy_score(test_df.CDR_int, preds)

0.7076923076923077

In [31]:
accuracy_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])

0.8

In [32]:
roc_auc_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])

0.7699805068226121

### Model Selection with brute-force search

In [33]:
def cdr_obj_fn(gt, preds):
    return accuracy_score(gt, preds)

We have to define a new brute-force function to work with `Mord`

In [34]:
# Here we create a brute-force search function which runs 5-fold cross-validation for each sublist
# We can define which objective function to use for optimization
def brute_search_mord(model, data, response, penalty, obj_fn):
    resp = data[response]
    preds = data.loc[:, data.columns != response]
    preds = sm.add_constant(preds)
    search_space = sublists(preds.columns.tolist())
    
    bestobj = -np.inf
    best_cols = None
    
    for cols in search_space:
        objs = []
        for train_index, test_index in skf.split(preds[cols], resp):
#             res = model(resp.iloc[train_index], preds.iloc[train_index][cols]).fit(maxiter=100, disp=False)
            res = model().fit(preds.iloc[train_index][cols], resp.iloc[train_index])
#             tmpobj = res.llf - penalty*len(cols)
            test_preds = res.predict(preds.iloc[test_index][cols])
            objs.append(obj_fn(resp.iloc[test_index], test_preds))
        tmpobj = sum(objs) / len(objs)
#         print('{0}: {1}'.format(cols, tmpobj))
#         print(objs)
        if tmpobj > bestobj:
            bestobj = tmpobj
            best_cols = cols
            
#     res = model(resp, preds[best_cols]).fit(maxiter=100)
    res = model().fit(preds[best_cols], resp)
    return bestobj, best_cols, res

In [35]:
print('Brute-force')
bf_obj, bf_cols, bf_res = brute_search_mord(m.LogisticIT, train_df[covariates + ['CDR_int']], 'CDR_int', 1, cdr_obj_fn)
preds = bf_res.predict(test_df[bf_cols])

Brute-force


In [36]:
print('auc: {0}:'.format(roc_auc_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])))
print('acc (converted): {0}:'.format(accuracy_score(test_df.Demented, [0 if i < 1 else 1 for i in preds])))
print('acc (no conversion): {0}:'.format(accuracy_score(test_df.CDR_int, preds)))
print()

auc: 0.7938596491228069:
acc (converted): 0.8153846153846154:
acc (no conversion): 0.7230769230769231:



In [37]:
bf_cols

['Age', 'Sex', 'EDUC', 'SES', 'eTIV', 'MMSE']

In [39]:
bf_res.coef_

array([-3.89595371e-03,  1.06228069e+00, -5.50926664e-02, -1.35987276e-01,
       -8.32141791e-04, -4.56375197e-01])