In [39]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn import preprocessing

In [5]:
df = pd.read_csv('lg_data.tsv', sep='\t')
df = df.rename(columns={'Subject ID': 'ID'})

In [43]:
min_max_scaler = preprocessing.MinMaxScaler()
df['eTIV'] = min_max_scaler.fit_transform(df[['eTIV']])

In [44]:
df

Unnamed: 0,ID,Demented,Visit,Sex,Age,EDUC,SES,MMSE,eTIV,nWBV
0,1,0,1,1,87,14,2.0,27.0,0.981069,0.696
1,1,0,2,1,88,14,2.0,30.0,1.000000,0.681
2,4,0,1,0,88,18,3.0,28.0,0.121381,0.710
3,4,0,2,0,90,18,3.0,27.0,0.104677,0.718
4,5,0,1,1,80,12,4.0,28.0,0.649220,0.712
...,...,...,...,...,...,...,...,...,...,...
349,185,1,2,1,82,16,1.0,28.0,0.653675,0.694
350,185,1,3,1,86,16,1.0,26.0,0.648107,0.675
351,186,0,1,0,61,13,2.0,30.0,0.237194,0.801
352,186,0,2,0,63,13,2.0,30.0,0.246102,0.796


In [45]:
random = {"Subject ID": '0 + ID', 'Visit': '0 + Visit'}
res = sm.BinomialBayesMixedGLM.from_formula('Demented ~ MMSE + nWBV + SES + EDUC + Age + Sex + eTIV', random, df).fit_vb()
print(res.summary())

               Binomial Mixed GLM Results
           Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
---------------------------------------------------------
Intercept     M     6.8377   0.1377                      
MMSE          M    -0.5465   0.0049                      
nWBV          M     2.6648   0.1877                      
SES           M     0.2247   0.0507                      
EDUC          M     0.0235   0.0092                      
Age           M     0.0563   0.0018                      
Sex           M     1.5465   0.1984                      
eTIV          M    -1.2227   0.2828                      
Subject ID    V    -0.9976   0.9976 0.369   0.050   2.712
Visit         V    -0.8857   0.9022 0.412   0.068   2.506
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [68]:
test_df = pd.read_csv('cs_data.tsv', sep='\t')
test_df = test_df.rename(columns={'Educ': 'EDUC'})
test_df['Visit'] = 1

In [69]:
test_df = test_df.reset_index()
test_df = test_df.rename(columns={'index': 'Subject ID'})

In [71]:
test_df = sm.add_constant(test_df[['Demented', 'Visit', 'Sex', 'Age', 'EDUC', 'SES', 'MMSE', 'eTIV', 'nWBV', 'Subject ID']])

In [72]:
test_df

Unnamed: 0,Demented,Visit,Sex,Age,EDUC,SES,MMSE,eTIV,nWBV,Subject ID
0,0,1,0,74,2.0,3.0,29.0,1344,0.743,0
1,0,1,0,55,4.0,1.0,29.0,1147,0.810,1
2,1,1,0,73,4.0,3.0,27.0,1454,0.708,2
3,0,1,1,74,5.0,2.0,30.0,1636,0.689,3
4,0,1,0,52,3.0,2.0,30.0,1321,0.827,4
...,...,...,...,...,...,...,...,...,...,...
211,1,1,0,70,1.0,4.0,29.0,1295,0.748,211
212,1,1,0,73,3.0,2.0,23.0,1536,0.730,212
213,0,1,0,61,2.0,4.0,28.0,1354,0.825,213
214,0,1,1,61,5.0,2.0,30.0,1637,0.780,214


In [73]:
res.predict(test_df)

  t = np.exp(-z)


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])