# <span style="color:#0b486b">This note summarizes approaches for disease condition prediction problems.</span>

## <span style="color:#0b486b">I. Naive approaches</span>

### <span style="color:#0b486b">I.1 Logistic regression</span>

$$
P(y=1|\boldsymbol{X})=\frac{1}{1+exp(\boldsymbol{W}^{T}\boldsymbol{X})}
$$

where $\boldsymbol{X}$ is training sample and $y$ is training label.

In [1]:
# Load dataset and do some statistics
import numpy as np
import pandas as pd

df = pd.read_csv('data/GSE97356.csv')

In [2]:
df.head()

Unnamed: 0,A1BG,ABHD17A,ABI2,ACER3,ADAM20,ADAM9,AGFG1,AKAP13,AKR7A2,AKT1,...,ZNF16,ZNF263,ZNF324B,ZNF428,ZNF439,ZNF48,ZNF747,ZNF81,ZSCAN18,condition
0,22,255,141,393,37,191,1109,4659,48,434,...,37,84,16,32,23,148,15,230,90,0
1,68,713,488,1075,117,577,3343,14783,150,1266,...,95,332,72,76,127,563,90,765,227,0
2,51,941,403,879,73,340,2267,10949,142,1136,...,88,291,65,95,76,598,71,594,231,0
3,37,447,349,770,67,339,1785,8909,88,792,...,74,203,45,56,87,284,49,411,140,0
4,26,278,187,587,39,301,1455,6900,71,626,...,41,114,33,37,51,188,30,361,146,0


In [3]:
# get features and labels
X = df.iloc[:,:-1].values
y = df.iloc[:,-1].values

In [4]:
print('#Samples: ',len(df))
print('#Features: ', X.shape[1])
print('#Classes: ', len(np.unique(y)))
for label in np.unique(y):
    print('#Samples of class %s: %d' % (label, len(y[np.where(y==label)])))

#Samples:  282
#Features:  448
#Classes:  2
#Samples of class 0: 201
#Samples of class 1: 81


In [5]:
# using simple classification method: logistic regression
# for less biased result we do 10-fold cross validation and get mean values
# regular CV is KFold: data will be randomly split into k folds

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from utils.evaluation import *

# init the model
logistic_reg = LogisticRegression()

# set 10-fold regular CV
kfold = KFold(10, True, 1)

scores = []
for train_idx, test_idx in kfold.split(X):
    # split data into train and test
    X_train, X_test = X[train_idx], X[test_idx] 
    y_train, y_test = y[train_idx], y[test_idx]
    
    # fit the model with train data
    logistic_reg.fit(X_train, y_train)
    # make prediction
    preds = logistic_reg.predict(X_test)
    scores.append(evaluate(preds, y_test))

mean_scores = np.mean(scores, axis=0)
std = np.std(scores, axis=0)

# print prediction results
print('AUC:\t%.3f (+-%.3f)' % (mean_scores[0], std[0]))
print('F1:\t%.3f (+-%.3f)' % (mean_scores[1], std[1]))
print('Acc:\t%.3f (+-%.3f)' % (mean_scores[2], std[2]))

AUC:	0.572 (+-0.070)
F1:	0.366 (+-0.123)
Acc:	0.666 (+-0.066)


In [6]:
# try with Stratified cross validation: 
# the entire dataset will be split such that each fold is a good representative of the whole
from sklearn.model_selection import StratifiedKFold

# init the model
logistic_reg = LogisticRegression()

# set 10-fold stratified CV
skfold = StratifiedKFold(10, random_state=None)

scores = []

for train_idx, test_idx in skfold.split(X, y):
    # split data into train and test
    X_train, X_test = X[train_idx], X[test_idx] 
    y_train, y_test = y[train_idx], y[test_idx]
    
    # fit the model with train data
    logistic_reg.fit(X_train, y_train)
    # make prediction
    preds = logistic_reg.predict(X_test)
    scores.append(evaluate(preds, y_test))

mean_scores = np.mean(scores, axis=0)
std = np.std(scores, axis=0)

# print prediction results
print('AUC:\t%.3f (+-%.3f)' % (mean_scores[0], std[0]))
print('F1:\t%.3f (+-%.3f)' % (mean_scores[1], std[1]))
print('Acc:\t%.3f (+-%.3f)' % (mean_scores[2], std[2]))

AUC:	0.542 (+-0.107)
F1:	0.317 (+-0.119)
Acc:	0.617 (+-0.114)
