# Implementation of Logistic Regression on AMP-PD 

#### Authors: Maria Castanos and William Koehler

In [1]:
import os
import time

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import model_selection
from sklearn.feature_selection import SelectKBest, f_classif, RFECV

from dimensionality_reduction import SelectFeatures
from logistic_regression import RazorLogReg

## Import Datasets

In [2]:
# Load the genotype matrix and the latest participant labels from local disk.
# The directory can be overridden with the AMP_PD_DATA_DIR environment variable;
# when it is unset, the original hard-coded local path is used as a fallback.
# NOTE(review): the files were presumably downloaded from S3 beforehand — this
# cell itself only reads local files.
path = os.environ.get(
    "AMP_PD_DATA_DIR",
    "/Users/mdmcastanos/Documents/OccamzRazor/plink_files/",
)
numpy_file = os.path.join(path, "plink_numpy.npy")
tsv_file = os.path.join(path, "latest_labels.tsv")

# The label table gets its own name so that `y` is not bound to a DataFrame here
# and to a Series later in the split cell.
labels = pd.read_csv(tsv_file, sep='\t')
df = pd.DataFrame(np.load(numpy_file))

# `assign` aligns the label columns on the row index, so a row-count mismatch
# would silently yield NaN or dropped labels; fail loudly instead.
if len(df) != len(labels):
    raise ValueError(
        "Row count mismatch: %d rows in %s vs %d rows in %s"
        % (len(df), numpy_file, len(labels), tsv_file)
    )

df = df.assign(
    participant_id=labels["participant_id"],
    case_control_other_latest=labels["case_control_other_latest"],
)

In [3]:
# Keep only the first 100 rows so the example cells below run quickly
df = df.iloc[:100]

### Split data

In [4]:
# Separate the label column from the rest of the frame (X keeps every other
# column, including participant_id), then hold out 30% of the rows for testing
# with a fixed seed so the split is reproducible.
y = df['case_control_other_latest']
X = df.drop('case_control_other_latest', axis=1)
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

## Feature Selection

Since the dataset has *x* features and *n* rows, the feature space was reduced by choosing the most relevant features according to different [feature selection methods](logistic_regression). 

### Tree-based Feature Selection
Computes impurity-based feature importances to drop irrelevant features. 

In [5]:
# Select Features
# One SelectFeatures instance is used for both splits: it receives the training
# split first and the test split second.
# NOTE(review): presumably the selection is learned on the training split and
# re-applied to the test split, which would make the call order significant —
# confirm in dimensionality_reduction.SelectFeatures.
selected_features = SelectFeatures()
train_set_reduced = selected_features.get_reduced_dataset(X_train, y_train)
test_set_reduced = selected_features.get_test_set_reduced(X_test, y_test)

In [6]:
# Fit Model
# Wrap the reduced train/test sets in a RazorLogReg and obtain the logistic
# regression model from it.
# NOTE(review): the meaning of the `None` argument is not visible from this
# notebook — check RazorLogReg.get_logistic_regression.
classifier1 = RazorLogReg(train_set_reduced, test_set_reduced)
logistic_regression = classifier1.get_logistic_regression(None)

In [7]:
# Get Tree-based Predictions
# predict() returns a pair, unpacked here into predicted classes and predicted
# probabilities.
# NOTE(review): presumably computed on the test set handed to RazorLogReg when
# classifier1 was built — confirm in logistic_regression.RazorLogReg.predict.
log_reg_predicted_classes, log_reg_predicted_probs = classifier1.predict(logistic_regression)

### Univariate Feature Selection
This method chooses the best features based on univariate statistical tests. In this case, the ANOVA F-value is used to estimate the degree of linear dependency between each feature and the target. (For other types of relationships, mutual-information methods can be used, but there is probably not enough data to do that.) This method is not yet implemented in this notebook; see Next Steps.

### Recursive Feature Elimination with CV
Since this algorithm eliminates the least significant variables from the fitted model, as opposed to the previously seen methods, it requires that we create the model first and pass it as a parameter to the RFE algorithm. Cross-validation is then used to find the optimal number of features to keep in the model.

In [8]:
selected_features = SelectFeatures()

# Train set: get_data returns three pieces (bound to ID, features, labels).
# They are glued back together side by side in the order ID, label, features;
# every piece gets a fresh 0..n-1 index first so the rows line up.
ID, X_train_rfecv, y_train_rfecv = selected_features.get_data(X_train, y_train)
train_set_rfecv = pd.concat(
    [part.reset_index(drop=True) for part in (ID, y_train_rfecv, X_train_rfecv)],
    axis=1,
    ignore_index=False,
)

# Test set: same assembly for the held-out split (ID is rebound here).
ID, X_test_rfecv, y_test_rfecv = selected_features.get_data(X_test, y_test)
test_set_rfecv = pd.concat(
    [part.reset_index(drop=True) for part in (ID, y_test_rfecv, X_test_rfecv)],
    axis=1,
    ignore_index=False,
)


In [9]:
# Fit Model
# Only the first 50 feature columns plus the label and ID columns are kept,
# to keep this example small.
# NOTE(review): this cell calls `logistic_regression()` while the tree-based
# section calls `get_logistic_regression(None)` — confirm both exist on
# RazorLogReg and that the difference is intentional.
n = [*range(50), "case_control_other_latest", "participant_id"]
classifier = RazorLogReg(train_set_rfecv[n], test_set_rfecv[n])
logistic_regression_rfecv = classifier.logistic_regression()

In [10]:
# Drop Features
# Restrict the training features to the first 50 columns (example-sized run),
# then run recursive feature elimination, removing one feature per step and
# scoring with 3-fold cross-validation.
n = list(range(50))
X_train_rfecv = X_train_rfecv[n]
selector = RFECV(estimator=logistic_regression_rfecv, step=1, cv=3)
# NOTE(review): the name `logistic_regression_rfecv` is rebound from the base
# estimator to the object returned by selector.fit(), so re-running this cell
# without re-running the previous one would wrap the fitted selector itself.
logistic_regression_rfecv = selector.fit(X_train_rfecv, y_train_rfecv)

In [11]:
# Get RFECV predictions
# `logistic_regression_rfecv` is, at this point, the object returned by the
# RFECV selector's fit() in the previous cell; predict() returns a pair that is
# unpacked into predicted classes and predicted probabilities.
# NOTE(review): `classifier` must still be the RazorLogReg built in the RFECV
# "Fit Model" cell — the baseline section later rebinds the same name.
log_reg_rfecv_predicted_classes, log_reg_rfecv_predicted_probs = classifier.predict(logistic_regression_rfecv)

## Regularized Multinomial Logistic Regression 

### Training
A regularized multinomial logistic regression is trained to predict three classes (Control, Case, Other). Elastic Net penalization was used in order to compromise between ridge and lasso penalizations. 

The optimization problem is:
$$\max_{\beta_{0k}, \beta_{k}} \left\{ \sum_{i = 1}^{N} \log Pr(g_i|x_i) - \lambda \sum_{k = 1}^K\sum_{j = 1}^p \big(\alpha |\beta_{kj}| + (1 - \alpha)\beta_{kj}^2\big) \right\}$$
### Hyperparameter Optimization
The hyperparameter $\lambda$ in the optimization problem above is tuned using grid search. 

## Train Baseline Models

To evaluate the performance of the logistic regression, both a random classifier and a majority-class classifier were trained as baseline models to compare against. 

In [12]:
# Get data
# Train set: get_data returns three pieces (bound to ID, features, labels).
# They are concatenated side by side in the order ID, label, features, each
# with a fresh 0..n-1 index so the rows line up.
ID, X_train_bl, y_train_bl = selected_features.get_data(X_train, y_train)
train_set_bl = pd.concat(
    [part.reset_index(drop=True) for part in (ID, y_train_bl, X_train_bl)],
    axis=1,
    ignore_index=False,
)

# Test set: same assembly for the held-out split (ID is rebound here).
ID, X_test_bl, y_test_bl = selected_features.get_data(X_test, y_test)
test_set_bl = pd.concat(
    [part.reset_index(drop=True) for part in (ID, y_test_bl, X_test_bl)],
    axis=1,
    ignore_index=False,
)

In [13]:
# Build the two baseline models from a RazorLogReg wrapping the baseline
# train/test sets.
# NOTE(review): this rebinds `classifier`, which earlier held the RazorLogReg
# used for the RFECV model; re-running the RFECV prediction cell after this one
# would silently use the baseline wrapper instead.
classifier = RazorLogReg(train_set_bl, test_set_bl)
random_classifier = classifier.get_random_classifier()
majority_class_classifier = classifier.get_majority_class_classifier()

In [14]:
# Get baseline models predictions
# Each predict() call returns a pair, unpacked into predicted classes and
# predicted probabilities: `rc_*` for the random classifier, `mc_*` for the
# majority-class classifier.
rc_predicted_classes, rc_predicted_probs = classifier.predict(random_classifier)
mc_predicted_classes, mc_predicted_probs = classifier.predict(majority_class_classifier)

## Results

In [15]:
def _weighted_scores(y_true, y_pred):
    """Return [accuracy, precision, recall, F1] for one set of predictions.

    Precision, recall and F1 are support-weighted averages over the classes.
    The order matches the rows of the performance table.

    ``zero_division=0`` yields the same value scikit-learn already substitutes
    when a class is never predicted (or never present), but without emitting
    an UndefinedMetricWarning into the cell output.
    """
    return [
        metrics.accuracy_score(y_true, y_pred),
        metrics.precision_score(y_true, y_pred, average='weighted', zero_division=0),
        metrics.recall_score(y_true, y_pred, average='weighted', zero_division=0),
        metrics.f1_score(y_true, y_pred, average='weighted', zero_division=0),
    ]


# Start from the table that RazorLogReg produces for the tree-based logistic
# regression, then append one column per additional model.
performance_table = classifier1.get_performance_table(logistic_regression)
performance_table = pd.DataFrame(performance_table)

performance_table['Logistic Regression RFECV'] = _weighted_scores(
    y_test_rfecv, log_reg_rfecv_predicted_classes
)
performance_table['Random Classifier'] = _weighted_scores(
    y_test_bl, rc_predicted_classes
)
performance_table['Majority Class Classifier'] = _weighted_scores(
    y_test_bl, mc_predicted_classes
)

performance_table.round(3)

  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,Logistic Regression,Logistic Regression RFECV,Random Classifier,Majority Class Classifier
Accuracy,0.533,0.4,0.567,0.533
Precision,0.498,0.16,0.556,0.284
Recall,0.533,0.4,0.567,0.533
F1 Score,0.515,0.229,0.545,0.371


## Next Steps

- Add PCA to the pipeline. 
- Implement univariate feature selection (Deal with current float error).
- Literature review on different ways of training on Patient Data.