# Implementation of Logistic Regression on AMP-PD 

#### Authors: Maria Castanos and William Koehler

In [1]:
import pandas as pd
import numpy as np
from logistic_regression import RazorLogReg
from dimensionality_reduction import SelectFeatures
from sklearn import model_selection
from sklearn.feature_selection import RFECV

## Import Datasets
Load data from the [folder in S3](https://s3.console.aws.amazon.com/s3/buckets/amp-pd-data?prefix=genomic-data%2Freduced%2F&region=us-west-2&showversions=false#). Save into a new folder named 'plink_files'.

In [2]:
path = "/Users/mdmcastanos/Documents/OccamzRazor/plink_files/"
npy_file = path + "plink_numpy.npy"
tsv_file = path + "latest_labels.tsv"
df_y = pd.read_csv(tsv_file, sep = '\t')
df_X = pd.DataFrame(np.load(npy_file))

In [3]:
# Construct dataset
df = df_X.assign(participant_id=df_y['participant_id'], case_control_other_latest = df_y['case_control_other_latest'])
df = df.set_index('participant_id')
df.reset_index(drop=True, inplace=True)

## Split Data

In [4]:
X = df.drop(columns=['case_control_other_latest'])
y = df['case_control_other_latest']
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
X_train.to_csv('X_train.csv', index=False)
y_train.to_csv('y_train.csv', index=False)

## Feature Selection
Since the dataset has *x* features and *n* rows, the feature space was reduced by choosing the most relevant features according to different [feature selection methods](logistic_regression). 

### Tree-based Feature Selection
Computes impurity-based feature importances to drop irrelevant features. 

In [5]:
select_data = SelectFeatures()
tree_based_train_set = select_data.get_reduced_dataset(X_train, y_train)
tree_based_test_set = select_data.get_test_set_reduced(X_test, y_test)

### Univariate Feature Selection
This method chooses the best features based on univariate statistical tests. In this case, the ANOVA F-value is used to estimate the degree of linear dependency between two random variables (For different types of relationships mutual information methods can be used but I suspect there's not enough data to do that).

In [6]:
univariate_selected = SelectFeatures(method='Univariate', k=25000)
univariate_train_set = univariate_selected.get_reduced_dataset(X_train, y_train)
univariate_test_set = univariate_selected.get_test_set_reduced(X_test, y_test)

  156420  161804  169729  190507  196506  216239  223863  229368  233078
  236292  291171  293399  295141  325964  332167  335432  359053  374831
  377355  384083  397029  430139  443653  462168  488037  488859  524597
  538363  551676  556984  557842  561958  576374  613765  620423  625878
  641807  646124  661350  755655  768836  781153  805117  805199  805361
  805368  805409  806665  806748  806755  806800  806821  814363  858116
  859136  877688  933716  942316  948423  952829  970808  992956 1034116
 1058003 1098113 1103212 1127154 1144896 1182493 1210916 1223929 1252852
 1307843 1312982 1315644 1317855 1325158 1333963 1390823 1410100 1442753
 1447028 1473796 1482825 1514238 1543858 1603122 1628805 1659803 1665699
 1684342 1704470 1719832 1750311 1757599 1780999 1837354 1890305 1895694
 1900670 1966531] are constant.
  f = msb / msw


### Recursive Feature Elimination with CV
Since this algorithm eliminates the less significant variables from the fitted variables, as opposed to the previous seen methods, it requires the model to be fitted first and passed in as a parameter. The cross validation is done to find the optimal value of features to be kept in the model.

In [10]:
# Construct train and test sets
complete_train_set = pd.concat([X_train, y_train], axis=1)
complete_test_set = pd.concat([X_test, y_test], axis=1)

In [None]:
# Fit Model
rfecv = RazorLogReg(complete_train_set, complete_test_set)
rfecv_log_reg = rfecv.logistic_regression()

In [None]:
# Drop Features
selector = RFECV(rfecv_log_reg, step=1000, cv=3)

## Regularized Multinomial Logistic Regression 
### Training
A regularized multinomial logistic regression is trained to predict three classes (Control, Case, Other). Elastic Net penalization was used in order to find a compromise between ridge and lasso penalizations. 

The optimization problem is:
$$\max_{\beta_{0k}, \beta_{k}} \left\{ \sum_{i = 1}^{N} \log Pr(g_i|x_i) - \lambda \sum_{k = 1}^K\sum_{j = 1}^p \big(\alpha |\beta_{kj}| + (1 - \alpha)\beta_{kj}^2\big) \right\}$$
### Hyperparameter Optimization
Hyperparameters $\lambda$ and $\alpha$ in the problem above, are optimized by implementing grid search. Since the logistic regression classifies more than one class, the optimal hyperparametrs are those that minimize the log-loss function.

In [7]:
# Training logistic regression with features selected by the tree based method
tree_based = RazorLogReg(tree_based_train_set, tree_based_test_set)
tree_based_classifier = tree_based.get_logistic_regression(None)





In [8]:
# Training logistic regression with features selected by the univatiate method
univariate = RazorLogReg(univariate_train_set, univariate_test_set)
univariate_classifier = univariate.get_logistic_regression(None)





In [None]:
# Training logistic regression with features kept by the RFECV method 
rfecv_classifier = selector.fit(X_train, y_train)

## Train Baseline Classifiers
To compare the performance of the logistic regression against baseline models, both a random and a majority-class classifers were trained. 
To evaluate the performance of the logistic regression, both a random and a majority-class classifers were implemented as baseline models to compare against.

In [11]:
baseline = RazorLogReg(complete_train_set, complete_test_set)
random_classifier = baseline.get_random_classifier()
majority_class_classifier = baseline.get_majority_class_classifier()

## Results
### Predictions

In [13]:
tree_based_predicted_classes, tree_based_predicted_probs = tree_based.predict(tree_based_classifier)
univariate_predicted_classes, univariate_predicted_probs = univariate.predict(univariate_classifier)
#rfecv_predicted_classes, rfecv_predicted_probs = rfecv.predict(rfecv_classifier)
rc_predicted_classes, rc_predicted_probs = baseline.predict(random_classifier)
mc_predicted_classes, mc_predicted_probs = baseline.predict(majority_class_classifier)

### Performance Table

In [16]:
classifier = RazorLogReg()
average = 'weighted'
tree_based = classifier.get_metrics(y_test, tree_based_predicted_classes, average)
univariate = classifier.get_metrics(y_test, univariate_predicted_classes, average)
#rfecv = classifier.get_metrics(y_test, rfecv_predicted_classes, average)
random = classifier.get_metrics(y_test, rc_predicted_classes, average)
majority_class = classifier.get_metrics(y_test, mc_predicted_classes, average) 

performance_table = pd.DataFrame({"Tree Based Method": tree_based, 
                                  "Univariate Method": univariate, 
                                  #"RFECV Method": rfecv,
                                  "Random": random,
                                  "Majority Class": majority_class
                                 })

performance_table.index = ['Accuracy', 'Precision', 'Recall', 'F1-Score'] 
performance_table.round(3)

Unnamed: 0,Tree Based Method,Univariate Method,Random,Majority Class
Accuracy,0.659933,0.680135,0.312009,0.584736
Precision,0.643497,0.62521,0.46012,0.341916
Recall,"(0.6599326599326599,)","(0.6801346801346801,)","(0.31200897867564537,)","(0.5847362514029181,)"
F1-Score,0.650123,0.650969,0.356754,0.431512


## Next Steps
- Add PCA to the pipeline and compare. 
- Run with two classes (merge 'other' with 'case')
- Add Balanced Accuracy 