# Elastic Net Models

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler


In [2]:
# Raw expression table (tab-separated); the first column becomes the row index.
# Downstream cells look rows up by gene ID and parse sample ages from the column labels.
geneexp = pd.read_csv(
    "gene_expression_original.csv",
    sep="\t",
    index_col=0,
)

In [5]:
# Per-gene summary table (tab-separated), indexed by gene ID.
# The row order matters later: models keep only the first `nfeature` genes.
# NOTE(review): the preview suggests rows are sorted by decreasing SEProp -- confirm.
selected_genes = pd.read_csv(
    "linear_model/topgenes.tsv",
    sep="\t",
    index_col=0,
)
selected_genes.head()

Unnamed: 0,TSS,SSwithin,SSbetween,SEProp
SSMa114640,2.976327,0.406906,2.56942,0.863286
SSMa019120,3.158486,0.508415,2.650071,0.839032
SSMd291000,2.447368,0.449332,1.998036,0.816402
SSMb152060,0.30543,0.056249,0.249181,0.815836
SSMb194180,8.265961,1.534264,6.731697,0.814388


In [6]:
import re
# extract the month numbers
def extract_number(mystring):
    """
    Parse the integer formed by the leading digits of a string.

    Used to pull the age (month number) out of a sample label.

    :param mystring: text that starts with one or more digits
    :return: the leading digits converted to int
    :raises IndexError: if the string does not start with a digit
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    numbers = re.findall(r"^\d+", mystring)
    return int(numbers[0])

In [25]:
# Sample ages: the leading integer of every column label, as an (n_samples, 1) column vector.
ages = np.array([extract_number(label) for label in geneexp.columns]).reshape(-1, 1)

# Gene IDs in the order they appear in the selection table.
top_genes = selected_genes.index.tolist()

# Expression of the selected genes, transposed so that rows are samples and columns are genes.
selected_geneexp_mat = np.array(geneexp.loc[top_genes, :]).T

# Natural-log transform of the expression values.
log_geneexp_mat = np.log(selected_geneexp_mat)

Fit Elastic Net Regression

In [200]:
from sklearn.model_selection import LeaveOneOut, GridSearchCV
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.pipeline import Pipeline

Set up the hyperparameters worth tuning: the number of top-ranked features to start from, the regularization strength (penalty), and the mixing ratio between the L1 and L2 penalties.

In [251]:
# Candidate numbers of leading features to feed into the model.
num_features = np.array([50, 100, 200, 300, 400, 500])

# Result containers keyed by feature count. Each placeholder list is later
# replaced by a (len(penalties), len(l1ratios)) matrix of values.
loo_mae = {count: [] for count in num_features}              # leave-one-out MAE
loo_instability = {count: [] for count in num_features}      # leave-one-out instability
agefold_mae = {count: [] for count in num_features}          # leave-one-age-out MAE
agefold_instability = {count: [] for count in num_features}  # leave-one-age-out instability

# Penalty grid: 0.05 doubled five times (0.05, 0.1, ..., 1.6).
penalties = 0.05 * np.power(2, np.arange(6))
# L1 proportions: 0.25, 0.5, 0.75.
l1ratios = np.arange(0.25, 1, 0.25)

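Define the cross-validation routine that fits the elastic net for one hyperparameter combination and returns its mean absolute error and instability.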

In [259]:
def enet_fit(X, Y, nfeature=50, penalty=0.2, l1ratio=0.5, folds = None):
    """
    Cross-validate a standardized elastic net on the first ``nfeature`` columns of X.

    For every fold, the rows in that fold are held out, a StandardScaler +
    ElasticNet pipeline is refit on the remaining rows, and the held-out rows
    are predicted.

    Note: the MAE is averaged over all rows of X. A row that belongs to no fold
    keeps an error of 0 and still counts in the mean, so ``folds`` should cover
    every sample.

    :param X: predictor matrix, nsample * ngene
    :param Y: age to be predicted, nsamples (1-D) or nsamples * 1
    :param nfeature: number of top features in X to be preserved (leading columns)
    :param penalty: penalty parameter (passed to ElasticNet as ``alpha``)
    :param l1ratio: proportion of L1 penalty (passed to ElasticNet as ``l1_ratio``)
    :param folds: a list of indices; each entry (a single row index or an array
        of row indices) is one held-out test set. ``None`` means leave-one-out.
    :return: mean absolute error and instability statistics
    :raises ValueError: if ``nfeature`` exceeds the number of columns of X
    """
    # Resolve the default first: everything below (starting with len(folds))
    # needs a concrete sequence, and len(None) raises TypeError.
    if folds is None:
        folds = np.arange(0, X.shape[0])

    # Fail early with a readable message instead of an obscure reshape error.
    if nfeature > X.shape[1]:
        raise ValueError(
            "nfeature=%d exceeds the %d columns available in X" % (nfeature, X.shape[1])
        )

    X_subset = X[:, 0:nfeature]
    model = Pipeline([
        ('scaler', StandardScaler()),
        ('elasticnet', ElasticNet(alpha=penalty,l1_ratio=l1ratio,max_iter=20000))
    ])
    absolute_error = np.zeros(X_subset.shape[0])
    coefficients = np.zeros((len(folds), nfeature))

    for j, fold in enumerate(folds):

        X_train = np.delete(X_subset, fold, axis=0)
        Y_train = np.delete(Y, fold, axis=0)
        # reshape keeps the test block 2-D even when the fold is a single row
        X_test = X_subset[fold, :].reshape(-1, nfeature)
        # atleast_1d: a scalar fold on a 1-D Y would otherwise yield a 0-d target
        Y_test = np.atleast_1d(Y[fold])
        model.fit(X_train, Y_train)
        coefficients[j, :] = model['elasticnet'].coef_
        y_test_pred = model.predict(X_test)
        # every sample in the fold is assigned the fold's mean absolute error
        absolute_error[fold] = mean_absolute_error(Y_test, y_test_pred)

    mae = np.mean(absolute_error)
    # fraction of folds in which each feature received a non-zero coefficient
    selection_prob = np.mean(coefficients !=0, axis=0)
    # p * (1 - p) is largest when a feature is selected in about half of the folds
    instability = np.mean(selection_prob * (1-selection_prob))

    return mae, instability


First, report the performance under leave-one-out cross-validation, where each sample is held out in turn.

In [252]:
# Leave-one-out evaluation over the full (feature count x penalty x L1 ratio) grid.
# `folds` is not passed, so enet_fit uses its default of one fold per sample.
for num_feature in num_features:
    print(num_feature)
    grid_shape = (len(penalties), len(l1ratios))
    mae_mat = np.zeros(grid_shape)
    instability_mat = np.zeros(grid_shape)
    for i, penalty in enumerate(penalties):
        for j, l1ratio in enumerate(l1ratios):
            scores = enet_fit(
                X=log_geneexp_mat,
                Y=ages,
                nfeature=num_feature,
                penalty=penalty,
                l1ratio=l1ratio,
            )
            mae_mat[i, j], instability_mat[i, j] = scores

    # rows follow `penalties`, columns follow `l1ratios`
    loo_mae[num_feature] = mae_mat
    loo_instability[num_feature] = instability_mat

50
100
200
300


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

400


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

500


  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = c

In [289]:
import pickle
# Persist the leave-one-out result dictionaries so the grid search need not be rerun.
for pkl_path, results in (
    ("linear_model/loo_mae.pkl", loo_mae),
    ("linear_model/loo_instability.pkl", loo_instability),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(results, f)

In [260]:
# One fold per distinct age: the row indices of every sample that has that age.
unique_ages = np.unique(ages)
age_folds = [np.nonzero(ages == value)[0] for value in unique_ages]


I also check the performance when all samples of a given age are held out together as the test set.

In [261]:
# Same hyperparameter grid as the leave-one-out run, but each test set is
# the group of samples sharing one age (`age_folds`).
for num_feature in num_features:
    print(num_feature)
    grid_shape = (len(penalties), len(l1ratios))
    mae_mat = np.zeros(grid_shape)
    instability_mat = np.zeros(grid_shape)
    for i, penalty in enumerate(penalties):
        for j, l1ratio in enumerate(l1ratios):
            scores = enet_fit(
                X=log_geneexp_mat,
                Y=ages,
                nfeature=num_feature,
                penalty=penalty,
                l1ratio=l1ratio,
                folds=age_folds,
            )
            mae_mat[i, j], instability_mat[i, j] = scores

    # rows follow `penalties`, columns follow `l1ratios`
    agefold_mae[num_feature] = mae_mat
    agefold_instability[num_feature] = instability_mat

50
100
200
300
400
500


In [290]:
import pickle
# Persist the age-fold result dictionaries so the grid search need not be rerun.
for pkl_path, results in (
    ("linear_model/age_mae.pkl", agefold_mae),
    ("linear_model/age_instability.pkl", agefold_instability),
):
    with open(pkl_path, 'wb') as f:
        pickle.dump(results, f)