In [1]:
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm
import warnings


In [2]:
# Ignore warnings
warnings.filterwarnings("ignore")


In [4]:
# Load the data
df = pd.read_csv("./data/teratogenicity_data.txt")
df = df.set_index("ID")
df.head()

Unnamed: 0_level_0,Teratogenicity,LogP,Aromatic_rings,Molecular_weight,H_bond_acceptors,H_bond_donors,Polar_surface_area,Rotatable_bonds,C,F,...,SPST,TCCX,TCOT,TXCX,XCCX,double_bonds,triple_bonds,number_of_charges,net_charge,number_of_rings
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
CHEMBL1466,1,2.79,2,336.29,6,2,93.06,2,19,0,...,0,0,0,0,0,4,0,0,0,4
CHEMBL633,1,7.24,3,645.31,3,0,42.68,11,25,0,...,0,0,0,0,0,1,0,0,0,3
CHEMBL1201039,1,1.97,2,431.94,6,2,160.75,5,15,0,...,0,0,0,0,0,5,0,0,0,3
CHEMBL170365,-1,-0.18,0,44.05,1,0,17.07,0,2,0,...,0,0,0,0,0,1,0,0,0,0
CHEMBL223520,1,-7.14,0,484.5,15,11,282.61,6,18,0,...,0,0,0,0,0,0,0,0,0,3


In [5]:
# Obtain X and y matrices
X = df.drop("Teratogenicity", axis = 1).values
X = sm.add_constant(X)
Y = df.Teratogenicity.values
Y[Y==-1]=0


# Filter
Filtrar los valors de X que tengan poca varianza. Se puede utilizar la funcion VarianceThreshold de sklearn.feature_selection.
Y transformar los datos utilizando la función StandardScaler de sklearn.preprocessing

In [6]:
# Delete variables with low variance
from sklearn.feature_selection import VarianceThreshold
selector = VarianceThreshold(0.5)
X = selector.fit_transform(X)

from sklearn.preprocessing import StandardScaler
scale = StandardScaler()
X = scale.fit_transform(X)


# Split in train and test
Se puede utilizar train_test_split de la libreria sklearn.model_selection.
Queremos 50% de datos para train y para test.

In [7]:
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.5, stratify=Y)


# Contruir modelos

Se pide:
- Modelo de regresión logística utilizando lasso
- Modelo de regresión logística utilizando elastic-net

Para cada modelo calcular el AUROC y al accuracy y el número de varialbes utilizadas en el modelo.

In [8]:
# Lasso Regression
from sklearn.linear_model import LogisticRegressionCV
m1 = LogisticRegressionCV(Cs=10, cv=5, random_state=0, solver='liblinear',
                          max_iter=100, penalty='l1').fit(X_train, Y_train)

ypred = m1.predict(X_train)
from sklearn.metrics import roc_auc_score
auctrain = roc_auc_score(ypred, Y_train)


ypred = m1.predict(X_test)
auctest = roc_auc_score(ypred, Y_test)


print("AUC_ROC in training error - Lasso: ", auctrain)
print("AUC_ROC in test error - Lasso: ", auctest)

AUC_ROC in training error - Lasso:  0.881038756877683
AUC_ROC in test error - Lasso:  0.606475300400534


In [9]:
ypred = m1.predict(X_train)
from sklearn.metrics import accuracy_score
ACtrain = accuracy_score(ypred, Y_train)

ypred = m1.predict(X_test)
ACtest = accuracy_score(ypred, Y_test)

print("Accuracy in training error is: ", ACtrain)
print("Accuracy in test error is: ", ACtest)

betas = m1.coef_
n_features = sum(sum(abs(betas)>0))
print("Number of features with Lasso: ", n_features)


Accuracy in training error is:  0.8807692307692307
Accuracy in test error is:  0.6130268199233716
Number of features with Lasso:  116


In [10]:
# Elasticnet Regression
m = 11
l1 = np.linspace(0,1,m)

m2 = LogisticRegressionCV(Cs=10, cv=5, random_state=0, solver='saga',
                          max_iter=100, penalty='elasticnet',
                          l1_ratios=l1).fit(X_train, Y_train)

ypred = m2.predict(X_train)
auctrain = roc_auc_score(ypred, Y_train)


ypred = m2.predict(X_test)
auctest = roc_auc_score(ypred, Y_test)


print("AUC_ROC in training error - ElasticNEt: ", auctrain)
print("AUC_ROC in test error - ElasticNet: ", auctest)

AUC_ROC in training error - ElasticNEt:  0.7274195344693173
AUC_ROC in test error - ElasticNet:  0.7138026607538803


In [11]:
ypred = m2.predict(X_train)
from sklearn.metrics import accuracy_score
ACtrain2 = accuracy_score(ypred, Y_train)

ypred = m2.predict(X_test)
ACtest2 = accuracy_score(ypred, Y_test)

print("Accuracy in training error - ElasticNet: ", ACtrain2)
print("Accuracy in test error - ElasticNet: ", ACtest2)

betas = m2.coef_
n_features = sum(sum(abs(betas)>0))
print("Number of features with ElasticNEt: ", n_features)


Accuracy in training error - ElasticNet:  0.6576923076923077
Accuracy in test error - ElasticNet:  0.6513409961685823
Number of features with ElasticNEt:  17


# Construir Modelo Seleccionando mejores k features

- Utilizar la función SelectKBest de sklearn.feature_selection para seleccionar las mejores k features. 
- Con esas k variables realizar la regresión logística con cross-validaton con cv = 5, optimizando los valores de roc_auc.
- Calcular la media de los 5 aurocs obtenidos.
- Seleccionar la k con el valor más alto de media de aurocs y con este valor de k entrenar todo el train y evaluar el modelo con el test.

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.model_selection import cross_val_score


In [22]:
p=50
FS = np.zeros(p)
k = 1
logreg = LogisticRegression()


In [23]:
while (k <= p):
    X_new = SelectKBest(f_classif, k=k).fit_transform(X_train, Y_train) # Select the k best features
    X_new = sm.add_constant(X_new)
    logreg.fit(X_new,Y_train)
    scores = cross_val_score(logreg, X_new, Y_train,
                         scoring='roc_auc', cv=5)
    FS[k-1] = np.mean(scores)
    k = k +1


In [24]:
max_value = max(FS)
max_index = np.where(FS==max_value)
K = np.ndarray.item(max_index[0])+1
selector = SelectKBest(f_classif, k=K)
selector.fit(X_train, Y_train)
cols_idxs = selector.get_support(indices=True)

X_new_train = X_train[:, cols_idxs]
X_new_train = sm.add_constant(X_new_train)

X_new_test = X_test[:, cols_idxs]
X_new_test = sm.add_constant(X_new_test)

m3 = logreg.fit(X_new_train,Y_train)

from sklearn.metrics import roc_auc_score
ypred = m3.predict(X_new_train)
auctrain = roc_auc_score(ypred, Y_train)

ypred = m3.predict(X_new_test)
auctest = roc_auc_score(ypred, Y_test)


print("AUC_ROC in training error - Feature selection: ", auctrain)
print("AUC_ROC in test error - Feature selection: ", auctest)

ypred = m3.predict(X_new_train)
ACtrain = accuracy_score(ypred, Y_train)

ypred = m3.predict(X_new_test)
ACtest = accuracy_score(ypred, Y_test)

print("Accuracy in training error - Feature selection: ", ACtrain)
print("Accuracy in test error - Feature selection: ", ACtest)

betas = m3.coef_
n_features = sum(sum(abs(betas)>0))
print("Number of features with Feature Selection: ", n_features)

AUC_ROC in training error - Feature selection:  0.730625
AUC_ROC in test error - Feature selection:  0.6383277216610549
Accuracy in training error - Feature selection:  0.7307692307692307
Accuracy in test error - Feature selection:  0.6436781609195402
Number of features with Feature Selection:  49
