In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn import preprocessing

In [2]:
from math import sqrt

# Lasso Regression

In [266]:
# Load the raw dataset from the CSV in the working directory and preview its first rows.
data = pd.read_csv('ML_data.csv')
data.head()

Unnamed: 0,Type,AB,Site,Impurity,∆H (A-rich),∆H (B-rich),(+2/+1),(+1/0),(0/-1),(-1/-2),...,Therm_cond,Elec_cond,Heat_fusion,Heat_vap,Electronegativity,At_num,Period,Group,Valence,Ox_state
0,IV-IV,SiC,M_i_A,In,15.963,15.963,3.891,4.66,5.109,5.629,...,81.6,3.4,3.26,226.34,1.78,49,5,13,3,3
1,IV-IV,SiC,M_i_B,La,19.972,19.554,3.863,4.539,4.991,5.556,...,13.5,1.9,11.3,399.57,1.1,57,6,3,3,3
2,IV-IV,SiC,M_i_neut,Zn,7.157,7.157,3.206,4.244,4.734,5.363,...,116.0,16.9,7.38,115.3,1.65,30,4,12,2,2
3,IV-IV,SiC,M_i_B,Al,6.951,7.11,3.503,4.242,4.65,5.319,...,237.0,37.7,10.7,290.8,1.61,13,3,13,3,3
4,IV-IV,SiC,M_B,I,8.783,9.201,3.614,4.221,5.058,5.608,...,0.45,0.0,7.76,20.9,2.66,53,5,17,7,1


In [267]:
# Balance the three semiconductor classes by undersampling every class to the
# size of the smallest one, so that no 'Type' dominates model fitting.
data_1 = data[data['Type']=='II-VI']
data_2 = data[data['Type']=='III-V']
data_3 = data[data['Type']=='IV-IV']

# Use the explicit minimum class size.  Unpacking value_counts() positionally
# only says "third most frequent", not which class that count belongs to, and
# silently leaves the data unbalanced if III-V is not the smallest class.
n_per_class = min(len(data_1), len(data_2), len(data_3))

data_1_under = data_1.sample(n_per_class, random_state=0)
# III-V is kept untouched (original row order) when it already has the minimum
# size; it is only sampled down if it turns out to be larger.
data_2_under = data_2 if len(data_2) <= n_per_class else data_2.sample(n_per_class, random_state=0)
data_3_under = data_3.sample(n_per_class, random_state=0)

# NOTE(review): this rebinds `data`, so re-running the cell resamples the
# already-balanced frame; re-run the loading cell first for a clean result.
data = pd.concat([data_1_under, data_2_under, data_3_under], axis=0)

In [268]:
# Independent copy of the balanced frame.  Further down, `data_no` is scaled
# with `normalize` while `data` is scaled with `standardize`, so the two
# scalings can be compared.
data_no = data.copy()
data_no.head()

Unnamed: 0,Type,AB,Site,Impurity,∆H (A-rich),∆H (B-rich),(+2/+1),(+1/0),(0/-1),(-1/-2),...,Therm_cond,Elec_cond,Heat_fusion,Heat_vap,Electronegativity,At_num,Period,Group,Valence,Ox_state
450,II-VI,CdTe,M_A,Mo,3.513,3.375,0.258,0.698,1.185,1.873,...,138.0,17.3,36.0,590.4,2.16,42,5,6,6,6
504,II-VI,CdSe,M_B,Ge,2.157,4.241,-0.236,0.459,0.812,1.213,...,59.9,0.0,31.8,334.3,2.01,32,4,14,4,4
321,II-VI,CdTe,M_i_B,Nb,4.417,6.057,0.838,1.188,1.633,2.008,...,53.7,6.6,26.9,690.1,1.6,41,5,5,5,5
564,II-VI,CdTe,M_A,Tc,3.311,3.315,0.135,0.214,1.302,1.742,...,50.6,0.0,23.0,502.0,1.9,43,5,7,7,7
167,II-VI,CdSe,M_i_A,Bi,3.953,4.913,1.182,1.564,1.809,2.247,...,7.87,0.9,11.0,179.0,2.02,83,6,15,5,3


In [269]:
# Positional column slices: columns 4-9 are used as the regression targets
# (called `predictors` throughout this notebook) and every column from
# position 14 onwards as an input feature.
# NOTE(review): these offsets depend on the column order of the CSV — confirm
# them if the file layout ever changes.
predictors = list(data.columns[4:10])
features = list(data.columns[14:])
feature_no = list(data_no.columns[14:])
# Number of raw input feature columns.
len(feature_no)

41

In [270]:
# Start with the raw features only.  This binds `all_features` to the same
# list object as `features` (no copy is made).
all_features = features

In [338]:
# Expand every raw feature with a squared term and, when the column in `data`
# has no negative values, a square-root term.  The derived columns are added
# to both frames; `all_features` collects the column names in creation order.
all_features = []
for base_name in features:
    squared_name = f"{base_name}_square"
    derived_names = [base_name, squared_name]

    data[squared_name] = data[base_name] ** 2
    data_no[squared_name] = data_no[base_name] ** 2

    # The square root is only defined for non-negative columns.
    if data[base_name].min() >= 0:
        root_name = f"{base_name}_sqrt"
        data_no[root_name] = data_no[base_name].map(sqrt)
        data[root_name] = data[base_name].map(sqrt)
        derived_names.append(root_name)

    all_features += derived_names

In [339]:
# Sanity check: both frames should have gained the same derived columns.
len(data_no.columns), len(data.columns)

(96, 96)

We first need a little more pre-processing to prepare the data for model training. Regularized models such as Ridge and LASSO penalize the size of the coefficients, so they expect the input features to be standardized (mean 0, std. dev. 1) and the target values to be centered (mean 0). If we skip this step, the penalty treats features on different scales unevenly and we may get unpredictable results, because we violate the assumptions of the models.

In [340]:
def standardize(v):
    """
    Z-score a single DataFrame column.

    The column is shifted by its mean and divided by its standard deviation
    (as returned by ``v.std()``).  When the standard deviation is exactly 0
    (constant column) a NumPy array of zeros with the same length is returned
    instead, which avoids a division by zero.
    """
    spread = v.std()
    if spread != 0:
        return (v - v.mean()) / spread
    return np.zeros(len(v))
def normalize(v):
    """
    Min-max scale a single DataFrame column to the range [0, 1].

    Each value is mapped to ``(value - min) / (max - min)``, so the smallest
    entry becomes 0 and the largest becomes 1.  A constant column
    (``max == min``) cannot be rescaled; in that case a NumPy array of ones
    with the same length is returned.
    """
    max_ = v.max()
    min_ = v.min()
    if max_ == min_:
        return np.ones(len(v))
    # Subtract the minimum first: dividing by the range alone does not map
    # the data onto [0, 1] unless the minimum happens to be 0.
    return (v - min_) / (max_ - min_)
    
# Scale every feature column in place: `standardize` for `data`,
# `normalize` for `data_no`.
# NOTE(review): the scaling statistics are computed over all rows, i.e. before
# the train/test split performed further down.
for column in all_features:
    data[column] = standardize(data[column])
for column in all_features:
    data_no[column] = normalize(data_no[column])

# Center each target column of `data` on zero (the targets in `data_no` are
# left unchanged).
for target in predictors:
    data[target] = data[target] - data[target].mean()

# Preview

In [341]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

# Stratified 80/20 split on 'Type'.  The splitter generates five shuffles, but
# only the last one is kept as the train/test partition.
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=100)
train_index, test_index = list(sss.split(data, data['Type']))[-1]
train, test = data.iloc[train_index], data.iloc[test_index]

# Class balance of the training partition.
np.unique(train['Type'], return_counts=True)

(array(['II-VI', 'III-V', 'IV-IV'], dtype=object),
 array([124, 124, 124], dtype=int64))

In [342]:
# Same stratified split for the `normalize`-scaled frame, again keeping only
# the last of the shuffles produced by `sss`.
train_index, test_index = list(sss.split(data_no, data_no['Type']))[-1]
train_no, test_no = data_no.iloc[train_index], data_no.iloc[test_index]

# Class balance of the training partition.
np.unique(train_no['Type'], return_counts=True)

(array(['II-VI', 'III-V', 'IV-IV'], dtype=object),
 array([124, 124, 124], dtype=int64))

In [343]:
# Design matrices and the six target series for both scaled training sets.
target_columns = ['∆H (A-rich)', '∆H (B-rich)', '(+2/+1)', '(+1/0)', '(0/-1)', '(-1/-2)']

# Standardized training data.
X = train[all_features]
y, y2, y3, y4, y5, y6 = (train[name] for name in target_columns)

# Normalized training data.
X_no = train_no[all_features]
y_no, y2_no, y3_no, y4_no, y5_no, y6_no = (train_no[name] for name in target_columns)

**Simple Regression Model, create a baseline for advanced regression models**

In [344]:
# Baseline: plain linear regression on the standardized vs. the normalized
# training set, scored with 5-fold cross-validated RMSE for every target.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score

reg_LR = LinearRegression()
cv_options = dict(scoring="neg_mean_squared_error", cv=5)

# Standardized LinearRegression
MSEs, MSEs_2, MSEs_3, MSEs_4, MSEs_5, MSEs_6 = [
    cross_val_score(reg_LR, X, target, **cv_options)
    for target in (y, y2, y3, y4, y5, y6)
]
# The scorer returns negated MSE per fold; negate, take the root per fold,
# then average over the folds.
for rank, fold_scores in enumerate((MSEs, MSEs_2, MSEs_3, MSEs_4, MSEs_5, MSEs_6), start=1):
    print(f"RMSE{rank} for Standardized LinearRegression is", np.mean((-fold_scores)**0.5))

# Normalized LinearRegression
MSEs_no, MSEs_no_2, MSEs_no_3, MSEs_no_4, MSEs_no_5, MSEs_no_6 = [
    cross_val_score(reg_LR, X_no, target, **cv_options)
    for target in (y_no, y2_no, y3_no, y4_no, y5_no, y6_no)
]
for rank, fold_scores in enumerate((MSEs_no, MSEs_no_2, MSEs_no_3, MSEs_no_4, MSEs_no_5, MSEs_no_6), start=1):
    print(f"RMSE{rank} for Normalized LinearRegression is", np.mean((-fold_scores)**0.5))

RMSE1 for Standardized LinearRegression is 2.5934793876038236
RMSE2 for Standardized LinearRegression is 2.9859413628345823
RMSE3 for Standardized LinearRegression is 1.3414735169182084
RMSE4 for Standardized LinearRegression is 1.2991365793815175
RMSE5 for Standardized LinearRegression is 1.2430546763642127
RMSE6 for Standardized LinearRegression is 1.173441626280955
RMSE1 for Normalized LinearRegression is 2.5964968183839927
RMSE2 for Normalized LinearRegression is 3.0069013411372234
RMSE3 for Normalized LinearRegression is 1.333059756310052
RMSE4 for Normalized LinearRegression is 1.2836660772436372
RMSE5 for Normalized LinearRegression is 1.2394681004689452
RMSE6 for Normalized LinearRegression is 1.1651130483958834


In [345]:
from sklearn.linear_model import Lasso

In [346]:
# GridSearchCV is first used in this cell, but the notebook only imported it
# much further down (Elastic Net section), so a fresh "Restart & Run All"
# failed here with a NameError.  Import it where it is first needed.
from sklearn.model_selection import GridSearchCV

# Lasso on the normalized training set: for each of the six targets, run a
# 5-fold grid search over the l1 penalty strength `alpha` (1e-7 ... 1e7).
reg = Lasso(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15)}

# One independent search per target; GridSearchCV.fit returns the fitted
# search object, so the list holds the six fitted searches in target order.
Lasso_no_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X_no, target)
    for target in (y_no, y2_no, y3_no, y4_no, y5_no, y6_no)
]
# Keep the individual names used elsewhere in the notebook.
clf_no, clf_no_2, clf_no_3, clf_no_4, clf_no_5, clf_no_6 = Lasso_no_model

In [347]:
# Lasso on the standardized training set: for each of the six targets, run a
# 5-fold grid search over the l1 penalty strength `alpha` (1e-7 ... 1e7).
reg = Lasso(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15)}

# One independent search per target, fitted in target order.
Lasso_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X, target)
    for target in (y, y2, y3, y4, y5, y6)
]
# Keep the individual names used elsewhere in the notebook.
clf, clf_2, clf_3, clf_4, clf_5, clf_6 = Lasso_model

In [348]:
def print_coefficients(model, features):
    """
    Print every coefficient of a fitted model next to its feature name.

    ``model`` must expose a ``coef_`` sequence and ``features`` is the matching
    sequence of names.  One ``(coefficient, feature)`` tuple is printed per
    line; pairing stops at the shorter of the two sequences.
    """
    pairs = [(weight, name) for weight, name in zip(model.coef_, features)]
    print(*pairs, sep="\n")

In [349]:
# Summarize the six Lasso searches fitted on the normalized data: one row per
# target with the features that kept a non-zero coefficient and the errors.
Lasso_no_data = []
for idx, search in enumerate(Lasso_no_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features whose coefficient was not shrunk to exactly zero.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test_no[all_features])
    Lasso_no_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test_no[target_name], holdout_pred)**0.5,
        'Model': fitted,
        'l1 penalty': search.best_params_.values()
    })
Lasso_no_df = pd.DataFrame(Lasso_no_data)
Lasso_no_df
#Lasso_no_df.to_csv(r'./normalized_all_feature.csv', index = False)

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,l1 penalty
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",55,2.488111,2.307819,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",55,2.947878,2.977163,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)
2,(+2/+1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",47,1.232584,0.993623,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)
3,(+1/0),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",44,1.212771,1.086696,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)
4,(0/-1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",44,1.154134,0.998769,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)
5,(-1/-2),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",40,1.09587,0.962392,"Lasso(alpha=0.001, copy_X=True, fit_intercept=...",(0.001)


In [350]:
# Summarize the six Lasso searches fitted on the standardized data: one row per
# target with the features that kept a non-zero coefficient and the errors.
Lasso_data = []
for idx, search in enumerate(Lasso_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features whose coefficient was not shrunk to exactly zero.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test[all_features])
    Lasso_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test[target_name], holdout_pred)**0.5,
        'Model': fitted,
        'l1 penalty': search.best_params_.values()
    })
Lasso_df = pd.DataFrame(Lasso_data)
Lasso_df
#Lasso_df.to_csv(r'./standardized_all_feature.csv', index = False)

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,l1 penalty
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",56,2.458576,2.329445,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_latt_const,...",57,2.947753,2.971056,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)
2,(+2/+1),"[Z_B_square, PBE_delta_H, PBE_delta_H_square, ...",42,1.192281,0.960712,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)
3,(+1/0),"[PBE_delta_H, PBE_delta_H_square, PBE_latt_con...",44,1.166847,1.062388,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)
4,(0/-1),"[PBE_delta_H, PBE_delta_H_square, PBE_latt_con...",38,1.112226,0.95004,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)
5,(-1/-2),"[Z_B_square, PBE_delta_H, PBE_delta_H_square, ...",38,1.057042,0.904971,"Lasso(alpha=0.01, copy_X=True, fit_intercept=T...",(0.01)


In [61]:
#Validation of smallest MSE
from sklearn.linear_model import LassoCV

# Sweep 100 evenly spaced thresholds on the coefficient magnitudes of the
# (+2/+1) Lasso model.  For each threshold, keep the features whose |coef| is
# at least the threshold, refit a Lasso with a fixed alpha on `data`, and
# record the RMSE of its predictions on those same rows.
coef_magnitude = abs(clf_3.best_estimator_.coef_)
thresholds = np.linspace(0, coef_magnitude.max(), 100)
y_temp = data['(+2/+1)']

# Column 0: threshold, column 1: RMSE.
storege = np.zeros((100,2))
point = 0
for limit in thresholds:
    select = coef_magnitude >= limit
    feature2 = [all_features[j] for j in np.flatnonzero(select)]
    X_temp = data[feature2]
    reg_temp = LassoCV(cv = 5, random_state = 0, n_alphas=1, alphas=[0.0001]).fit(X_temp, y_temp)
    temp = reg_temp.predict(X_temp)
    MSEs_Lasso = mean_squared_error(temp, y_temp)
    RMSE_Lasso = np.mean((MSEs_Lasso)**0.5)
    storege[point] = (limit, RMSE_Lasso)
    point += 1



In [63]:
#Select feature based on threshold
# Keep the features of the third Lasso model (index 2 in `Lasso_df`) whose
# coefficient magnitude is at least 0.04, refit on the training partition and
# report the test RMSE together with the number of kept features.
target_name = predictors[2]
select = abs(Lasso_df.Model[2].coef_) >= 0.04
feature2 = [all_features[j] for j in np.flatnonzero(select)]

X_temp = train[feature2]
y_temp = train[target_name]
clf_temp = LassoCV(cv = 3, random_state = 0, n_alphas=1, alphas=[0.0001]).fit(X_temp, y_temp)

# In-sample error of the refitted model.
temp = clf_temp.predict(X_temp)
MSEs_Lasso = mean_squared_error(temp, y_temp)
RMSE_Lasso = np.mean((MSEs_Lasso)**0.5)

# Held-out error.
test_1 = clf_temp.predict(test[feature2])
temp_x = test[target_name]
mean_squared_error(temp_x, test_1)**0.5, len(feature2)

(1.0238041026846507, 12)

In [48]:
#Top n feature based on coefficient
import heapq

# Indices of the 42 largest coefficient magnitudes of the fourth Lasso model
# (index 3 in `Lasso_df`), mapped back to their feature names.
a = abs((Lasso_df.Model[3]).coef_)
num = heapq.nlargest(42, range(len(a)), key=lambda position: a[position])
list(map(all_features.__getitem__, num))

['PBE_latt_const_sqrt',
 'PBE_latt_const_square',
 'CM8_sqrt',
 'Eps_ion',
 'CM4_sqrt',
 'PBE_latt_const',
 'PBE_gap',
 'CM1',
 'CM2',
 'Eps_ion_square',
 'CM3',
 'PBE_gap_sqrt',
 'Eps_ion_sqrt',
 'CM8',
 'CM4',
 'CM6_sqrt',
 'CM1_sqrt',
 'Eps_elec',
 'At_vol',
 'CM7_sqrt',
 'At_wt',
 'CM7',
 'ICSD_vol_sqrt',
 'At_num',
 'Eps_elec_square',
 'CM3_sqrt',
 'CM2_sqrt',
 'At_vol_square',
 'PBE_delta_H_square',
 'Sp_heat_cap',
 'Mend_num_sqrt',
 'PBE_delta_H',
 'Z_B_sqrt',
 'CM1_square',
 'Mend_num_square',
 'Density_sqrt',
 'Ox_state_square',
 'Period',
 'CM2_square',
 'CM3_square',
 'Ox_state_sqrt',
 'Cov_rad_sqrt']

# Ridge Regression

In [351]:
from sklearn.linear_model import Ridge

In [352]:
# Ridge on the normalized training set: for each of the six targets, run a
# 5-fold grid search over the l2 penalty strength `alpha` (1e-7 ... 1e7).
reg = Ridge(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15)}

# One independent search per target, fitted in target order.
Ridge_no_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X_no, target)
    for target in (y_no, y2_no, y3_no, y4_no, y5_no, y6_no)
]
# Keep the individual names used elsewhere in the notebook.
rig_no, rig_no_2, rig_no_3, rig_no_4, rig_no_5, rig_no_6 = Ridge_no_model

In [353]:
# Ridge on the standardized training set: for each of the six targets, run a
# 5-fold grid search over the l2 penalty strength `alpha` (1e-7 ... 1e7).
reg = Ridge(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15)}

# One independent search per target, fitted in target order.
Ridge_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X, target)
    for target in (y, y2, y3, y4, y5, y6)
]
# Keep the individual names used elsewhere in the notebook.
rig, rig_2, rig_3, rig_4, rig_5, rig_6 = Ridge_model

In [354]:
# Summarize the six Ridge searches fitted on the normalized data: one row per
# target with the features that have a non-zero coefficient and the errors.
Ridge_no_data = []
for idx, search in enumerate(Ridge_no_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features with a non-zero coefficient.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test_no[all_features])
    Ridge_no_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test_no[target_name], holdout_pred)**0.5,
        'Model': fitted,
        'l2 penalty': search.best_params_.values()
    })
Ridge_no_df = pd.DataFrame(Ridge_no_data)
Ridge_no_df

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,l2 penalty
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.493707,2.323938,"Ridge(alpha=0.1, copy_X=True, fit_intercept=Tr...",(0.1)
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.92038,2.977708,"Ridge(alpha=0.1, copy_X=True, fit_intercept=Tr...",(0.1)
2,(+2/+1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.237515,1.054229,"Ridge(alpha=1.0, copy_X=True, fit_intercept=Tr...",(1.0)
3,(+1/0),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.245851,1.090824,"Ridge(alpha=0.1, copy_X=True, fit_intercept=Tr...",(0.1)
4,(0/-1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.208371,1.025943,"Ridge(alpha=0.1, copy_X=True, fit_intercept=Tr...",(0.1)
5,(-1/-2),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.151594,1.002613,"Ridge(alpha=0.1, copy_X=True, fit_intercept=Tr...",(0.1)


In [355]:
# Summarize the six Ridge searches fitted on the standardized data: one row per
# target with the features that have a non-zero coefficient and the errors.
Ridge_data = []
for idx, search in enumerate(Ridge_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features with a non-zero coefficient.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test[all_features])
    Ridge_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test[target_name], holdout_pred)**0.5,
        'Model': fitted,
        'l2 penalty': search.best_params_.values()
    })
Ridge_df = pd.DataFrame(Ridge_data)
Ridge_df

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,l2 penalty
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.446211,2.313514,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.901861,3.00304,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)
2,(+2/+1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.218103,0.957563,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)
3,(+1/0),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.200635,1.049866,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)
4,(0/-1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.161645,0.956235,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)
5,(-1/-2),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,1.096198,0.913508,"Ridge(alpha=10.0, copy_X=True, fit_intercept=T...",(10.0)


# Elastic Net

In [356]:
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import GridSearchCV
import warnings
# Silence every warning from this point on.
# NOTE(review): this also hides any warnings raised by the ElasticNet grid
# searches below (the grid includes l1_ratio = 0) — confirm that suppressing
# them is intended.
warnings.filterwarnings('ignore')

In [357]:
# Elastic Net on the normalized training set: for each of the six targets, run
# a 5-fold grid search over the penalty strength `alpha` (1e-7 ... 1e7) and
# the l1/l2 mixing ratio `l1_ratio` (10 values from 0 to 1).
reg = ElasticNet(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15),
              'l1_ratio': np.linspace(0,1,10)}

# One independent search per target, fitted in target order.
EN_no_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X_no, target)
    for target in (y_no, y2_no, y3_no, y4_no, y5_no, y6_no)
]
# Keep the individual names used elsewhere in the notebook.
EN_no, EN_no_2, EN_no_3, EN_no_4, EN_no_5, EN_no_6 = EN_no_model

In [358]:
# Elastic Net on the standardized training set: for each of the six targets,
# run a 5-fold grid search over the penalty strength `alpha` (1e-7 ... 1e7)
# and the l1/l2 mixing ratio `l1_ratio` (10 values from 0 to 1).
reg = ElasticNet(random_state=0)
parameters = {'alpha': np.logspace(-7, 7, num=15),
              'l1_ratio': np.linspace(0,1,10)}

# One independent search per target, fitted in target order.
EN_model = [
    GridSearchCV(reg, parameters, cv=5, scoring="neg_mean_squared_error").fit(X, target)
    for target in (y, y2, y3, y4, y5, y6)
]
# Keep the individual names used elsewhere in the notebook.
EN, EN_2, EN_3, EN_4, EN_5, EN_6 = EN_model

In [359]:
# Summarize the six Elastic Net searches fitted on the normalized data: one row
# per target with the features that kept a non-zero coefficient and the errors.
EN_no_data = []
for idx, search in enumerate(EN_no_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features whose coefficient is not exactly zero.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test_no[all_features])
    EN_no_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test_no[target_name], holdout_pred)**0.5,
        'Model': fitted,
        # Best (alpha, l1_ratio) pair.
        '𝛼/𝜌': search.best_params_.values()
    })
EN_no_df = pd.DataFrame(EN_no_data)
EN_no_df

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,𝛼/𝜌
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",55,2.488111,2.307819,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 1.0)"
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.913741,3.036732,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 0.1111111111111111)"
2,(+2/+1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",54,1.231587,1.000032,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 0.6666666666666666)"
3,(+1/0),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",44,1.212771,1.086696,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 1.0)"
4,(0/-1),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",44,1.154134,0.998769,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 1.0)"
5,(-1/-2),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",40,1.09587,0.962392,"ElasticNet(alpha=0.001, copy_X=True, fit_inter...","(0.001, 1.0)"


In [360]:
# Summarize the six Elastic Net searches fitted on the standardized data: one
# row per target with the features that kept a non-zero coefficient and the
# errors.
EN_data = []
for idx, search in enumerate(EN_model):
    target_name = predictors[idx]
    fitted = search.best_estimator_
    # Features whose coefficient is not exactly zero.
    feature2 = [all_features[j] for j in np.flatnonzero(fitted.coef_)]
    holdout_pred = search.predict(test[all_features])
    EN_data.append({
        'Predictor': target_name,
        'Selected Features': feature2,
        'Total Features Num': len(feature2),
        # Root of the best mean cross-validated MSE found by the grid search.
        'Training RMSE': (-search.best_score_)**0.5,
        # RMSE on the held-out test partition.
        'Testing RMSE': mean_squared_error(test[target_name], holdout_pred)**0.5,
        'Model': fitted,
        # Best (alpha, l1_ratio) pair.
        '𝛼/𝜌': search.best_params_.values()
    })
EN_df = pd.DataFrame(EN_data)
EN_df

Unnamed: 0,Predictor,Selected Features,Total Features Num,Training RMSE,Testing RMSE,Model,𝛼/𝜌
0,∆H (A-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",67,2.454953,2.310309,"ElasticNet(alpha=0.01, copy_X=True, fit_interc...","(0.01, 0.5555555555555556)"
1,∆H (B-rich),"[Z_B, Z_B_square, PBE_delta_H, PBE_delta_H_squ...",82,2.907085,3.099438,"ElasticNet(alpha=0.1, copy_X=True, fit_interce...","(0.1, 0.0)"
2,(+2/+1),"[Z_B_square, PBE_delta_H, PBE_delta_H_square, ...",42,1.192281,0.960712,"ElasticNet(alpha=0.01, copy_X=True, fit_interc...","(0.01, 1.0)"
3,(+1/0),"[PBE_delta_H, PBE_delta_H_square, PBE_latt_con...",44,1.166847,1.062388,"ElasticNet(alpha=0.01, copy_X=True, fit_interc...","(0.01, 1.0)"
4,(0/-1),"[PBE_delta_H, PBE_delta_H_square, PBE_latt_con...",38,1.112226,0.95004,"ElasticNet(alpha=0.01, copy_X=True, fit_interc...","(0.01, 1.0)"
5,(-1/-2),"[Z_B_square, PBE_delta_H, PBE_delta_H_square, ...",38,1.057042,0.904971,"ElasticNet(alpha=0.01, copy_X=True, fit_interc...","(0.01, 1.0)"
