In [2]:
from sklearn.kernel_ridge import KernelRidge
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import polynomial_kernel
from sklearn.metrics.pairwise import rbf_kernel
from sklearn.metrics.pairwise import laplacian_kernel
import pandas as pd
import math
import numpy as np
from matplotlib import pyplot as plt
from sklearn.model_selection import GridSearchCV
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, RationalQuadratic
import os
# Build the path with os.path.join instead of the literal '..\ModelExploration':
# '\M' is an invalid escape sequence and the backslash separator only works on Windows.
# NOTE(review): chdir is relative, so re-running this cell moves the working directory again.
os.chdir(os.path.join('..', 'ModelExploration'))
import standardize

In [75]:
def krr(X_train, y_train, test_data=None, targets=None, feature_cols=None):
    """Grid-search a kernel ridge regressor per target and tabulate the scores.

    For every target a 5-fold ``GridSearchCV`` over ``alpha`` and ``kernel`` is
    fitted on the training data, and the refitted best model is scored on the
    held-out rows.

    Parameters
    ----------
    X_train
        Training features handed to ``GridSearchCV.fit``.
    y_train
        Indexable collection of training target vectors; ``y_train[i]`` is
        fitted for ``targets[i]``.
    test_data
        Held-out frame containing both the feature and the target columns.
        Defaults to the notebook-level ``test`` variable.
    targets
        Target column names. Defaults to the notebook-level ``predictors``.
    feature_cols
        Feature column names selected from ``test_data``. Defaults to the
        notebook-level ``all_features``.

    Returns
    -------
    pandas.DataFrame
        One row per target with the columns 'Predictor', 'Training RMSE',
        'Testing RMSE', 'Model' and 'Best Parameter'.

    Notes
    -----
    'Training RMSE' is the square root of the negated ``best_score_``, i.e. of
    the mean cross-validated MSE of the best parameter setting; it is not the
    error on the full training set.
    """
    # Fall back to the notebook globals so existing ``krr(X_train, y_train)``
    # calls keep working unchanged.
    if test_data is None:
        test_data = test
    if targets is None:
        targets = predictors
    if feature_cols is None:
        feature_cols = all_features

    # The search space is the same for every target, so build it once.
    param_grid = {"alpha": np.logspace(-5, 10, num=16),
                  "kernel": ['linear', 'poly', 'laplacian', 'rbf']}

    krr_result = []
    for i, target in enumerate(targets):
        kr = GridSearchCV(KernelRidge(), param_grid=param_grid, cv=5,
                          scoring="neg_mean_squared_error")
        kr.fit(X_train, y_train[i])
        rmse = np.sqrt(mean_squared_error(test_data[target],
                                          kr.predict(test_data[feature_cols])))
        krr_result.append({
            'Predictor': target,
            # The scorer returns negative MSE, hence the sign flip before the root.
            'Training RMSE': (-kr.best_score_) ** 0.5,
            'Testing RMSE': rmse,
            'Model': kr.best_estimator_,
            # Store a tuple snapshot rather than a live dict_values view.
            'Best Parameter': tuple(kr.best_params_.values())
        })
    return pd.DataFrame(krr_result)

# KRR

## Original data trained via KRR

In [80]:
# Load the dataset; the relative path is resolved against the current working directory.
data = pd.read_csv('ML_data.csv')

In [113]:
# Target columns: positions 4-9 of the table.
predictors = data.columns[4:10].tolist()
# Feature columns: positions 14-18 followed by positions 29-53.
features = data.columns[14:19].tolist() + data.columns[29:54].tolist()

In [114]:
# Use the selected feature columns as-is; `all_features` is bound to the same list object as `features` (no copy).
all_features = features

In [82]:
#all_features = []
#for feat in features:
#    square_feat = feat + '_square' 
#    data[square_feat] = data[feat] ** 2
#    if data[feat].min() >= 0:
#        sqrt_feat = feat + '_sqrt'
#        data[sqrt_feat] = data[feat].apply(math.sqrt)
#        all_features.extend([feat, square_feat, sqrt_feat])
#    else:
#        all_features.extend([feat, square_feat])

In [83]:
# Standardize each of the features, overwriting the columns of `data` in place.
# standardize.standardize is a project-local helper; presumably it rescales a column
# to zero mean / unit variance — TODO confirm against the module.
# NOTE(review): this runs on the full table before the train/test split cells that follow,
# so held-out rows contribute to the scaling statistics — confirm this is intended.
for feature in all_features:
    data[feature] = standardize.standardize(data[feature])
# Center every predictor column on its own mean (the scale is left unchanged).
for predictor in predictors:
    mean_pre = data[predictor].mean() 
    data[predictor] -= mean_pre

In [115]:
# Hold out 20% of the rows for testing; random_state=0 keeps the split reproducible.
# `test` is also read as a global inside krr().
train, test = train_test_split(data, test_size=0.2, random_state=0)

In [116]:
X_train = train[all_features]
# One training target vector per entry of `predictors`, in the same order: krr() pairs
# y_train[i] with predictors[i], so deriving the list from `predictors` keeps them in sync
# instead of repeating the six column names by hand.
y_train = [train[predictor] for predictor in predictors]

In [117]:
# Run the KRR grid search for every target on the unfiltered data; the returned
# DataFrame is displayed as the cell output.
krr(X_train, y_train)

Unnamed: 0,Predictor,Training RMSE,Testing RMSE,Model,Best Parameter
0,∆H (A-rich),2.002219,2.638377,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
1,∆H (B-rich),2.510018,3.150153,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
2,(+2/+1),0.778437,0.711157,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
3,(+1/0),0.727054,0.684512,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
4,(0/-1),0.664307,0.606917,"KernelRidge(alpha=0.001, coef0=1, degree=3, ga...","(0.001, laplacian)"
5,(-1/-2),0.564937,0.519097,"KernelRidge(alpha=0.0001, coef0=1, degree=3, g...","(0.0001, laplacian)"


## Outlier-removed data trained via KRR

In [76]:
# Read the outlier table and relabel its unnamed leading CSV column as 'Index'.
outlier = (
    pd.read_csv('../outlier_KNN_Type_10%.csv')
    .rename(columns={'Unnamed: 0': 'Index'})
)
outlier.head()

Unnamed: 0,Index,Type,AB,Site,Impurity,∆H (A-rich),∆H (B-rich),(+2/+1),(+1/0),(0/-1),...,Therm_cond,Elec_cond,Heat_fusion,Heat_vap,Electronegativity,At_num,Period,Group,Valence,Ox_state
0,6,III-V,BN,M_A,Pb,17.053,9.405,-0.954,4.167,5.413,...,35.3,4.8,4.77,177.9,2.33,82,6,14,4,2
1,14,III-V,AlN,M_i_A,Te,20.547,19.797,2.745,3.928,4.71,...,2.35,0.0,17.49,50.63,2.1,52,5,16,6,4
2,15,III-V,BN,M_A,As,14.244,5.391,2.977,3.9,5.197,...,50.0,3.8,27.7,32.4,2.16,33,4,15,5,3
3,17,III-V,AlN,M_B,Se,3.581,10.182,-0.155,3.86,4.759,...,2.04,8.0,5.54,26.32,2.55,34,4,16,6,4
4,21,III-V,BN,M_B,F,7.785,12.884,2.341,3.683,5.552,...,0.03,12.0,0.26,3.27,3.98,9,2,17,1,1


In [118]:
# Work on a copy so that removing rows from data1 leaves `data` untouched.
data1 = data.copy()

In [119]:
# Remove, in place, the rows whose index labels are listed in the outlier table's 'Index' column.
data1.drop(index=outlier['Index'], inplace=True)

In [120]:
# Number of rows left in data1 after the drop.
len(data1)

784

In [121]:
# Same 80/20 split (random_state=0) and KRR grid search, now on data1.
# Rebinding `test` here matters: krr() reads it as a global for the held-out scores.
train, test = train_test_split(data1, test_size=0.2, random_state=0)
X_train = train[all_features]
# Derive the targets from `predictors` so their order always matches what krr() indexes,
# instead of repeating the six column names by hand.
y_train = [train[predictor] for predictor in predictors]
krr(X_train, y_train)

Unnamed: 0,Predictor,Training RMSE,Testing RMSE,Model,Best Parameter
0,∆H (A-rich),1.718658,1.69842,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
1,∆H (B-rich),2.018964,2.07628,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
2,(+2/+1),0.729693,0.664053,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
3,(+1/0),0.708641,0.672146,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
4,(0/-1),0.619001,0.647361,"KernelRidge(alpha=0.001, coef0=1, degree=3, ga...","(0.001, laplacian)"
5,(-1/-2),0.545307,0.527653,"KernelRidge(alpha=1e-05, coef0=1, degree=3, ga...","(1e-05, laplacian)"


## Outlier-removed (E > 10 eV) data trained via KRR

In [122]:
# Work on a copy so that removing rows from data2 leaves `data` untouched.
data2 = data.copy()

In [123]:
# Index labels of the rows where either ∆H column exceeds 10.
# The two boolean masks are combined with `|` (element-wise OR); `+` on boolean Series
# yields the same mask but makes pandas warn that the operator is unsupported for bools.
outlier2 = data[(data['∆H (A-rich)'] > 10) | (data['∆H (B-rich)'] > 10)].index

  op=op_str, alt_op=unsupported[op_str]


In [124]:
# Remove, in place, the rows whose index labels were collected in outlier2.
data2.drop(index=outlier2, inplace=True)

In [125]:
# Number of rows left in data2 after the drop.
len(data2)

820

In [126]:
# Same 80/20 split (random_state=0) and KRR grid search, now on data2.
# Rebinding `test` here matters: krr() reads it as a global for the held-out scores.
train, test = train_test_split(data2, test_size=0.2, random_state=0)
X_train = train[all_features]
# Derive the targets from `predictors` so their order always matches what krr() indexes,
# instead of repeating the six column names by hand.
y_train = [train[predictor] for predictor in predictors]
krr(X_train, y_train)

Unnamed: 0,Predictor,Training RMSE,Testing RMSE,Model,Best Parameter
0,∆H (A-rich),1.827212,1.52506,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
1,∆H (B-rich),2.243164,2.229974,"KernelRidge(alpha=0.1, coef0=1, degree=3, gamm...","(0.1, laplacian)"
2,(+2/+1),0.699878,0.750296,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
3,(+1/0),0.726809,0.7548,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
4,(0/-1),0.640826,0.760853,"KernelRidge(alpha=0.01, coef0=1, degree=3, gam...","(0.01, laplacian)"
5,(-1/-2),0.50617,0.641492,"KernelRidge(alpha=1e-05, coef0=1, degree=3, ga...","(1e-05, laplacian)"


# GPR