In [1]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as LA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

import catboost as cb
import shap
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score


import warnings
warnings.filterwarnings('ignore')


In [23]:
DATA_PATH = './nomad2018-predict-transparent-conductors'


In [24]:
def custom_converter(entry):
    return np.array([float(x) for x in entry[1:-1].split(',')])

In [25]:
train_all_data = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)
test_all_data = pd.read_csv(
    f'{DATA_PATH}/test_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)

In [26]:
train_all_data['CoulombMatrix'][0].shape

(6400,)

# Basic linear regression

In [27]:
def get_eigenspectrum(matrix):
    spectrum = LA.eigvalsh(matrix)
    spectrum = np.sort(spectrum)[::-1]
    return spectrum

In [22]:
m = np.array([
    [1,2,5],
    [2,2,5],
    # [2,3,4]
])
# get_eigenspectrum(m).shape
StandardScaler().fit_transform(m.T).T

array([[-0.98058068, -0.39223227,  1.37281295],
       [-0.70710678, -0.70710678,  1.41421356]])

In [36]:
def create_matrix_df(data,pca_components=None,train=True):
    ewald_spectrum_list = []
    for m in data['SineMatrix']:
        ewald_spectrum_list.append(
            get_eigenspectrum(
                np.reshape(m, (80, 80))
            )
        )
    ewald_spectrum_df = pd.DataFrame(ewald_spectrum_list).astype(float)
    ewald_spectrum_df = ewald_spectrum_df.fillna(0)
    x = ewald_spectrum_df.loc[:, :].values
    # noramlize axis 0
    # x = StandardScaler().fit_transform(x.T).T
    # #y = data.loc[:, ['formation_energy_ev_natom']].values
    # pca = PCA(n_components=15).fit(x)

    # # PCA n_components calculation
    # rolling_sum = 0
    # n_components = 1
    # for i, num in enumerate(pca.explained_variance_ratio_):
    #     rolling_sum += num
    #     if rolling_sum > 0.95:
    #         n_components = i
    #         break
    
    # if not train:
    #     n_components = pca_components

    n_components = 80
    # # print(f'Performing PCA with {n_components} components')
    # pca = PCA(n_components)
    # principalComponents = pca.fit_transform(x)
    principalComponents = ewald_spectrum_df
    principalDf = pd.DataFrame(data=principalComponents)
    if train:
        to_drop = ['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        #dfcombined = data.drop(to_drop, axis=1)
    else:
        to_drop = ['id',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        #dfcombined = data.drop(to_drop, axis=1)
    return dfcombined, n_components



In [37]:

def compute_final_values(train_all_data, test_all_data, target_column, model):
    # train
    dfcombined, n_components = create_matrix_df(train_all_data)
    X_train, X_test, y_train, y_test = train_test_split(dfcombined, train_all_data[target_column], test_size = 0.30, random_state=1)
    rf = model.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    y_pred = rf.predict(X_test)

    print("Score:", score)
    print("Error rate:", ((mean_squared_error(y_test, y_pred)*100)), "%")
    
    # print(f'Training score for {target_column}: {score}')
    # test
    dfcombined, n_components = create_matrix_df(test_all_data, train=False, pca_components=n_components)
    predicted = rf.predict(dfcombined)
    return predicted


## Experiments

In [41]:
models = [
    ('randomforest', RandomForestRegressor(n_estimators=550, random_state=2), RandomForestRegressor(n_estimators=550, random_state=2)),
    # ('ridge', Ridge(alpha=0.1), Ridge(alpha=0.1)),
    # ('lasso', Lasso(alpha=0, max_iter=1e5), Lasso(alpha=0, max_iter=1e5)),
    # ('linear regression', LinearRegression(),LinearRegression()),
    # ('catboost', cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01), cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01))
    # ('svr', SVR(kernel = 'linear'), SVR(kernel = 'linear'))
]

In [42]:
for name, train_model, test_model in models:
    print(f'Running {name}')
    pred_fe = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='formation_energy_ev_natom',
        model = train_model
    )
    pred_bandgap = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='bandgap_energy_ev',
        model = test_model
    )
    id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
    submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
    submission_df.to_csv(f"submissions/trial_submission_df_{name}.csv", index=False)
    

Running randomforest
Score: 0.8775878987673387
Error rate: 0.12701006437633122 %
Score: 0.9555668980253293
Error rate: 4.614274072295345 %


### Code to save submission

In [19]:
id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
submission_df.head()


Unnamed: 0,id,formation_energy_ev_natom,bandgap_energy_ev
0,1,0.190086,1.6364
1,2,0.068089,3.826807
2,3,0.146527,3.60357
3,4,0.032716,3.026442
4,5,0.129898,1.609366


In [20]:
submission_df.to_csv("submissions/trial_submission_df_3.csv", index=False)

In [21]:
dfcombined_1, n_components_1 = create_matrix_df(train_all_data)

In [22]:
dfcombined_1.head()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,0,1,2,3,4,5
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,3.084841,-5.417648,2.433371,-0.362672,-0.05726,1.158514
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.301449,-3.98652,0.721551,2.608416,-0.755415,-0.149227
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,-8.694335,-0.372194,1.442581,2.477459,-0.931437,1.416226
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,-9.895077,0.879398,1.863889,-1.427307,-0.235001,-0.779067
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,6.926983,2.031831,-0.543498,2.671925,2.329128,0.374277


In [27]:
X_train, X_test, y_train, y_test = train_test_split(dfcombined_1, train_all_data['formation_energy_ev_natom'], test_size = 0.30, random_state=1)
rf_c2 = RandomForestRegressor(n_estimators=1000, random_state=2).fit(X_train, y_train)
y_pred = rf_c2.predict(X_test)
print("Error rate:", ((mean_squared_error(y_test, y_pred)*100)), "%")

Error rate: 0.15008443317849296 %


# Hyperparameter tuning

In [9]:
dfcombined, n_components = create_matrix_df(train_all_data)
X_train, X_test, y_train, y_test = train_test_split(dfcombined, train_all_data['formation_energy_ev_natom'], test_size = 0.30, random_state=1)

rf_parameters = {
    'n_estimators':[50, 100, 150]
}
ridge_parameters = {
    'alpha':[0, 0.001, 0.1, 0.5, 1, 2, 10]
}
lasso_parameters = {
    'alpha':[0, 0.001, 0.1, 0.5, 1, 2, 10]
}
catboost_parameters = {
    'learning_rate':[0.01, 0.05, 0.1, 0.5],
    'depth':[4,6,10],
    # 'l2_leaf_reg':[1, 3, 5, 7, 9]
}



In [12]:
# random forest
model = RandomForestRegressor()
rf_clf = GridSearchCV(model, rf_parameters)
rf_clf.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'n_estimators': [50, 100, 150]})

In [13]:
# ridge 
model = Ridge()
ridge_clf = GridSearchCV(model, ridge_parameters)
ridge_clf.fit(X_train, y_train)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [0, 0.001, 0.1, 0.5, 1, 2, 10]})

In [14]:
# lasso
model = Lasso()
lasso_clf = GridSearchCV(model, lasso_parameters)
lasso_clf.fit(X_train, y_train)

GridSearchCV(estimator=Lasso(),
             param_grid={'alpha': [0, 0.001, 0.1, 0.5, 1, 2, 10]})

In [10]:
# catbooast
model = cb.CatBoostRegressor(task_type='GPU')
cb_clf = GridSearchCV(model, catboost_parameters)
cb_clf.fit(X_train, y_train)



0:	learn: 0.1039359	total: 16.9ms	remaining: 16.9s
1:	learn: 0.1033636	total: 29.8ms	remaining: 14.9s
2:	learn: 0.1027078	total: 38.7ms	remaining: 12.9s
3:	learn: 0.1020946	total: 49.9ms	remaining: 12.4s
4:	learn: 0.1014244	total: 61.2ms	remaining: 12.2s
5:	learn: 0.1007537	total: 77.7ms	remaining: 12.9s
6:	learn: 0.1001087	total: 90.8ms	remaining: 12.9s
7:	learn: 0.0995088	total: 101ms	remaining: 12.5s
8:	learn: 0.0989181	total: 111ms	remaining: 12.2s
9:	learn: 0.0983338	total: 123ms	remaining: 12.2s
10:	learn: 0.0977386	total: 131ms	remaining: 11.8s
11:	learn: 0.0971533	total: 143ms	remaining: 11.8s
12:	learn: 0.0965442	total: 160ms	remaining: 12.1s
13:	learn: 0.0960049	total: 177ms	remaining: 12.5s
14:	learn: 0.0954530	total: 190ms	remaining: 12.5s
15:	learn: 0.0948574	total: 200ms	remaining: 12.3s
16:	learn: 0.0942965	total: 212ms	remaining: 12.3s
17:	learn: 0.0937321	total: 222ms	remaining: 12.1s
18:	learn: 0.0932183	total: 230ms	remaining: 11.9s
19:	learn: 0.0926970	total: 240ms	



8:	learn: 0.0999587	total: 70.4ms	remaining: 7.75s
9:	learn: 0.0994013	total: 76.3ms	remaining: 7.56s
10:	learn: 0.0988367	total: 82.9ms	remaining: 7.45s
11:	learn: 0.0982761	total: 90.6ms	remaining: 7.46s
12:	learn: 0.0977438	total: 105ms	remaining: 8.01s
13:	learn: 0.0971370	total: 113ms	remaining: 7.97s
14:	learn: 0.0965382	total: 120ms	remaining: 7.88s
15:	learn: 0.0959422	total: 126ms	remaining: 7.77s
16:	learn: 0.0953692	total: 133ms	remaining: 7.67s
17:	learn: 0.0947842	total: 144ms	remaining: 7.84s
18:	learn: 0.0942754	total: 151ms	remaining: 7.78s
19:	learn: 0.0937435	total: 158ms	remaining: 7.72s
20:	learn: 0.0931787	total: 164ms	remaining: 7.66s
21:	learn: 0.0927362	total: 170ms	remaining: 7.57s
22:	learn: 0.0922328	total: 178ms	remaining: 7.54s
23:	learn: 0.0917017	total: 186ms	remaining: 7.57s
24:	learn: 0.0912073	total: 194ms	remaining: 7.56s
25:	learn: 0.0907613	total: 210ms	remaining: 7.85s
26:	learn: 0.0902695	total: 221ms	remaining: 7.96s
27:	learn: 0.0898191	total: 2



0:	learn: 0.1047518	total: 6.44ms	remaining: 6.44s
1:	learn: 0.1041806	total: 12.9ms	remaining: 6.42s
2:	learn: 0.1035732	total: 19.6ms	remaining: 6.5s
3:	learn: 0.1029036	total: 26.6ms	remaining: 6.62s
4:	learn: 0.1022524	total: 33.1ms	remaining: 6.58s
5:	learn: 0.1016073	total: 39.4ms	remaining: 6.53s
6:	learn: 0.1010158	total: 46.3ms	remaining: 6.56s
7:	learn: 0.1004904	total: 58.2ms	remaining: 7.21s
8:	learn: 0.0998975	total: 70ms	remaining: 7.71s
9:	learn: 0.0993391	total: 79.7ms	remaining: 7.89s
10:	learn: 0.0987508	total: 85.9ms	remaining: 7.73s
11:	learn: 0.0981892	total: 92.7ms	remaining: 7.63s
12:	learn: 0.0976303	total: 100ms	remaining: 7.59s
13:	learn: 0.0970333	total: 107ms	remaining: 7.5s
14:	learn: 0.0964489	total: 113ms	remaining: 7.44s
15:	learn: 0.0958631	total: 120ms	remaining: 7.39s
16:	learn: 0.0953072	total: 127ms	remaining: 7.37s
17:	learn: 0.0947332	total: 135ms	remaining: 7.36s
18:	learn: 0.0942076	total: 143ms	remaining: 7.37s
19:	learn: 0.0936790	total: 154ms



0:	learn: 0.1049563	total: 7.74ms	remaining: 7.73s
1:	learn: 0.1043881	total: 15.3ms	remaining: 7.61s
2:	learn: 0.1037650	total: 23.8ms	remaining: 7.9s
3:	learn: 0.1031009	total: 31.8ms	remaining: 7.92s
4:	learn: 0.1024361	total: 49.7ms	remaining: 9.89s
5:	learn: 0.1017791	total: 61.1ms	remaining: 10.1s
6:	learn: 0.1011801	total: 67.5ms	remaining: 9.58s
7:	learn: 0.1006502	total: 76.1ms	remaining: 9.43s
8:	learn: 0.1000567	total: 82.6ms	remaining: 9.1s
9:	learn: 0.0994626	total: 91.3ms	remaining: 9.04s
10:	learn: 0.0988592	total: 98.9ms	remaining: 8.89s
11:	learn: 0.0982926	total: 108ms	remaining: 8.91s
12:	learn: 0.0977160	total: 125ms	remaining: 9.51s
13:	learn: 0.0971144	total: 133ms	remaining: 9.39s
14:	learn: 0.0965681	total: 159ms	remaining: 10.4s
15:	learn: 0.0959697	total: 170ms	remaining: 10.4s
16:	learn: 0.0954225	total: 178ms	remaining: 10.3s
17:	learn: 0.0948715	total: 194ms	remaining: 10.6s
18:	learn: 0.0943739	total: 205ms	remaining: 10.6s
19:	learn: 0.0938424	total: 212m



0:	learn: 0.1014768	total: 9.74ms	remaining: 9.73s
1:	learn: 0.0987679	total: 15.6ms	remaining: 7.8s
2:	learn: 0.0956678	total: 23.4ms	remaining: 7.78s
3:	learn: 0.0929890	total: 30.5ms	remaining: 7.61s
4:	learn: 0.0901458	total: 37.9ms	remaining: 7.53s
5:	learn: 0.0874550	total: 44.1ms	remaining: 7.3s
6:	learn: 0.0850182	total: 50.2ms	remaining: 7.12s
7:	learn: 0.0827974	total: 57.1ms	remaining: 7.08s
8:	learn: 0.0807227	total: 63.4ms	remaining: 6.98s
9:	learn: 0.0788261	total: 73.8ms	remaining: 7.31s
10:	learn: 0.0770604	total: 81.3ms	remaining: 7.31s
11:	learn: 0.0752344	total: 87.8ms	remaining: 7.23s
12:	learn: 0.0735815	total: 94.6ms	remaining: 7.18s
13:	learn: 0.0718444	total: 101ms	remaining: 7.14s
14:	learn: 0.0702262	total: 109ms	remaining: 7.15s
15:	learn: 0.0687956	total: 116ms	remaining: 7.13s
16:	learn: 0.0674961	total: 123ms	remaining: 7.09s
17:	learn: 0.0661624	total: 129ms	remaining: 7.02s
18:	learn: 0.0652183	total: 135ms	remaining: 6.97s
19:	learn: 0.0640417	total: 14



0:	learn: 0.0984614	total: 6.5ms	remaining: 6.49s
1:	learn: 0.0934237	total: 12.6ms	remaining: 6.27s
2:	learn: 0.0878375	total: 18.2ms	remaining: 6.05s
3:	learn: 0.0833660	total: 25ms	remaining: 6.22s
4:	learn: 0.0787972	total: 37ms	remaining: 7.36s
5:	learn: 0.0750498	total: 45.8ms	remaining: 7.58s
6:	learn: 0.0715259	total: 62.5ms	remaining: 8.87s
7:	learn: 0.0685379	total: 75.5ms	remaining: 9.36s
8:	learn: 0.0659458	total: 93.7ms	remaining: 10.3s
9:	learn: 0.0637874	total: 102ms	remaining: 10.1s
10:	learn: 0.0618594	total: 109ms	remaining: 9.8s
11:	learn: 0.0599471	total: 115ms	remaining: 9.49s
12:	learn: 0.0584236	total: 128ms	remaining: 9.75s
13:	learn: 0.0571411	total: 139ms	remaining: 9.79s
14:	learn: 0.0556938	total: 151ms	remaining: 9.91s
15:	learn: 0.0543399	total: 159ms	remaining: 9.79s
16:	learn: 0.0530021	total: 170ms	remaining: 9.81s
17:	learn: 0.0520690	total: 183ms	remaining: 9.99s
18:	learn: 0.0510565	total: 195ms	remaining: 10.1s
19:	learn: 0.0503737	total: 204ms	rema



0:	learn: 0.0738352	total: 146ms	remaining: 2m 25s
1:	learn: 0.0575840	total: 303ms	remaining: 2m 31s
2:	learn: 0.0497737	total: 456ms	remaining: 2m 31s
3:	learn: 0.0444315	total: 607ms	remaining: 2m 31s
4:	learn: 0.0416364	total: 689ms	remaining: 2m 17s
5:	learn: 0.0404034	total: 834ms	remaining: 2m 18s
6:	learn: 0.0399866	total: 856ms	remaining: 2m 1s
7:	learn: 0.0392810	total: 991ms	remaining: 2m 2s
8:	learn: 0.0384411	total: 1.05s	remaining: 1m 55s
9:	learn: 0.0379793	total: 1.07s	remaining: 1m 46s
10:	learn: 0.0372623	total: 1.22s	remaining: 1m 49s
11:	learn: 0.0366762	total: 1.3s	remaining: 1m 47s
12:	learn: 0.0357553	total: 1.45s	remaining: 1m 50s
13:	learn: 0.0352869	total: 1.58s	remaining: 1m 51s
14:	learn: 0.0350676	total: 1.61s	remaining: 1m 45s
15:	learn: 0.0343512	total: 1.76s	remaining: 1m 48s
16:	learn: 0.0341272	total: 1.78s	remaining: 1m 43s
17:	learn: 0.0339476	total: 1.93s	remaining: 1m 45s
18:	learn: 0.0337297	total: 2.07s	remaining: 1m 47s
19:	learn: 0.0336027	tota

GridSearchCV(estimator=<catboost.core.CatBoostRegressor object at 0x7f46e5bfd490>,
             param_grid={'depth': [4, 6, 10],
                         'learning_rate': [0.01, 0.05, 0.1, 0.5]})

In [15]:
print(f'Random forest: {rf_clf.best_params_}')
print(f'Lasso: {lasso_clf.best_params_}')
print(f'Ridge: {ridge_clf.best_params_}')
print(f'Catboost: {cb_clf.best_params_}')

Random forest: {'n_estimators': 150}
Lasso: {'alpha': 0}
Ridge: {'alpha': 0.1}
Catboost: {'depth': 10, 'learning_rate': 0.01}
