In [21]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as LA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import xgboost

import catboost as cb
import shap
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score


import warnings
warnings.filterwarnings('ignore')


In [22]:
DATA_PATH = './nomad2018-predict-transparent-conductors'


In [23]:
def custom_converter(entry):
    return np.array([float(x) for x in entry[1:-1].split(',')])

In [24]:
train_all_data = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv'
)
test_all_data = pd.read_csv(
    f'{DATA_PATH}/test_extrainfo.csv'
)

In [25]:
train_all_data.drop(['CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix'], axis=1).head()
# test_all_data.drop(['CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix'], axis=1).head()
train_all_data.head()

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev,CoulombMatrix,SineMatrix,EwaldSumMatrix
0,1,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387,"[121.10109815785134, 75.93020649243516, 122.09...","[1897.7459337097173, 175.12757587921956, 73.28...","[-147.16586613403337, -83.60940041075736, 50.8..."
1,2,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921,"[106.316346316413, 155.73823639588636, 139.218...","[1897.7459337097173, 155.40184168757955, 179.3...","[31.62494082794717, 499.7677140971106, 499.777..."
2,3,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438,"[76.85202206242872, 82.658215573199, 36.006484...","[1897.7459337097173, 64.86894200257981, 71.405...","[-105.23185191282306, -230.0887946397426, 9.15..."
3,4,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492,"[224.16370782409976, 216.10586898126473, 94.14...","[5694.30331076094, 475.2322348410168, 284.1925...","[-226.29548002634016, -767.1897774756807, -359..."
4,5,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793,"[161.3156894201164, 419.0476738666981, 161.311...","[5694.30331076094, 94.27325853815663, 195.3521...","[44.77486350842855, -804.0787210072267, -198.3..."


# Basic linear regression

In [26]:

def compute_final_values(train_all_data, test_all_data, target_column, model):
    # train
    dfcombined = train_all_data.drop(['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix'], axis=1)
    X_train, X_test, y_train, y_test = train_test_split(dfcombined, train_all_data[target_column], test_size = 0.30, random_state=1)
    rf = model.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    y_pred = rf.predict(X_test)

    print("Score:", score)
    print("Error rate:", ((mean_squared_error(y_test, y_pred)*100)), "%")
    
    # print(f'Training score for {target_column}: {score}')
    # test
    # dfcombined, n_components = create_matrix_df(test_all_data, train=False, pca_components=n_components)
    dfcombined = test_all_data.drop(['id','CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix'], axis=1)
    predicted = rf.predict(dfcombined)
    return predicted


## Experiments

In [27]:
models = [
    ('randomforest', RandomForestRegressor(n_estimators=150, random_state=2), RandomForestRegressor(n_estimators=150, random_state=2)),
    ('ridge', Ridge(alpha=0.1), Ridge(alpha=0.1)),
    ('lasso', Lasso(alpha=0, max_iter=1e5), Lasso(alpha=0, max_iter=1e5)),
    ('linear regression', LinearRegression(),LinearRegression()),
    ('catboost', cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01), cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01)),
    ('xgboost', xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8), xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
)
    # ('svr', SVR(kernel = 'linear'), SVR(kernel = 'linear'))
]

In [28]:
for name, train_model, test_model in models:
    print(f'Running {name}')
    pred_fe = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='formation_energy_ev_natom',
        model = train_model
    )
    pred_bandgap = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='bandgap_energy_ev',
        model = test_model
    )
    id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
    submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
    submission_df.to_csv(f"submissions/trial_submission_df_{name}.csv", index=False)
    

Running randomforest
Score: 0.8390751040866701
Error rate: 0.16696945141770828 %
Score: 0.9494293353412081
Error rate: 5.251645651182015 %
Running ridge
Score: 0.3826406517976859
Error rate: 0.6405481955535997 %
Score: 0.8105417723020573
Error rate: 19.67479534397565 %
Running lasso
Score: 0.38263706103979656
Error rate: 0.6405519211851842 %
Score: 0.8105686014240159
Error rate: 19.672009202195436 %
Running linear regression
Score: 0.38836033039176276
Error rate: 0.6346136781396318 %
Score: 0.8113390519611587
Error rate: 19.591999709733017 %
Running catboost
0:	learn: 0.1043233	total: 39.7ms	remaining: 39.7s
1:	learn: 0.1036457	total: 62.4ms	remaining: 31.2s
2:	learn: 0.1029017	total: 86.4ms	remaining: 28.7s
3:	learn: 0.1022223	total: 110ms	remaining: 27.4s
4:	learn: 0.1015790	total: 131ms	remaining: 26.1s
5:	learn: 0.1009167	total: 154ms	remaining: 25.5s
6:	learn: 0.1002913	total: 176ms	remaining: 25s
7:	learn: 0.0996492	total: 200ms	remaining: 24.8s
8:	learn: 0.0989642	total: 224ms	r