In [31]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy.linalg as LA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR

import catboost as cb
import shap
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score


import warnings
warnings.filterwarnings('ignore')


In [4]:
DATA_PATH = './nomad2018-predict-transparent-conductors'


In [3]:
def custom_converter(entry):
    return np.array([float(x) for x in entry[1:-1].split(',')])

In [5]:
train_all_data = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)
test_all_data = pd.read_csv(
    f'{DATA_PATH}/test_extrainfo.csv',
    converters={
        'CoulombMatrix':custom_converter,
        'SineMatrix':custom_converter,
        'EwaldSumMatrix':custom_converter
    }
)

In [6]:
train_all_data['CoulombMatrix'][0].shape

(6400,)

# Basic linear regression

In [7]:
def get_eigenspectrum(matrix):
    spectrum = LA.eigvalsh(matrix)
    spectrum = np.sort(spectrum)[::-1]
    return spectrum

In [34]:
def create_matrix_df(data,pca_components=None,train=True):
    ewald_spectrum_list = []
    for m in data['SineMatrix']:
        ewald_spectrum_list.append(
            get_eigenspectrum(
                np.reshape(m, (80, 80))
            )
        )
    ewald_spectrum_df = pd.DataFrame(ewald_spectrum_list).astype(float)
    ewald_spectrum_df = ewald_spectrum_df.fillna(0)
    x = ewald_spectrum_df.loc[:, :].values
    x = StandardScaler().fit_transform(x)
    #y = data.loc[:, ['formation_energy_ev_natom']].values
    pca = PCA(n_components=15).fit(x)

    # PCA n_components calculation
    rolling_sum = 0
    n_components = 1
    for i, num in enumerate(pca.explained_variance_ratio_):
        rolling_sum += num
        if rolling_sum > 0.95:
            n_components = i
            break
    
    if not train:
        n_components = pca_components

    n_components = 80
    # print(f'Performing PCA with {n_components} components')
    pca = PCA(n_components)
    principalComponents = pca.fit_transform(x)
    principalDf = pd.DataFrame(data=principalComponents)
    if train:
        to_drop = ['id', 'formation_energy_ev_natom', 'bandgap_energy_ev',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        #dfcombined = data.drop(to_drop, axis=1)
    else:
        to_drop = ['id',
               'CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
        dfcombined = pd.concat([data, principalDf], axis=1).drop(to_drop, axis=1)
        #dfcombined = data.drop(to_drop, axis=1)
    return dfcombined, n_components



In [35]:

def compute_final_values(train_all_data, test_all_data, target_column, model):
    # train
    dfcombined, n_components = create_matrix_df(train_all_data)
    X_train, X_test, y_train, y_test = train_test_split(dfcombined, train_all_data[target_column], test_size = 0.30, random_state=1)
    rf = model.fit(X_train, y_train)
    score = rf.score(X_test, y_test)
    y_pred = rf.predict(X_test)

    print("Score:", score)
    print("Error rate:", ((mean_squared_error(y_test, y_pred)*100)), "%")
    
    # print(f'Training score for {target_column}: {score}')
    # test
    dfcombined, n_components = create_matrix_df(test_all_data, train=False, pca_components=n_components)
    predicted = rf.predict(dfcombined)
    return predicted


## Experiments

In [32]:
models = [
    ('randomforest', RandomForestRegressor(n_estimators=1000, random_state=2), RandomForestRegressor(n_estimators=1000, random_state=2)),
    ('ridge', Ridge(alpha=0.001), Ridge(alpha=0.001)),
    ('lasso', Lasso(alpha=0.001, max_iter=1e5), Lasso(alpha=0.001, max_iter=1e5)),
    ('linear regression', LinearRegression(),LinearRegression()),
    ('catboost', cb.CatBoostRegressor(loss_function='RMSE'), cb.CatBoostRegressor(loss_function='RMSE'))
    # ('svr', SVR(kernel = 'linear'), SVR(kernel = 'linear'))
]

In [36]:
for name, train_model, test_model in models:
    print(f'Running {name}')
    pred_fe = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='formation_energy_ev_natom',
        model = train_model
    )
    pred_bandgap = compute_final_values(
        train_all_data, 
        test_all_data, 
        target_column='bandgap_energy_ev',
        model = test_model
    )
    id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
    submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
    submission_df.to_csv(f"submissions/trial_submission_df_{name}.csv", index=False)
    

Running randomforest
Score: 0.870897951477767
Error rate: 0.13395129508282644 %
Score: 0.9500845808264883
Error rate: 5.183599934827557 %
Running ridge
Score: 0.8111794089948328
Error rate: 0.19591294633168518 %
Score: 0.9458700873615818
Error rate: 5.621265257722683 %
Running lasso
Score: 0.769481773076637
Error rate: 0.23917680153048645 %
Score: 0.9420657652656483
Error rate: 6.0163352400060015 %
Running linear regression
Score: 0.811584151418254
Error rate: 0.19549300123853747 %
Score: 0.9459162970258133
Error rate: 5.616466491800954 %
Running catboost
Learning rate set to 0.04444
0:	learn: 0.1022115	total: 11.9ms	remaining: 11.9s
1:	learn: 0.0995523	total: 16.3ms	remaining: 8.12s
2:	learn: 0.0968388	total: 20.5ms	remaining: 6.82s
3:	learn: 0.0943771	total: 26.5ms	remaining: 6.59s
4:	learn: 0.0921157	total: 31ms	remaining: 6.17s
5:	learn: 0.0897248	total: 35.3ms	remaining: 5.85s
6:	learn: 0.0877750	total: 40ms	remaining: 5.67s
7:	learn: 0.0858929	total: 44.5ms	remaining: 5.52s
8:	le

### Code to save submission

In [19]:
id_1 = np.arange(1, len(pred_fe)+1, 1, dtype=int)
submission_df = pd.DataFrame({'id':id_1,'formation_energy_ev_natom':pred_fe,'bandgap_energy_ev':pred_bandgap})
submission_df.head()


Unnamed: 0,id,formation_energy_ev_natom,bandgap_energy_ev
0,1,0.190086,1.6364
1,2,0.068089,3.826807
2,3,0.146527,3.60357
3,4,0.032716,3.026442
4,5,0.129898,1.609366


In [20]:
submission_df.to_csv("submissions/trial_submission_df_3.csv", index=False)

In [21]:
dfcombined_1, n_components_1 = create_matrix_df(train_all_data)

In [22]:
dfcombined_1.head()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,0,1,2,3,4,5
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,3.084841,-5.417648,2.433371,-0.362672,-0.05726,1.158514
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.301449,-3.98652,0.721551,2.608416,-0.755415,-0.149227
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,-8.694335,-0.372194,1.442581,2.477459,-0.931437,1.416226
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,-9.895077,0.879398,1.863889,-1.427307,-0.235001,-0.779067
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,6.926983,2.031831,-0.543498,2.671925,2.329128,0.374277


In [27]:
X_train, X_test, y_train, y_test = train_test_split(dfcombined_1, train_all_data['formation_energy_ev_natom'], test_size = 0.30, random_state=1)
rf_c2 = RandomForestRegressor(n_estimators=1000, random_state=2).fit(X_train, y_train)
y_pred = rf_c2.predict(X_test)
print("Error rate:", ((mean_squared_error(y_test, y_pred)*100)), "%")

Error rate: 0.15008443317849296 %
