In [1]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
# from modules.preprocessing import make_cube, get_pos_lattice
# from modules.preprocessing import atom_list

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# from modules.data_processing import make_Xy, post_process, RMSLE
import xgboost
from sklearn.svm import SVR 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
# import lightgbm as lgb
from modules.preprocessing import PrePro


from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
import xgboost
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer

import catboost as cb
import shap
from sklearn.inspection import permutation_importance

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, accuracy_score, mean_squared_error, r2_score


import warnings
warnings.filterwarnings('ignore')


In [2]:
# Root folder of the competition data: the train/test CSV files, the per-structure
# geometry folders and the cached cube arrays are all read from / written to here.
DATA_PATH = './nomad2018-predict-transparent-conductors'

# Preprocess data


## Create cube

In [3]:
prepro = PrePro()

# (split folder, number of structures) in the order train -> test, i.e. the same
# row order as the combined train+test table the cube features are joined to.
cube_splits = [('train', 2400), ('test', 600)]

# Now generate cubes
cube_list = []
for split, n_structures in cube_splits:
    for i in range(n_structures):
        if i % 100 == 0:
            print(i, '/', n_structures)
        # structure folders on disk are numbered from 1
        filename = f"{DATA_PATH}/{split}/{i+1}/geometry.xyz"
        cube_list.append(prepro.make_cube(filename))

# save cube
cube = np.array(cube_list)
np.save(f"{DATA_PATH}/cube.npy", cube)

# perform PCA, and keep only 100 principal components for SVR
# flatten every cube to one row; the row count follows the data instead of a literal 3000
cube_vec = np.reshape(cube, (len(cube), -1))
pca = PCA(n_components=100)
pca.fit(cube_vec)

# PCA.transform centres the data with pca.mean_ and projects it onto
# pca.components_. Because the component rows are orthonormal this is the same
# solution the former np.linalg.lstsq call produced, without solving a large
# least-squares problem. Transposed to keep the saved (components, samples) layout.
cube_PCA = pca.transform(cube_vec).transpose()
np.save(f'{DATA_PATH}/cube_PCA.npy', cube_PCA)

KeyboardInterrupt: 

## Build the final DataFrame to train on

In [3]:
# Read the two competition tables.
train_all_data = pd.read_csv(f'{DATA_PATH}/train.csv')
test_all_data = pd.read_csv(f'{DATA_PATH}/test.csv')

# Stack train on top of test under a fresh 0..N-1 index so that features can be
# engineered once for both; the test rows have no target values (NaN).
df = pd.concat([train_all_data, test_all_data], ignore_index=True)
df

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001,,
2996,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004,,
2997,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857,,
2998,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007,,


In [4]:
# Empty frame that the following cells fill, column by column, with model features.
df_all = pd.DataFrame()

In [5]:
# one-hot encoding for spacegroup
# LabelEncoder maps each distinct spacegroup number to a consecutive integer code
spacegroup_labels = LabelEncoder().fit_transform(df['spacegroup'])
# OneHotEncoder expects a 2-D (n_samples, n_features) input
spacegroup_labels = spacegroup_labels.reshape(-1, 1)

# Keep the encoder's default (sparse) output and densify it explicitly rather than
# passing `sparse=False`: that keyword was renamed to `sparse_output` in newer
# scikit-learn releases and later removed, while `.toarray()` works on all of them.
spacegroup_onehot = OneHotEncoder().fit_transform(spacegroup_labels).toarray()

In [6]:
# --- one-hot spacegroup indicators -------------------------------------------
df_all[['sg1','sg2','sg3','sg4','sg5','sg6']] = pd.DataFrame(spacegroup_onehot, index=df.index)

# --- atom counts per element: fraction of the element times total atom count ---
for count_col, fraction_col in (
    ('num_in', 'percent_atom_in'),
    ('num_ga', 'percent_atom_ga'),
    ('num_al', 'percent_atom_al'),
):
    df_all[count_col] = df[fraction_col] * df['number_of_total_atoms']

# --- raw columns copied over unchanged (order defines the feature order) -------
for raw_col in (
    'percent_atom_in',
    'percent_atom_al',
    'percent_atom_ga',
    'number_of_total_atoms',
    'lattice_vector_3_ang',
    'lattice_vector_2_ang',
    'lattice_vector_1_ang',
    'lattice_angle_gamma_degree',
    'lattice_angle_beta_degree',
    'lattice_angle_alpha_degree',
):
    df_all[raw_col] = df[raw_col]

# --- cosine / sine of the lattice angles (degrees converted to radians) --------
trig_features = (
    ('cos3', np.cos, 'lattice_angle_gamma_degree'),
    ('cos2', np.cos, 'lattice_angle_beta_degree'),
    ('cos1', np.cos, 'lattice_angle_alpha_degree'),
    ('sin3', np.sin, 'lattice_angle_gamma_degree'),
    ('sin2', np.sin, 'lattice_angle_beta_degree'),
    ('sin1', np.sin, 'lattice_angle_alpha_degree'),
)
for feature_col, trig, angle_col in trig_features:
    df_all[feature_col] = trig(np.pi/180.0*df[angle_col])



In [7]:
# Scale every feature column by its own maximum (computed over train and test rows together).
df_all = df_all.div(df_all.max(), axis='columns')

In [8]:
# add the PCA-compressed cube features to the feature table
# The file is read from DATA_PATH (same location the cube cell saves to) instead of
# a second hard-coded copy of the folder name. The array is transposed after loading
# so that it has one row per structure, matching df_all's index.
cube_PCA = np.load(f'{DATA_PATH}/cube_PCA.npy').transpose()
# scale all components by the single global maximum of the array
cube_PCA = cube_PCA / np.max(cube_PCA)

# one 'pca<i>' column per stored component (count taken from the data)
pca_cols = [f'pca{i}' for i in range(cube_PCA.shape[1])]
df_all[pca_cols] = pd.DataFrame(cube_PCA, index=df_all.index)

In [9]:
# Sanity check: (rows, columns) of the assembled feature table.
df_all.shape

(3000, 125)

In [10]:
# Target table: the two quantities to predict, row-aligned with df (NaN on test rows).
df_pred_var = df[['formation_energy_ev_natom', 'bandgap_energy_ev']].copy()

In [11]:
result_cols = ['formation_energy_ev_natom','bandgap_energy_ev']

# Rows with a known formation energy are the labelled (training) rows;
# the remaining rows are the ones to predict.
is_labelled = df_pred_var['formation_energy_ev_natom'].notnull()

traindf = df_all[is_labelled]
traindf_pred = df_pred_var[is_labelled]
X_train, y_train = traindf, traindf_pred

# Same feature columns as the training frame, minus any target column.
feature_cols = [c for c in traindf.columns if c not in result_cols]
X_test = df_all[~is_labelled][feature_cols]

# Grid search

Search for the best XGBoost parameters for both the formation energy and the bandgap energy.

In [13]:
def RMSLE(y, pred):
    """Root mean squared logarithmic error.

    Computes sqrt(mean((log(1 + pred) - log(1 + y)) ** 2)).

    Parameters
    ----------
    y : array-like
        Reference values.
    pred : array-like
        Predicted values, same length as ``y``.

    Returns
    -------
    float
        The RMSLE; 0.0 when predictions equal the references.
    """
    # Convert up front so plain Python lists work and the comparison is
    # positional rather than index-aligned.
    y = np.asarray(y)
    pred = np.asarray(pred)
    # log1p(x) == log(1 + x) but keeps precision for values close to zero
    log_diff = np.log1p(pred) - np.log1p(y)
    return np.sqrt(np.mean(log_diff ** 2))


In [14]:
# Base estimator; its hyper-parameters are supplied by the grid below.
model = xgboost.XGBRegressor()

# Search grid: only tree count and depth are varied, the remaining
# settings are pinned to a single value each.
xgb_params = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[6, 7, 8, 9],
    eta=[0.1],
    subsample=[0.7],
    colsample_bytree=[0.8],
)

In [15]:
# Wrap RMSLE as a scikit-learn scorer. greater_is_better=False marks it as a loss,
# so the scorer reports the negated RMSLE and model selection can maximise it.
scorer = make_scorer(RMSLE, greater_is_better=False)

In [16]:
# Grid search for the formation-energy model (target column 0).
# The target is already log(1 + y), so RMSE between log-space predictions and
# log-space targets is the RMSLE of the raw values. An RMSLE scorer here would
# take the logarithm a second time, so the built-in negated RMSE is used instead.
f_clf = GridSearchCV(
    model,
    xgb_params,
    cv=3,
    scoring='neg_root_mean_squared_error'
)
f_clf.fit(X_train, np.log(1+y_train.iloc[:,0]))

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                 

In [17]:
# Best parameter combination found for the formation-energy target.
f_clf.best_params_

{'colsample_bytree': 0.8,
 'eta': 0.1,
 'max_depth': 6,
 'n_estimators': 500,
 'subsample': 0.7}

In [18]:
# Grid search for the bandgap-energy model (target column 1).
# The target is already log(1 + y), so RMSE between log-space predictions and
# log-space targets is the RMSLE of the raw values. An RMSLE scorer here would
# take the logarithm a second time, so the built-in negated RMSE is used instead.
b_clf = GridSearchCV(
    model,
    xgb_params,
    cv=3,
    scoring='neg_root_mean_squared_error'
)
b_clf.fit(X_train, np.log(1+y_train.iloc[:,1]))

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                 

In [19]:
# Best parameter combination found for the bandgap-energy target.
b_clf.best_params_

{'colsample_bytree': 0.8,
 'eta': 0.1,
 'max_depth': 6,
 'n_estimators': 500,
 'subsample': 0.7}

# Train and make submission

In [12]:
# Sanity check: (rows, columns) of the training feature matrix.
X_train.shape

(2400, 125)

In [13]:
# Hyper-parameters shared by the two target-specific regressors.
# NOTE(review): the grid-search output shown earlier in this notebook reports
# max_depth=6 / n_estimators=500 as best, while 7 / 1000 are used here - confirm intended.
final_xgb_kwargs = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
f_model = xgboost.XGBRegressor(**final_xgb_kwargs)   # formation energy
b_model = xgboost.XGBRegressor(**final_xgb_kwargs)   # bandgap energy

# Alternative regressors that were left commented out in this cell:
#   KernelRidge(alpha=0.1, kernel='rbf')
#   GaussianProcessRegressor()

# Both targets are learned in log(1 + y) space.
f_model.fit(X_train, np.log(1+y_train.iloc[:,0]))
b_model.fit(X_train, np.log(1+y_train.iloc[:,1]))

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.1, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.100000001,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [14]:
def post_process(pred):
    """Map log-space predictions back to the original scale.

    Negative predictions are first clamped to 0, then the log(1 + y)
    transform is undone with exp(pred) - 1, so the result is never negative.

    Parameters
    ----------
    pred : array-like
        Predictions in log(1 + y) space.

    Returns
    -------
    numpy.ndarray
        Predictions on the original scale, same shape as ``pred``.
    """
    pred = np.asarray(pred)  # also accept plain lists
    # np.maximum clamps without multiplying, so -inf becomes 0 instead of NaN
    clamped = np.maximum(pred, 0.0)
    # expm1(x) == exp(x) - 1 with better precision near zero
    return np.expm1(clamped)

In [15]:
# Predict the unlabelled rows with each fitted model (formation energy first,
# then bandgap) and bring the predictions back to the original scale.
f_submission, b_submission = (
    post_process(fitted_model.predict(X_test))
    for fitted_model in (f_model, b_model)
)


In [16]:
# uncomment to save

# pd.DataFrame({
#     'id': list(range(1, len(f_submission)+1)),
#     'formation_energy_ev_natom': f_submission,
#     'bandgap_energy_ev': b_submission
# }).to_csv('submissions/cube_method.csv', index=False)


In [42]:
# Candidate regressors to compare. Each entry is (name, estimator, estimator):
# two independent, identically configured instances, one per target.
models = [
    ('randomforest',
     RandomForestRegressor(n_estimators=150, random_state=2),
     RandomForestRegressor(n_estimators=150, random_state=2)),
    ('ridge',
     Ridge(alpha=0.1),
     Ridge(alpha=0.1)),
    # max_iter must be an integer: the float 1e5 is rejected by the parameter
    # validation in newer scikit-learn releases.
    ('lasso',
     Lasso(alpha=0, max_iter=100000),
     Lasso(alpha=0, max_iter=100000)),
    ('linear regression',
     LinearRegression(),
     LinearRegression()),
    ('catboost',
     cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01),
     cb.CatBoostRegressor(loss_function='RMSE', depth=10, learning_rate=0.01)),
    ('xgboost',
     xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8),
     xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)),
]

In [43]:
def compute_final_values(train_all_data, pred_all_data, model, is_fe_check,
                         test_size=0.30, random_state=1):
    """Fit ``model`` on a train split and score it on the held-out split.

    The data is split with ``train_test_split``; the model is fitted on
    log(1 + y) of the selected target column and scored, also in log space,
    on the held-out part. The score is printed and returned.

    Parameters
    ----------
    train_all_data : DataFrame
        Feature matrix.
    pred_all_data : DataFrame
        Target table; column 0 is used when ``is_fe_check`` is true,
        column 1 otherwise.
    model : estimator
        Object exposing ``fit(X, y)`` and ``score(X, y)``. It is fitted in place.
    is_fe_check : bool
        Selects the target column (see ``pred_all_data``).
    test_size : float, default 0.30
        Fraction of rows held out for scoring.
    random_state : int, default 1
        Seed for the split, so repeated calls compare models on the same rows.

    Returns
    -------
    float
        Whatever ``model.score`` returns on the held-out split
        (R^2 for scikit-learn regressors).
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        train_all_data, pred_all_data, test_size=test_size, random_state=random_state
    )

    # column 0 when is_fe_check is true, column 1 otherwise
    idx = 0 if is_fe_check else 1

    fitted = model.fit(X_fit, np.log(1+y_fit.iloc[:,idx]))
    score = fitted.score(X_holdout, np.log(1+y_holdout.iloc[:,idx]))

    print("Score:", score)
    # Previously nothing was returned, so callers storing the result got None.
    return score



In [44]:
# Hold-out comparison of every candidate: the first estimator of each entry is
# evaluated on the bandgap target, the second on the formation-energy target.
for name, bandgap_model, fe_model in models:
    print(f'Running {name}')
    pred_bandgap = compute_final_values(
        X_train, y_train, model=bandgap_model, is_fe_check=False
    )
    pred_fe = compute_final_values(
        X_train, y_train, model=fe_model, is_fe_check=True
    )


Running randomforest
Score: 0.9278556157880594
Score: 0.8960144516403202
Running ridge
Score: 0.9278564156752962
Score: 0.6495151382137874
Running lasso
Score: 0.9267502256412852
Score: 0.6650343236992304
Running linear regression
Score: 0.9274335806164542
Score: 0.6741070310116821
Running catboost


Custom logger is already specified. Specify more than one logger at same time is not thread safe.

0:	learn: 0.3248163	total: 179ms	remaining: 2m 59s
1:	learn: 0.3225993	total: 364ms	remaining: 3m 1s
2:	learn: 0.3204072	total: 541ms	remaining: 2m 59s
3:	learn: 0.3183773	total: 707ms	remaining: 2m 56s
4:	learn: 0.3162032	total: 875ms	remaining: 2m 54s
5:	learn: 0.3142616	total: 1.03s	remaining: 2m 51s
6:	learn: 0.3121833	total: 1.21s	remaining: 2m 51s
7:	learn: 0.3101676	total: 1.38s	remaining: 2m 50s
8:	learn: 0.3079170	total: 1.55s	remaining: 2m 50s
9:	learn: 0.3058787	total: 1.71s	remaining: 2m 49s
10:	learn: 0.3038206	total: 1.88s	remaining: 2m 48s
11:	learn: 0.3019217	total: 2.04s	remaining: 2m 47s
12:	learn: 0.2999735	total: 2.21s	remaining: 2m 47s
13:	learn: 0.2979226	total: 2.37s	remaining: 2m 47s
14:	learn: 0.2959661	total: 2.54s	remaining: 2m 47s
15:	learn: 0.2938399	total: 2.71s	remaining: 2m 46s
16:	learn: 0.2920316	total: 2.88s	remaining: 2m 46s
17:	learn: 0.2905071	total: 3.05s	remaining: 2m 46s
18:	learn: 0.2884565	total: 3.22s	remaining: 2m 46s
19:	learn: 0.2866807	to