In [127]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
# from modules.preprocessing import make_cube, get_pos_lattice
# from modules.preprocessing import atom_list

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
# from modules.data_processing import make_Xy, post_process, RMSLE
import xgboost
from sklearn.svm import SVR 
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from sklearn.kernel_ridge import KernelRidge
from sklearn.gaussian_process import GaussianProcessRegressor
# import lightgbm as lgb
from modules.preprocessing import PrePro

In [2]:
# Root folder of the competition data; the cells below read the train/test
# geometry files and CSV tables from it and write the cached cube arrays to it.
DATA_PATH = './nomad2018-predict-transparent-conductors'

# Preprocess data


## Create cube

In [128]:
prepro = PrePro()

# Number of structures per split (geometry folders are numbered from 1) and
# number of principal components kept for the downstream models.
N_TRAIN = 2400
N_TEST = 600
N_PCA_COMPONENTS = 100

# Now generate cubes: train structures first, then test structures, so the row
# order matches the train/test concatenation used for the tabular features.
cube_list = []
for split, n_structures in (('train', N_TRAIN), ('test', N_TEST)):
    for i in range(n_structures):
        if i % 100 == 0:
            print(i, '/', n_structures)
        filename = f"{DATA_PATH}/{split}/{i+1}/geometry.xyz"
        cube_list.append(prepro.make_cube(filename))

# save cube
cube = np.array(cube_list)
np.save(f"{DATA_PATH}/cube.npy", cube)

# Flatten every cube into one row vector; the row count is taken from the data
# instead of being hard-coded.
cube_vec = np.reshape(cube, (len(cube), -1))

# PCA may select a randomized SVD solver for inputs of this size, so the seed
# is fixed to keep the saved components reproducible across runs.
pca = PCA(n_components=N_PCA_COMPONENTS, random_state=0)
pca.fit(cube_vec)

# The principal axes are orthonormal, so pca.transform yields the same
# coefficients as a least-squares solve against components_.T.
# The result is stored as (components, samples), which is the layout the
# loading cell expects before it transposes the array.
cube_PCA = pca.transform(cube_vec).T
np.save(f'{DATA_PATH}/cube_PCA.npy', cube_PCA)

0 / 2400
100 / 2400
200 / 2400
300 / 2400
400 / 2400
500 / 2400
600 / 2400
700 / 2400
800 / 2400
900 / 2400
1000 / 2400
1100 / 2400
1200 / 2400
1300 / 2400
1400 / 2400
1500 / 2400
1600 / 2400
1700 / 2400
1800 / 2400
1900 / 2400
2000 / 2400
2100 / 2400
2200 / 2400
2300 / 2400
0 / 600
100 / 600
200 / 600
300 / 600
400 / 600
500 / 600




## Build final df to train on

In [171]:
# Read the labelled (train) and unlabelled (test) tables.
train_all_data = pd.read_csv(f'{DATA_PATH}/train.csv')
test_all_data = pd.read_csv(f'{DATA_PATH}/test.csv')

# Stack train on top of test under a fresh 0..N-1 index; the test rows carry
# NaN in the two target columns.
df = pd.concat(
    [train_all_data, test_all_data],
    ignore_index=True,
)
df

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001,,
2996,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004,,
2997,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857,,
2998,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007,,


In [186]:
# Start from an empty frame; the engineered feature columns are added to it
# by the following cells.
df_all = pd.DataFrame()

In [187]:
# one-hot encoding for spacegroup
spacegroup_labels = LabelEncoder().fit_transform(df['spacegroup'])
spacegroup_labels = spacegroup_labels.reshape(len(spacegroup_labels), 1)

spacegroup_onehot = OneHotEncoder(sparse=False).fit_transform(spacegroup_labels)


In [188]:
# Spacegroup indicator columns (one per one-hot column).
df_all[['sg1','sg2','sg3','sg4','sg5','sg6']] = pd.DataFrame(spacegroup_onehot, index=df.index)

# Absolute atom counts per element: atomic fraction times total atom count.
for count_col, fraction_col in (
    ('num_in', 'percent_atom_in'),
    ('num_ga', 'percent_atom_ga'),
    ('num_al', 'percent_atom_al'),
):
    df_all[count_col] = df[fraction_col] * df['number_of_total_atoms']

# Raw tabular columns copied through unchanged, in the order the models see them.
for passthrough_col in (
    'percent_atom_in',
    'percent_atom_al',
    'percent_atom_ga',
    'number_of_total_atoms',
    'lattice_vector_3_ang',
    'lattice_vector_2_ang',
    'lattice_vector_1_ang',
    'lattice_angle_gamma_degree',
    'lattice_angle_beta_degree',
    'lattice_angle_alpha_degree',
):
    df_all[passthrough_col] = df[passthrough_col]

# Cosine and sine of each lattice angle (degrees converted to radians first).
angle_by_suffix = (
    ('3', 'lattice_angle_gamma_degree'),
    ('2', 'lattice_angle_beta_degree'),
    ('1', 'lattice_angle_alpha_degree'),
)
for prefix, trig in (('cos', np.cos), ('sin', np.sin)):
    for suffix, angle_col in angle_by_suffix:
        df_all[prefix + suffix] = trig(np.pi/180.0*df[angle_col])


In [189]:
# Scale every feature column by its own maximum, taken over the combined
# train and test rows.
column_max = df_all.max()
df_all = df_all.div(column_max, axis='columns')

In [190]:
# add cube_PCA to X and Xsub
cube_PCA = np.load('./nomad2018-predict-transparent-conductors/cube_PCA.npy').transpose()
cube_PCA = cube_PCA / np.max(cube_PCA)

df_all[[f'pca{i}' for i in range(100)]] = pd.DataFrame(cube_PCA, index=df_all.index)

  self[k1] = value[k2]


In [191]:
# Sanity check: (rows, feature columns) of the combined train + test feature table.
df_all.shape

(3000, 125)

In [192]:
# Keep the two regression targets in their own frame, aligned row-for-row
# with df (they are NaN for the test rows).
df_pred_var = df[['formation_energy_ev_natom', 'bandgap_energy_ev']].copy()

In [193]:
result_cols = ['formation_energy_ev_natom','bandgap_energy_ev']

# Rows without a formation energy are the test rows to predict.
is_unlabelled = df_pred_var['formation_energy_ev_natom'].isnull()

# Labelled rows: features and matching targets.
traindf = df_all[~is_unlabelled]
traindf_pred = df_pred_var[~is_unlabelled]
X_train, y_train = traindf, traindf_pred

# Unlabelled rows, restricted to the training feature columns with any target
# column filtered out.
feature_cols = [c for c in traindf.columns if c not in result_cols]
X_test = df_all[is_unlabelled][feature_cols]

# Grid search

Search for the best XGBoost hyperparameters for both the formation energy and the bandgap energy models.

In [153]:
def RMSLE(y, pred):
    """
        Root mean squared logarithmic error.

        Computes sqrt(mean((log(1 + pred) - log(1 + y))**2)).

        y    : array-like of true values
        pred : array-like of predicted values, same length as y

        Both inputs are converted to arrays, so plain sequences are accepted
        and values are compared by position.
    """
    y = np.asarray(y)
    pred = np.asarray(pred)
    # log1p keeps precision for values close to zero, where log(1 + x) rounds
    # 1 + x to exactly 1.
    log_diff = (np.log1p(pred) - np.log1p(y))**2
    return np.sqrt(log_diff.mean())


In [154]:
# Base estimator with default settings; it is passed to the grid searches.
model = xgboost.XGBRegressor()

# Hyperparameter grid: only the tree count and depth vary, the other three
# settings are pinned to a single value.
xgb_params = dict(
    n_estimators=[500, 1000, 1500],
    max_depth=[6, 7, 8, 9],
    eta=[0.1],
    subsample=[0.7],
    colsample_bytree=[0.8],
)

In [155]:
def _rmse_log_space(y_log, pred_log):
    """
        Root mean squared error between values already in log(1 + y) space.

        The grid searches fit on np.log(1 + y) targets, so both arguments the
        scorer receives are log-transformed. Plain RMSE here equals RMSLE on
        the original scale; scoring with RMSLE would apply the log a second time.
    """
    diff = np.asarray(pred_log) - np.asarray(y_log)
    return np.sqrt(np.mean(diff**2))


# Lower error is better, so sklearn negates the value when ranking candidates.
scorer = make_scorer(_rmse_log_space, greater_is_better=False)

In [156]:
# Formation energy is the first target column; it is fitted in log(1 + y) space.
formation_log_target = np.log(1+y_train.iloc[:,0])

# 3-fold cross-validated search over the XGBoost grid.
f_clf = GridSearchCV(estimator=model, param_grid=xgb_params, scoring=scorer, cv=3)
f_clf.fit(X_train, formation_log_target)

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                 

In [157]:
# Hyperparameter combination selected by the formation-energy grid search.
f_clf.best_params_

{'colsample_bytree': 0.8,
 'eta': 0.1,
 'max_depth': 6,
 'n_estimators': 500,
 'subsample': 0.7}

In [None]:
# Bandgap energy is the second target column; it is fitted in log(1 + y) space.
bandgap_log_target = np.log(1+y_train.iloc[:,1])

# Same 3-fold search as for the formation energy, on the bandgap target.
b_clf = GridSearchCV(estimator=model, param_grid=xgb_params, scoring=scorer, cv=3)
b_clf.fit(X_train, bandgap_log_target)

GridSearchCV(cv=3,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None,
                                    enable_categorical=False, gamma=None,
                                    gpu_id=None, importance_type=None,
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n...
                                    num_parallel_tree=None, predictor=None,
                                    random_state=None, reg_alpha=None,
                                    reg_lambda=None, scale_pos_weight=None,
                                 

In [None]:
# Hyperparameter combination selected by the bandgap-energy grid search.
b_clf.best_params_

{'colsample_bytree': 0.8,
 'eta': 0.1,
 'max_depth': 6,
 'n_estimators': 500,
 'subsample': 0.7}

# Train and make submission

In [195]:
# Sanity check: (labelled rows, feature columns) of the training matrix.
X_train.shape

(2400, 125)

In [196]:
# Both targets use the same XGBoost configuration.
# NOTE(review): the grid-search output shown earlier reports n_estimators=500
# and max_depth=6 as best, while 1000 and 7 are used here. Confirm this is intentional.
final_xgb_params = dict(
    n_estimators=1000,
    max_depth=7,
    eta=0.1,
    subsample=0.7,
    colsample_bytree=0.8,
)
f_model = xgboost.XGBRegressor(**final_xgb_params)
b_model = xgboost.XGBRegressor(**final_xgb_params)

# Alternatives kept for reference:
# f_model = KernelRidge(alpha=0.1, kernel='rbf')
# b_model = KernelRidge(alpha=0.1, kernel='rbf')
# f_model = GaussianProcessRegressor()
# b_model = GaussianProcessRegressor()

# Fit on log(1 + y): column 0 is the formation energy, column 1 the bandgap.
formation_log_target = np.log(1+y_train.iloc[:,0])
bandgap_log_target = np.log(1+y_train.iloc[:,1])

f_model.fit(X_train, formation_log_target)
b_model.fit(X_train, bandgap_log_target)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, enable_categorical=False,
             eta=0.1, gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.100000001,
             max_delta_step=0, max_depth=7, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=1000, n_jobs=12,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [197]:
def post_process(pred):
    """
        Postprocessing = undo log transform + threshold at 0

        pred : array-like of predictions in log(1 + y) space

        Returns an array of predictions on the original scale. Negative
        log-space predictions are raised to 0 first, so the result is never
        below 0. The input dtype is preserved for floating-point arrays.
    """
    # Clamp before inverting so that negative inputs map to exactly 0.
    pred = np.maximum(np.asarray(pred), 0.0)
    # expm1 avoids the cancellation in exp(x) - 1 for x close to zero.
    return np.expm1(pred)

In [198]:
# Predict in log space with each fitted model, then map back to the original
# scale: formation energy first, bandgap second.
f_submission, b_submission = (
    post_process(fitted_model.predict(X_test))
    for fitted_model in (f_model, b_model)
)


In [199]:
SUBMISSION_PATH = 'submissions/cube_method.csv'

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(SUBMISSION_PATH), exist_ok=True)

# Ids are assigned 1..N in the order of the test rows.
submission = pd.DataFrame({
    'id': list(range(1, len(f_submission)+1)),
    'formation_energy_ev_natom': f_submission,
    'bandgap_energy_ev': b_submission
})
submission.to_csv(SUBMISSION_PATH, index=False)