In [174]:
import os
import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA
# from modules.preprocessing import make_cube, get_pos_lattice
# from modules.preprocessing import atom_list

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from modules.data_processing import make_Xy, post_process, RMSLE
import xgboost
from sklearn.svm import SVR 
from sklearn.model_selection import GridSearchCV


In [142]:
DATA_PATH = './nomad2018-predict-transparent-conductors'

In [143]:
train_all_data = pd.read_csv(f'{DATA_PATH}/train.csv')
test_all_data = pd.read_csv(f'{DATA_PATH}/test.csv')

df = pd.concat((train_all_data, test_all_data), ignore_index=True)
df

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001,,
2996,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004,,
2997,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857,,
2998,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007,,


In [144]:
# one-hot encoding for spacegroup
spacegroup_labels = LabelEncoder().fit_transform(df['spacegroup'])
spacegroup_labels = spacegroup_labels.reshape(len(spacegroup_labels), 1)

spacegroup_onehot = OneHotEncoder(sparse=False).fit_transform(spacegroup_labels)


In [148]:
df[['sg1','sg2','sg3','sg4','sg5','sg6']] = pd.DataFrame(spacegroup_onehot, index=df.index)

In [149]:
df

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev,sg1,sg2,sg3,sg4,sg5,sg6
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387,0.0,1.0,0.0,0.0,0.0,0.0
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210,0.0,0.0,0.0,1.0,0.0,0.0
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438,0.0,0.0,0.0,0.0,0.0,1.0
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492,0.0,0.0,1.0,0.0,0.0,0.0
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001,,,1.0,0.0,0.0,0.0,0.0,0.0
2996,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004,,,0.0,1.0,0.0,0.0,0.0,0.0
2997,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857,,,0.0,0.0,0.0,1.0,0.0,0.0
2998,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007,,,0.0,1.0,0.0,0.0,0.0,0.0


In [150]:
df['num_al'] = df['percent_atom_al'] * df['number_of_total_atoms']
df['num_ga'] = df['percent_atom_ga'] * df['number_of_total_atoms']
df['num_in'] = df['percent_atom_in'] * df['number_of_total_atoms']

df

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,...,bandgap_energy_ev,sg1,sg2,sg3,sg4,sg5,sg6,num_al,num_ga,num_in
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,...,3.4387,0.0,1.0,0.0,0.0,0.0,0.0,50.0,30.000,0.000
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,...,2.9210,0.0,0.0,0.0,1.0,0.0,0.0,50.0,30.000,0.000
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,...,2.7438,0.0,0.0,0.0,0.0,0.0,1.0,32.5,7.500,0.000
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,...,3.3492,0.0,0.0,1.0,0.0,0.0,0.0,22.5,0.000,7.500
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,...,1.3793,0.0,0.0,0.0,1.0,0.0,0.0,0.0,50.000,30.000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,...,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,47.504,32.496
2996,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,...,,0.0,1.0,0.0,0.0,0.0,0.0,5.0,0.000,35.000
2997,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,...,,0.0,0.0,0.0,1.0,0.0,0.0,0.0,20.000,60.000
2998,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,...,,0.0,1.0,0.0,0.0,0.0,0.0,25.0,0.000,15.000


In [151]:
df_others = df[[c for c in df.columns if c not in ['id','spacegroup']]]
df_others = df_others/df_others.max()
df_others

Unnamed: 0,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,...,bandgap_energy_ev,sg1,sg2,sg3,sg4,sg5,sg6,num_al,num_ga,num_in
0,1.000,0.6250,0.3750,0.0000,0.399476,0.831006,0.362089,0.889092,0.847733,0.749677,...,0.650517,0.0,1.0,0.0,0.0,0.0,0.0,0.62500,0.37500,0.00000
1,1.000,0.6250,0.3750,0.0000,0.248220,0.600935,0.932246,0.889250,0.847693,0.999571,...,0.552581,0.0,0.0,0.0,1.0,0.0,0.0,0.62500,0.37500,0.00000
2,0.500,0.8125,0.1875,0.0000,0.391396,0.549984,0.550896,0.898637,0.858287,0.254206,...,0.519059,0.0,0.0,0.0,0.0,0.0,1.0,0.40625,0.09375,0.00000
3,0.375,0.7500,0.0000,0.2500,0.200840,0.486225,0.533883,0.888956,0.847824,0.999564,...,0.633586,0.0,0.0,1.0,0.0,0.0,0.0,0.28125,0.00000,0.09375
4,1.000,0.0000,0.6250,0.3750,0.267382,0.647328,0.969830,0.889027,0.847717,0.999461,...,0.260930,0.0,0.0,0.0,1.0,0.0,0.0,0.00000,0.62500,0.37500
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,1.000,0.0000,0.5938,0.4062,0.996030,0.621595,0.248296,0.889068,0.986861,0.749663,...,,1.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.59380,0.40620
2996,0.500,0.1250,0.0000,0.8750,0.223908,0.921732,0.398907,0.889074,0.847680,0.749666,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.06250,0.00000,0.43750
2997,1.000,0.0000,0.2500,0.7500,0.278473,0.674149,0.988878,0.889137,0.847598,0.999431,...,,0.0,0.0,0.0,1.0,0.0,0.0,0.00000,0.25000,0.75000
2998,0.500,0.6250,0.0000,0.3750,0.208085,0.861578,0.374639,0.889107,0.847720,0.749668,...,,0.0,1.0,0.0,0.0,0.0,0.0,0.31250,0.00000,0.18750


In [154]:
# add cube_PCA to X and Xsub
cube_PCA = np.load('./nomad2018-predict-transparent-conductors/cube_PCA.npy').transpose()
cube_PCA = cube_PCA / np.max(cube_PCA)

# X = np.hstack((X, cube_PCA[:2400, ]))
# Xsub = np.hstack((Xsub, cube_PCA[-600:, ]))

In [157]:
df_others[[f'pca{i}' for i in range(100)]] = pd.DataFrame(cube_PCA, index=df_others.index)

  self[k1] = value[k2]


In [160]:
traindf = df_others[~df_others['formation_energy_ev_natom'].isnull()]
result_cols = ['formation_energy_ev_natom','bandgap_energy_ev']

X_train, y_train = traindf[[c for c in traindf.columns if c not in result_cols]], traindf[result_cols]

X_test = df_others[df_others['formation_energy_ev_natom'].isnull()][[c for c in traindf.columns if c not in result_cols]]

In [170]:
def RMSLE(y, pred):
    rmsle = []
    for icol in range(y.shape[1]):
        diff = (np.log(1+pred.iloc[:, icol]) - np.log(1+y.iloc[:, icol]))**2
        rmsle.append(np.sqrt(diff[~diff.isnull()].mean()))
    return rmsle



In [177]:
model = xgboost.XGBRegressor()
xgb_params={
    'n_estimators':[500,1000,1500],
    'max_depth':[6,7,8],
    'eta':[0.1],
    'subsample':[0.7],
    'colsample_bytree':[0.8]
}

In [179]:
clf = GridSearchCV(
    model,
    xgb_params,
    cv=3
)
clf.fit(X_train, y_train.iloc[:,0])