In [1]:

import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA

from modules.preprocessing import PrePro
# from modules.preprocessing import atom_list
from modules.data_processing import make_Xy, post_process, RMSLE

from sklearn.svm import SVR 
import xgboost

In [3]:
# Build a "cube" descriptor for every structure with PrePro.make_cube, then
# compress the flattened cubes to a small number of PCA scores.
prepro = PrePro()

# Dataset layout: <DATA_DIR>/<split>/<1-based id>/geometry.xyz
DATA_DIR = "./nomad2018-predict-transparent-conductors"
N_TRAIN = 2400
N_TEST = 600
N_COMPONENTS = 100  # principal components kept for the regressors
PCA_SEED = 0        # PCA may select a randomized SVD solver; pin it for reproducible features


def _load_cubes(split, count):
    """Return the cubes for structures 1..count of one split ("train" or "test").

    Progress is printed every 100 structures.
    """
    cubes = []
    for i in range(count):
        if i % 100 == 0:
            print(i, '/', count)
        filename = "{}/{}/{}/geometry.xyz".format(DATA_DIR, split, i + 1)
        cubes.append(prepro.make_cube(filename))
    return cubes


# Train cubes first, then test cubes: the feature-loading cell slices the
# PCA scores by position (first N_TRAIN rows / last N_TEST rows).
cube_list = _load_cubes("train", N_TRAIN) + _load_cubes("test", N_TEST)

# save cube
cube = np.array(cube_list)
np.save(DATA_DIR + "/cube.npy", cube)

# Flatten every cube to one row; the row count is taken from the data instead
# of being hard-coded.
cube_vec = np.reshape(cube, (len(cube_list), -1))
pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
pca.fit(cube_vec)

# Project onto the principal axes. With orthonormal components this equals the
# least-squares solution of components_.T @ B = (cube_vec - mean_).T, so the
# saved array keeps its (components, samples) layout.
cube_PCA = pca.transform(cube_vec).transpose()
np.save(DATA_DIR + '/cube_PCA.npy', cube_PCA)

0 / 2400
100 / 2400
200 / 2400
300 / 2400
400 / 2400
500 / 2400
600 / 2400
700 / 2400
800 / 2400
900 / 2400
1000 / 2400
1100 / 2400
1200 / 2400
1300 / 2400
1400 / 2400
1500 / 2400
1600 / 2400
1700 / 2400
1800 / 2400
1900 / 2400
2000 / 2400
2100 / 2400
2200 / 2400
2300 / 2400
0 / 600
100 / 600
200 / 600
300 / 600
400 / 600
500 / 600




In [2]:
# Tabular features, targets, submission features and submission ids
# come from the project helper make_Xy().
X, y, Xsub, id_sub = make_Xy()

# The PCA scores are stored as (components, samples); flip them to
# (samples, components) and scale by the global maximum.
cube_PCA = np.load('./nomad2018-predict-transparent-conductors/cube_PCA.npy').T
cube_PCA = cube_PCA / cube_PCA.max()

# Rows were written train-first: the leading 2400 belong to the training set,
# the trailing 600 to the submission set.
X = np.concatenate((X, cube_PCA[:2400]), axis=1)
Xsub = np.concatenate((Xsub, cube_PCA[-600:]), axis=1)

(3000, 6)
(3000, 25)


In [5]:
# Sanity check: (rows, columns) of the training feature matrix X.
X.shape

(2400, 125)

In [5]:
# Repeated shape check of the training feature matrix X (rows, columns).
X.shape

(2400, 125)

In [7]:
# Random search over SVR hyperparameters, scored by 5-fold cross-validated RMSLE.
# One SVR is fitted per target column on log(1 + y); error0 / error1 map each
# tried (C, epsilon, gamma) tuple to the score of target 0 / target 1.
dimX = X.shape[1]
N = X.shape[0]

# Geometric grids (ratio 1.5) for each hyperparameter.
C_list = [1.0 * 1.5**p for p in range(8,11)]
epsilon_list = [0.1 * 1.5**p for p in range(-8,-3)]
gamma_list = [1.0 * 1.5**p for p in range(0,5)]

N_TRIALS = 15    # number of distinct grid points to evaluate
N_FOLDS = 5
SEARCH_SEED = 0  # fixes which grid points are drawn, so the search is reproducible

error0 = {}
error1 = {}
current_min  = np.inf

# Draw distinct grid points up front (uniform, without replacement) instead of
# re-sampling until an unseen combination turns up.
grid = [(C, epsilon, gamma)
        for C in C_list
        for epsilon in epsilon_list
        for gamma in gamma_list]
search_rng = np.random.RandomState(SEARCH_SEED)
trial_order = search_rng.permutation(len(grid))[:N_TRIALS]

# Interleaved folds: sample i is held out in fold i % N_FOLDS. The masks and
# the log-transformed targets do not depend on the trial, so build them once.
fold_masks = []
for iCV in range(N_FOLDS):
    ind_test = np.tile(False, N)
    ind_test[iCV::N_FOLDS] = True
    fold_masks.append(ind_test)
y_log = np.log(1+y)

for grid_index in trial_order:
    t = grid[grid_index]
    C, epsilon, gamma = t

    # Out-of-fold predictions for both targets; every row is filled exactly once.
    pred = np.tile(np.nan, (N,2))
    for ind_test in fold_masks:
        X_train, y_train = X[~ind_test, :], y_log[~ind_test, :]
        X_test = X[ind_test, :]

        for target in range(2):
            # gamma is scaled by the feature count
            clf = SVR(C=C, epsilon=epsilon, gamma=gamma/float(dimX), kernel='rbf')
            clf.fit(X_train, y_train[:, target])
            pred[ind_test, target] = clf.predict(X_test)

    # post process
    pred = post_process(pred)

    error0[t], error1[t] = RMSLE(y, pred)
    # Best achievable average when each target uses its own best setting.
    current_min = 0.5*(min(error0.values()) + min(error1.values()))

    print("trial", len(error0), " - ", "current min", '{0:.4f}'.format(current_min))

trial 1  -  current min 0.0535
trial 2  -  current min 0.0535


KeyboardInterrupt: 

In [5]:
# Display the training feature matrix X as a DataFrame for visual inspection.
pd.DataFrame(X)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115,116,117,118,119,120,121,122,123,124
0,0.0,1.0,0.0,0.0,0.0,0.0,0.00000,0.375000,0.625000,0.0000,...,-0.002452,-0.016044,-0.052766,0.006915,0.029493,-0.007292,0.036000,-0.032594,0.050063,-0.019586
1,0.0,0.0,0.0,1.0,0.0,0.0,0.00000,0.375000,0.625000,0.0000,...,0.033004,0.000394,-0.027771,0.010385,0.044621,0.009004,-0.033127,0.037724,0.030019,-0.037387
2,0.0,0.0,0.0,0.0,0.0,1.0,0.00000,0.093750,0.406250,0.0000,...,-0.000006,-0.023564,0.018550,-0.011254,0.035884,-0.000355,0.025180,-0.017152,-0.006443,-0.004310
3,0.0,0.0,1.0,0.0,0.0,0.0,0.09375,0.000000,0.281250,0.2500,...,0.025927,-0.035263,0.053008,0.079386,-0.045858,-0.002875,-0.029347,0.008719,0.059916,0.035869
4,0.0,0.0,0.0,1.0,0.0,0.0,0.37500,0.625000,0.000000,0.3750,...,0.040145,-0.000792,0.017869,0.046774,0.023710,-0.023936,0.007820,0.003851,-0.034871,0.019494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0.0,1.0,0.0,0.0,0.0,0.0,0.00000,0.125000,0.375000,0.0000,...,0.031921,-0.003183,-0.043201,-0.005050,0.002970,-0.039720,0.005556,-0.014470,-0.009522,-0.010115
2396,0.0,0.0,1.0,0.0,0.0,0.0,0.00000,0.218738,0.156263,0.0000,...,0.041837,-0.019553,0.004207,-0.051991,0.008429,-0.057814,-0.082929,0.026217,-0.017058,-0.070163
2397,0.0,0.0,0.0,0.0,1.0,0.0,0.00000,0.562500,0.437500,0.0000,...,-0.046299,-0.062999,-0.013534,-0.019412,0.024185,-0.032461,-0.023570,0.023540,-0.060406,0.055243
2398,0.0,1.0,0.0,0.0,0.0,0.0,0.50000,0.187500,0.312500,0.5000,...,0.001559,0.046253,0.004336,0.019356,0.034060,0.007627,0.077699,0.057185,-0.010061,-0.052877


In [8]:
# Final models and submission file.
#
# The SVR variant driven by the random-search results is kept for reference
# but disabled; XGBoost regressors are trained instead.
# (C0, epsilon0, gamma0) = min(error0, key=error0.get)
# (C1, epsilon1, gamma1) = min(error1, key=error1.get)
# clf0 = SVR(C=C0, epsilon=epsilon0, gamma=gamma0/float(dimX), kernel='rbf')
# clf1 = SVR(C=C1, epsilon=epsilon1, gamma=gamma1/float(dimX), kernel='rbf')

# Two identically configured gradient-boosted regressors, one per target.
clf0, clf1 = (
    xgboost.XGBRegressor(
        n_estimators=1000,
        max_depth=7,
        eta=0.1,
        subsample=0.7,
        colsample_bytree=0.8,
    )
    for _ in range(2)
)

# Fit on the full training set against log(1 + y) of each target column.
clf0.fit(X, np.log(1 + y[:, 0]))
clf1.fit(X, np.log(1 + y[:, 1]))

# Predict the submission samples, then run the project post-processing
# on each prediction vector.
ysub0, ysub1 = clf0.predict(Xsub), clf1.predict(Xsub)
ysub0, ysub1 = post_process(ysub0), post_process(ysub1)

# Assemble id + both predictions side by side and write the CSV.
submission = pd.concat(
    [pd.DataFrame(column) for column in (id_sub, ysub0, ysub1)],
    axis=1,
)
submission.columns = ['id','formation_energy_ev_natom', 'bandgap_energy_ev']
submission.to_csv('submission_xgboost.csv', index = False)

In [10]:
# Display the training feature matrix X as a DataFrame again (same expression as the earlier inspection cell).
pd.DataFrame(X)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,115,116,117,118,119,120,121,122,123,124
0,0.0,1.0,0.0,0.0,0.0,0.0,0.00000,0.375000,0.625000,0.0000,...,-0.002452,-0.016044,-0.052766,0.006915,0.029493,-0.007292,0.036000,-0.032594,0.050063,-0.019586
1,0.0,0.0,0.0,1.0,0.0,0.0,0.00000,0.375000,0.625000,0.0000,...,0.033004,0.000394,-0.027771,0.010385,0.044621,0.009004,-0.033127,0.037724,0.030019,-0.037387
2,0.0,0.0,0.0,0.0,0.0,1.0,0.00000,0.093750,0.406250,0.0000,...,-0.000006,-0.023564,0.018550,-0.011254,0.035884,-0.000355,0.025180,-0.017152,-0.006443,-0.004310
3,0.0,0.0,1.0,0.0,0.0,0.0,0.09375,0.000000,0.281250,0.2500,...,0.025927,-0.035263,0.053008,0.079386,-0.045858,-0.002875,-0.029347,0.008719,0.059916,0.035869
4,0.0,0.0,0.0,1.0,0.0,0.0,0.37500,0.625000,0.000000,0.3750,...,0.040145,-0.000792,0.017869,0.046774,0.023710,-0.023936,0.007820,0.003851,-0.034871,0.019494
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,0.0,1.0,0.0,0.0,0.0,0.0,0.00000,0.125000,0.375000,0.0000,...,0.031921,-0.003183,-0.043201,-0.005050,0.002970,-0.039720,0.005556,-0.014470,-0.009522,-0.010115
2396,0.0,0.0,1.0,0.0,0.0,0.0,0.00000,0.218738,0.156263,0.0000,...,0.041837,-0.019553,0.004207,-0.051991,0.008429,-0.057814,-0.082929,0.026217,-0.017058,-0.070163
2397,0.0,0.0,0.0,0.0,1.0,0.0,0.00000,0.562500,0.437500,0.0000,...,-0.046299,-0.062999,-0.013534,-0.019412,0.024185,-0.032461,-0.023570,0.023540,-0.060406,0.055243
2398,0.0,1.0,0.0,0.0,0.0,0.0,0.50000,0.187500,0.312500,0.5000,...,0.001559,0.046253,0.004336,0.019356,0.034060,0.007627,0.077699,0.057185,-0.010061,-0.052877
