In [6]:

import numpy as np
import pandas as pd
import random
from sklearn.decomposition import PCA

from modules.preprocessing import PrePro
# from modules.preprocessing import atom_list
from modules.data_processing import make_Xy, post_process, RMSLE

from sklearn.svm import SVR 
import xgboost

In [3]:
# Build one cube per structure (train first, then test), cache the stacked
# array on disk, then compress every flattened cube to a few PCA coefficients.
DATA_DIR = "./nomad2018-predict-transparent-conductors"
N_COMPONENTS = 100
PCA_SEED = 0
# (sub-directory, number of structures). Order matters: downstream cells
# expect the train rows first and the test rows last.
SPLITS = (("train", 2400), ("test", 600))

prepro = PrePro()

# Now generate cubes
cube_list = []
for split, n_samples in SPLITS:
    for i in range(n_samples):
        if i % 100 == 0:
            print(i, '/', n_samples)
        # structure folders are numbered starting at 1
        filename = "{}/{}/{}/geometry.xyz".format(DATA_DIR, split, i + 1)
        cube_list.append(prepro.make_cube(filename))

# save cube
cube = np.array(cube_list)
np.save(DATA_DIR + "/cube.npy", cube)

# perform PCA, and keep only 100 principal components for SVR
# One flattened row per structure; the row count follows the data instead of
# being hard-coded.
cube_vec = np.reshape(cube, (len(cube_list), -1))
pca = PCA(n_components=N_COMPONENTS, random_state=PCA_SEED)
pca.fit(cube_vec)

# The principal axes are orthonormal, so projecting the centered data with
# `transform` gives the same coefficients as solving the least-squares system
# against `components_`, without the extra solve.
cube_scores = pca.transform(cube_vec)

# Stored as (components, structures), the layout the loading cell transposes.
np.save(DATA_DIR + '/cube_PCA.npy', cube_scores.T)

0 / 2400
100 / 2400
200 / 2400
300 / 2400
400 / 2400
500 / 2400
600 / 2400
700 / 2400
800 / 2400
900 / 2400
1000 / 2400
1100 / 2400
1200 / 2400
1300 / 2400
1400 / 2400
1500 / 2400
1600 / 2400
1700 / 2400
1800 / 2400
1900 / 2400
2000 / 2400
2100 / 2400
2200 / 2400
2300 / 2400
0 / 600
100 / 600
200 / 600
300 / 600
400 / 600
500 / 600




In [7]:
# get data from csv
X, y, Xsub, id_sub = make_Xy()

# add cube_PCA to X and Xsub
# After the transpose there is one row per structure: train rows first,
# submission rows last. Values are scaled by the global maximum.
cube_PCA = np.load('./nomad2018-predict-transparent-conductors/cube_PCA.npy').transpose()
cube_PCA = cube_PCA / np.max(cube_PCA)

# Derive the split from the tabular data rather than hard-coding 2400/600, and
# fail loudly if the cached PCA file does not match the csv row counts.
n_train, n_sub = X.shape[0], Xsub.shape[0]
if cube_PCA.shape[0] != n_train + n_sub:
    raise ValueError(
        "cube_PCA has {} rows but X/Xsub have {} + {} rows".format(
            cube_PCA.shape[0], n_train, n_sub))

X = np.hstack((X, cube_PCA[:n_train]))
Xsub = np.hstack((Xsub, cube_PCA[n_train:]))

In [9]:
# Random search over SVR hyperparameters, scored with interleaved 5-fold CV.
# Fills `error0` / `error1` with the per-target RMSLE of every
# (C, epsilon, gamma) combination that was tried.
dimX = X.shape[1]
N = X.shape[0]

N_TRIALS = 15
N_FOLDS = 5
SEARCH_SEED = 0  # fixed so Restart & Run All evaluates the same candidates

C_list = [1.0 * 1.5**p for p in range(8,11)]
epsilon_list = [0.1 * 1.5**p for p in range(-8,-3)]
gamma_list = [1.0 * 1.5**p for p in range(0,5)]

# Draw distinct grid points up front (sampling without replacement) instead of
# redrawing until an unseen combination turns up; this also cannot spin
# forever when N_TRIALS exceeds the grid size.
grid = [(C, epsilon, gamma)
        for C in C_list for epsilon in epsilon_list for gamma in gamma_list]
rng = np.random.RandomState(SEARCH_SEED)
trial_indices = rng.choice(len(grid), size=min(N_TRIALS, len(grid)), replace=False)

error0 = {}
error1 = {}
current_min  = np.inf

# Models are fit on log(1 + y); the transform does not depend on the fold.
y_log = np.log(1+y)

for k in trial_indices:
    t = grid[k]
    C, epsilon, gamma = t

    # out-of-fold predictions, one column per target
    pred = np.full((N, 2), np.nan)
    for iCV in range(N_FOLDS):
        # every N_FOLDS-th row, starting at iCV, is held out
        ind_test = np.zeros(N, dtype=bool)
        ind_test[iCV::N_FOLDS] = True

        X_train, X_test = X[~ind_test, :], X[ind_test, :]

        for j in range(2):
            # gamma is scaled by the number of features
            clf = SVR(C=C, epsilon=epsilon, gamma=gamma/float(dimX), kernel='rbf')
            clf.fit(X_train, y_log[~ind_test, j])
            pred[ind_test, j] = clf.predict(X_test)

    # post process
    pred = post_process(pred)

    error0[t], error1[t] = RMSLE(y, pred)
    # best score so far, taking the best setting of each target independently
    current_min = 0.5*(min(error0.values()) + min(error1.values()))

    print("trial", len(error0), " - ", "current min", '{0:.4f}'.format(current_min))

trial 1  -  current min 0.0528
trial 2  -  current min 0.0528
trial 3  -  current min 0.0528
trial 4  -  current min 0.0528
trial 5  -  current min 0.0528
trial 6  -  current min 0.0528
trial 7  -  current min 0.0528
trial 8  -  current min 0.0528
trial 9  -  current min 0.0528
trial 10  -  current min 0.0528
trial 11  -  current min 0.0528
trial 12  -  current min 0.0528
trial 13  -  current min 0.0528
trial 14  -  current min 0.0528
trial 15  -  current min 0.0528


In [10]:
# get best hyperparameters for each output
(C0, epsilon0, gamma0) = min(error0, key=error0.get) 
(C1, epsilon1, gamma1) = min(error1, key=error1.get) 

# train SVR
#clf0 = SVR(C=C0, epsilon=epsilon0, gamma=gamma0/float(dimX), kernel='rbf')
clf0 = xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
clf0.fit(X, np.log(1+y[:, 0]) )

#clf1 = SVR(C=C1, epsilon=epsilon1, gamma=gamma1/float(dimX), kernel='rbf')
clf1 = xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
clf1.fit(X, np.log(1+y[:, 1]) )

# predict submission samples
ysub0 = clf0.predict(Xsub)
ysub1 = clf1.predict(Xsub)

# post process
ysub0 = post_process(ysub0)
ysub1 = post_process(ysub1)    

# write submission
submission = pd.concat([pd.DataFrame(id_sub), pd.DataFrame(ysub0), pd.DataFrame(ysub1)], axis=1)
submission.columns = ['id','formation_energy_ev_natom', 'bandgap_energy_ev']
submission.to_csv('submission_xgboost.csv', index = False)