In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
df_train=pd.read_csv('train.csv')

In [4]:
X_train_all=df_train.iloc[:,1:-2]
formation_e=df_train.iloc[:,-2]
bandgap_e=df_train.iloc[:,-1]
print(X_train_all.shape)
print(formation_e.shape)

(2400, 11)
(2400,)


In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, fe_train,fe_test,bg_train,bg_test =\
    train_test_split(X_train_all, formation_e, bandgap_e, test_size=0.3, random_state=101)
print(X_train.shape)
print(fe_train.shape)

(1680, 11)
(1680,)


In [6]:
mean=X_train.mean(axis=0)
std=X_train.std(axis=0)

X_train-=mean
X_test-=mean

X_train/=std
X_test/=std

#the max for the lattice_angle_alpha_degree is a bit large, looking at the histogram of this column
#shows that it is a bit skewed. maybe it is becase of the discontinuity in the angle from 0 to 360 degrees
#reparametrization of the data might help

print(X_train_all.shape)
print(X_train.shape)
print(fe_train.shape)

(2400, 11)
(1680, 11)
(1680,)


In [7]:
#from sklearn.manifold import Isomap
#model = Isomap(n_neighbors=25, n_components=3)
#X_embedded = model.fit_transform(X_train)
#X_embedded_test = model.transform(X_test)

In [8]:
#plt.scatter(X_embedded[:, 0],X_embedded[:, 1])

In [9]:
from sklearn.cluster import KMeans

n_clusters=10
kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(X_train)

columns = list(X_train.columns)
cluster_cols = ['cl' + str(i) for i in range(n_clusters)]
cluster_df = pd.get_dummies(kmeans.labels_).values

X_train_concat = np.concatenate([X_train.values, cluster_df], axis=1)
X_train = pd.DataFrame(X_train_concat, columns=columns + cluster_cols)

print(X_train.shape)
print(X_train.head())

(1680, 21)
   spacegroup  number_of_total_atoms  percent_atom_al  percent_atom_ga  \
0    0.769588               0.808445        -1.098680         2.293487   
1    1.015640              -0.957729         1.134929        -0.508165   
2   -1.503461               0.808445        -1.451572         0.425434   
3   -1.503461               0.808445         0.194383        -0.241362   
4    0.769588               0.808445        -0.981299        -1.308576   

   percent_atom_in  lattice_vector_1_ang  lattice_vector_2_ang  \
0        -0.926240             -0.119460              1.232599   
1        -0.690075             -0.052910             -0.732131   
2         1.081353              2.578604             -0.304136   
3         0.018421              2.511686             -0.452576   
4         2.143907              0.004901              1.604988   

   lattice_vector_3_ang  lattice_angle_alpha_degree  \
0             -0.570631                   -0.194190   
1              0.279898              

In [10]:
pred=kmeans.predict(X_test)

In [11]:
cluster_test=pd.get_dummies(pred)
X_test = pd.concat([X_test, cluster_test], axis=1)
print(X_test.shape)

(1223, 21)


In [12]:
from sklearn.model_selection import GridSearchCV
import copy

import xgboost as xgb

model1 = GridSearchCV(
    xgb.XGBRegressor(),
    param_grid={'max_depth':[1,2,3,4,5,6,7],'n_estimators':[50,100,200]},
    scoring='r2',
    verbose=1,
    n_jobs=4,
)
model2 = copy.deepcopy(model1)

In [13]:
model1.fit(X_train,fe_train )

print(model1.best_score_ ,model1.best_params_)
from pprint import pprint
pprint(sorted(zip(X_train.columns, model1.best_estimator_.feature_importances_), key=lambda x: -x[1]))

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=4)]: Done  56 out of  63 | elapsed:    3.3s remaining:    0.4s


0.8306432774130252 {'max_depth': 3, 'n_estimators': 100}
[('percent_atom_in', 0.1527559),
 ('percent_atom_al', 0.14173229),
 ('lattice_angle_gamma_degree', 0.10393701),
 ('lattice_vector_2_ang', 0.096062995),
 ('lattice_vector_3_ang', 0.09448819),
 ('lattice_angle_alpha_degree', 0.08661418),
 ('percent_atom_ga', 0.08503937),
 ('lattice_vector_1_ang', 0.08503937),
 ('lattice_angle_beta_degree', 0.080314964),
 ('spacegroup', 0.05511811),
 ('number_of_total_atoms', 0.014173228),
 ('cl6', 0.0031496063),
 ('cl9', 0.0015748031),
 ('cl0', 0.0),
 ('cl1', 0.0),
 ('cl2', 0.0),
 ('cl3', 0.0),
 ('cl4', 0.0),
 ('cl5', 0.0),
 ('cl7', 0.0),
 ('cl8', 0.0)]


[Parallel(n_jobs=4)]: Done  63 out of  63 | elapsed:    3.8s finished


In [14]:
model2.fit(X_train, bg_train)

print(model2.best_score_ ,model2.best_params_)
from pprint import pprint
pprint(sorted(zip(X_train.columns, model2.best_estimator_.feature_importances_), key=lambda x: -x[1]))

Fitting 3 folds for each of 21 candidates, totalling 63 fits


[Parallel(n_jobs=4)]: Done  63 out of  63 | elapsed:    3.9s finished


0.9460860792980776 {'max_depth': 3, 'n_estimators': 200}
[('percent_atom_al', 0.15432099),
 ('lattice_angle_gamma_degree', 0.14429012),
 ('lattice_angle_alpha_degree', 0.11419753),
 ('lattice_vector_3_ang', 0.11265432),
 ('lattice_vector_1_ang', 0.10570987),
 ('percent_atom_in', 0.093364194),
 ('lattice_angle_beta_degree', 0.081018515),
 ('percent_atom_ga', 0.07330247),
 ('lattice_vector_2_ang', 0.063271604),
 ('spacegroup', 0.044753086),
 ('cl6', 0.0046296297),
 ('cl0', 0.0030864198),
 ('cl5', 0.0030864198),
 ('number_of_total_atoms', 0.0023148148),
 ('cl1', 0.0),
 ('cl2', 0.0),
 ('cl3', 0.0),
 ('cl4', 0.0),
 ('cl7', 0.0),
 ('cl8', 0.0),
 ('cl9', 0.0)]


In [21]:
#APPLYING SUPPORT VECTOR REGRESSOR
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

parameters = {'kernel':['rbf','poly'], 'C':[10, 50, 100],'gamma':[0.001,.01,.1,.5]}
svr = SVR()
clf = GridSearchCV(svr, parameters, cv=5, scoring='r2', verbose=1, n_jobs=4)
clf.fit(X_train,fe_train)


Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=4)]: Done 120 out of 120 | elapsed:  7.0min finished


GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params=None, iid=True, n_jobs=4,
       param_grid={'kernel': ['rbf', 'poly'], 'C': [10, 50, 100], 'gamma': [0.001, 0.01, 0.1, 0.5]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='r2', verbose=1)

In [22]:
print(clf.best_score_ ,clf.best_params_)


0.6506056550180294 {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
