In [34]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error,  mean_squared_log_error

DATA_PATH = './nomad2018-predict-transparent-conductors'

# Preprocessing

In [23]:
train_data = pd.read_csv(f'{DATA_PATH}/train.csv')
test_data = pd.read_csv(f'{DATA_PATH}/test.csv')

In [25]:
train_data = train_data.drop(['id'], axis=1)
test_data = test_data.drop(['id'], axis=1)
train_data.head()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793


In [45]:
def get_volume(lv1, lv2, lv3, alpha, beta, gamma):
    s = np.ones(alpha.shape) + 2*np.cos(alpha)*np.cos(beta)*np.cos(gamma) - np.square(np.cos(alpha))- np.square(np.cos(beta))- np.square(np.cos(gamma))
    result = lv1*lv2*lv3*np.sqrt(s)
    # can be na as some sqrt's are done on negative numbers which give NaN value.
    return result.fillna(result.mean())


train_volume = get_volume(
    train_data['lattice_vector_1_ang'],
    train_data['lattice_vector_2_ang'],
    train_data['lattice_vector_3_ang'],
    train_data['lattice_angle_alpha_degree'],
    train_data['lattice_angle_beta_degree'],
    train_data['lattice_angle_gamma_degree'],
)
test_volume = get_volume(
    test_data['lattice_vector_1_ang'],
    test_data['lattice_vector_2_ang'],
    test_data['lattice_vector_3_ang'],
    test_data['lattice_angle_alpha_degree'],
    test_data['lattice_angle_beta_degree'],
    test_data['lattice_angle_gamma_degree'],
)
train_data['volume'] = train_volume
test_data['volume'] = test_volume

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [46]:
train_data.head()

Unnamed: 0,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev,volume
0,33,80.0,0.625,0.375,0.0,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.068,3.4387,358.004815
1,194,80.0,0.625,0.375,0.0,6.184,6.1838,23.6287,90.0186,89.998,120.0025,0.249,2.921,462.04017
2,227,40.0,0.8125,0.1875,0.0,9.751,5.6595,13.963,90.9688,91.1228,30.5185,0.1821,2.7438,371.996916
3,167,30.0,0.75,0.0,0.25,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492,173.807109
4,194,80.0,0.0,0.625,0.375,6.6614,6.6612,24.5813,89.996,90.0006,119.9893,0.0505,1.3793,551.194843


# Training

## Simple validation split

In [29]:
target_cols = ['formation_energy_ev_natom','bandgap_energy_ev']
feature_cols = [c for c in train_data.columns if c not in target_cols]
print(target_cols)
print(feature_cols)

['formation_energy_ev_natom', 'bandgap_energy_ev']
['spacegroup', 'number_of_total_atoms', 'percent_atom_al', 'percent_atom_ga', 'percent_atom_in', 'lattice_vector_1_ang', 'lattice_vector_2_ang', 'lattice_vector_3_ang', 'lattice_angle_alpha_degree', 'lattice_angle_beta_degree', 'lattice_angle_gamma_degree', 'volume']


In [31]:
X_train, X_test, y_train, y_test = train_test_split(train_data[feature_cols], train_data[target_cols], test_size = 0.30)

In [47]:
# for formation energy
colname = 'formation_energy_ev_natom'
rf_formation_energy = RandomForestRegressor(n_estimators=100)
rf_formation_energy.fit(X_train, y_train[colname])
test_pred = rf_formation_energy.predict(X_test)

score = rf_formation_energy.score(X_test, y_test[colname])
mse = mean_squared_error(y_test[colname], test_pred)
msle = mean_squared_log_error(y_test[colname], test_pred)

rmse = np.sqrt(mse)
rmsle = np.sqrt(msle)

print(f'Score: {score}')
print(f'RMSLE: {rmsle}')

Score: 0.8084817686614452
RMSLE: 0.036948806058882115


In [48]:
# for bandgap energy
colname = 'bandgap_energy_ev'
rf_bandgap_energy = RandomForestRegressor(n_estimators=100)
rf_bandgap_energy.fit(X_train, y_train[colname])
test_pred = rf_bandgap_energy.predict(X_test)

score = rf_bandgap_energy.score(X_test, y_test[colname])
mse = mean_squared_error(y_test[colname], test_pred)
msle = mean_squared_log_error(y_test[colname], test_pred)

rmse = np.sqrt(mse)
rmsle = np.sqrt(msle)

print(f'Score: {score}')
print(f'RMSLE: {rmsle}')

Score: 0.9368000942800468
RMSLE: 0.09615171540286224


# Create submission

In [44]:
pd.DataFrame({
    'id': list(range(1, test_data.shape[0]+1)),
    'formation_energy_ev_natom':rf_formation_energy.predict(test_data),
    'bandgap_energy_ev':rf_bandgap_energy.predict(test_data)
}).to_csv('submissions/rohan_random_forest.csv', index=False)