In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from math import sqrt
from sklearn import model_selection
from scipy.stats import spearmanr

In [None]:
# Read the per-COF input table and the table of target properties.
inp_vectors = pd.read_csv("rand_forest.csv")
target_data = pd.read_csv('properties.csv')

# Accumulators for the per-run train/test masks; they are filled by the
# split-loading loop further down.
train_masks, test_masks = [], []

In [None]:
# 1-D array holding the 'cof' column of the input table.
db_cofs = np.squeeze(inp_vectors[['cof']].values)

In [None]:
# Feature matrix: the 'ASA_m^2/g', 'Density' and 'LS' columns followed by the
# per-element columns, taken in this fixed order as a plain NumPy array.
db_traindata = inp_vectors[[
    'ASA_m^2/g', 'Density', 'LS',
    'B', 'O', 'C', 'H', 'Si', 'N', 'S',
    'Ni', 'Zn', 'Cu', 'Co', 'F', 'P', 'Cl', 'V', 'Br',
]].values

In [None]:
# Collect the train/test masks stored in the ten split files (runs 0-9).
# NOTE(review): read_pickle can execute arbitrary code while unpickling --
# only load split files that come from a trusted source.
# Appends on every execution, so re-running this cell duplicates the entries.
for i in range(10):
    obj = pd.read_pickle(r'splits/split_run_{}.pkl'.format(i))
    run_masks = obj['masks']
    train_masks.append(run_masks['train'])
    test_masks.append(run_masks['test'])

First, remove the rows of the target data whose COFs don't have valid graphs (i.e. whose names are missing from the input vectors)

In [None]:
# list2: COF names of the input table; list1: COF names of the target table.
list2 = db_cofs
list1 = target_data[['name']].values.squeeze()
# Names that occur in both tables.
valid_list = list(set(list1).intersection(list2))

In [None]:
# Preview the first COF names of the input table; slicing avoids dumping the
# whole array into the cell output.
list2[:10]

In [None]:
# Keep the COF name plus the 16 property columns, in this fixed order, and
# switch from a DataFrame to a plain NumPy array (column 0 is the name).
target_data = target_data[[
    'name',
    'h2o_henry', 'h2s_henry', 'xe_henry', 'kr_henry',
    'co2_0.001bar', 'o2_5bar', 'o2_140bar', 'co2_30bar',
    'n2_0.001bar', 'n2_30bar',
    'h2_77K_5bar', 'h2_77K_100bar', 'h2_298K_5bar', 'h2_298K_100bar',
    'ch4_65bar', 'ch4_5.8bar',
]].values

In [None]:
# Shape of the target array before rows without a matching input vector are removed.
target_data.shape

In [None]:
# Row indices of target_data whose COF name (column 0) is not shared with the
# input table. A set gives constant-time membership checks instead of scanning
# the whole list once per row.
valid_names = set(valid_list)
removal = [i for i, cof in enumerate(target_data) if cof[0] not in valid_names]

In [None]:
# Remove the flagged rows. Re-running this cell without recomputing `removal`
# would delete further rows, so execute it once per fresh `target_data`.
target_data = np.delete(target_data, removal, axis=0)

In [None]:
# Shape after the removal above; the row count drops by the number of deleted rows.
target_data.shape

Then, use the targets to produce an ordered set of input vectors

In [None]:
# sorted_cofs[t] is the row of the input table (list2 / db_traindata) that
# belongs to the COF stored in row t of target_data, so that indexing the
# input matrix with sorted_cofs yields rows in the same order as the targets.
#
# The previous version looked up each input name inside target_data, which is
# the inverse permutation (wrong when used to index the input matrix) and
# produced a ragged result whenever an input name had no target row.
cof_row = {}
for row, name in enumerate(list2):
    # Keep the first occurrence if a name is repeated in the input table.
    cof_row.setdefault(name, row)

# Every remaining target name is expected to be present in the input table;
# a KeyError here means the filtering step was skipped.
sorted_cofs = np.array([cof_row[name] for name in target_data[:, 0]], dtype=np.intp)

In [None]:
# Drop column 0 (the COF name), keeping only the property columns.
# Not idempotent: re-running this cell strips another column.
target_data = target_data[:,1:]

In [None]:
# Reorder (and subset) the feature rows with the index array built above.
# Not idempotent: re-running this cell applies the reordering a second time.
db_traindata = db_traindata[sorted_cofs]

In [None]:
# Spot-check a single entry of the index array.
sorted_cofs[1]

In [None]:
# Shape of the reordered feature matrix.
db_traindata.shape

Separate Training and Testing Sets

In [None]:
# Containers indexed as [run][target] (10 split runs x 16 targets).
# dtype=object because every slot holds an index array of its own length;
# a float array of shape (16, 10) can neither store arrays nor be indexed
# with run < 10 and target < 16.
train_ones = np.empty((10, 16), dtype=object)
test_ones = np.empty((10, 16), dtype=object)

In [None]:
# For every split run i (0-9) and target j (0-15), collect the row indices
# whose mask entry equals 1. The results are stored as nested lists indexed
# [i][j], so each slot can hold an index array of arbitrary length.
# (The previous version had an unbalanced ')' and did not parse.)
train_ones = [
    [np.where(train_masks[i][:, j] == 1)[0] for j in range(16)]
    for i in range(10)
]
test_ones = [
    [np.where(test_masks[i][:, j] == 1)[0] for j in range(16)]
    for i in range(10)
]

In [None]:
# Containers indexed as [run][target] (10 split runs x 16 targets).
# dtype=object because every slot holds a 2-D array of its own length;
# a float array of shape (16, 10) can neither store arrays nor be indexed
# with run < 10 and target < 16.
train_inputs = np.empty((10, 16), dtype=object)
test_inputs = np.empty((10, 16), dtype=object)
train_outputs = np.empty((10, 16), dtype=object)
test_outputs = np.empty((10, 16), dtype=object)

In [None]:
def _take_rows(table, rows):
    """Stack the given rows of `table` into a float64 array."""
    return np.array([table[row] for row in rows], dtype=np.float64)


# For every split run i (0-9) and target j (0-15), gather the feature rows and
# target rows selected by the train/test index arrays. Results are nested
# lists indexed [i][j], so each slot can hold an array of its own length.
# (The previous version was missing the closing ']' of each comprehension and
# reused `i` as the comprehension variable.)
train_inputs = [[_take_rows(db_traindata, train_ones[i][j]) for j in range(16)] for i in range(10)]
test_inputs = [[_take_rows(db_traindata, test_ones[i][j]) for j in range(16)] for i in range(10)]
train_outputs = [[_take_rows(target_data, train_ones[i][j]) for j in range(16)] for i in range(10)]
test_outputs = [[_take_rows(target_data, test_ones[i][j]) for j in range(16)] for i in range(10)]

Standardize Targets

In [None]:
# Disabled: buffer for an optional log10 transform of the training targets.
#train_output_logged = np.zeros(train_output_0.shape)

In [None]:
# Disabled: column-wise log10 transform of the training targets, which would
# replace train_output_0 with its logged version if re-enabled.
#for i in range(16):
#    train_output_logged[:,i] = np.log10(train_output_0[:,i])
#train_output_0 = train_output_logged

In [None]:
# Per-column mean and (population, ddof=0) standard deviation of the training
# targets; these statistics are reused to standardize the test targets.
# NOTE(review): train_output_0 is not assigned in any visible cell --
# presumably taken from train_outputs for split 0; confirm before running.
means_0 = np.mean(train_output_0, axis=0)
stds_0 = np.std(train_output_0, axis=0)

In [None]:
# Buffers for the standardized targets, sized from the data instead of the
# hard-coded (152, 16) / (414, 16) so they stay correct if the split changes.
z_train_0 = np.zeros(np.shape(train_output_0), dtype=np.float64)
z_test_0 = np.zeros(np.shape(test_output_0), dtype=np.float64)

In [None]:
# Z-score every target column with the training-set statistics. The same
# means/stds are applied to the test targets so no test information leaks in.
# Broadcasting replaces the element-wise Python loops and makes the result
# independent of any pre-allocated buffer shape.
# A column with zero standard deviation yields inf/nan, as before.
_means = np.asarray(means_0, dtype=np.float64)
_stds = np.asarray(stds_0, dtype=np.float64)
z_train_0 = (np.asarray(train_output_0, dtype=np.float64) - _means) / _stds
z_test_0 = (np.asarray(test_output_0, dtype=np.float64) - _means) / _stds

Implement Random Forest Model (extremely randomized trees via `ExtraTreesRegressor`)

In [None]:
# Display names of the 16 targets, in the same order as the target columns.
target_name = [
    'h2o_henry', 'h2s_henry', 'xe_henry', 'kr_henry',
    'co2_0.001bar', 'o2_5bar', 'o2_140bar', 'co2_30bar',
    'n2_0.001bar', 'n2_30bar',
    'h2_77K_5bar', 'h2_77K_100bar', 'h2_298K_5bar', 'h2_298K_100bar',
    'ch4_65bar', 'ch4_5.8bar',
]

In [None]:
# Fit one tree ensemble per target on the standardized training targets and
# print test-set metrics (all in standardized units).
# NOTE(review): train_input_0 / test_input_0 are not assigned in any visible
# cell -- presumably the split-0 feature matrices; confirm before running.
for i in range(16):
    # criterion is left at its default (squared error): the "mse" spelling was
    # removed from newer scikit-learn releases and the default is equivalent.
    # A new estimator is built on every iteration, so warm_start=True never
    # reuses trees from a previous fit.
    Random_Forest = ExtraTreesRegressor(n_estimators = 200, random_state = 0, bootstrap = True, warm_start = True)
    Random_Forest.fit(train_input_0, z_train_0[:,i])
    test_pred = Random_Forest.predict(test_input_0)

    # Compute the MSE once and derive the RMSE from it.
    mse = metrics.mean_squared_error(z_test_0[:,i], test_pred)

    print(target_name[i])
    print('-------------')
    print('Mean Absolute Error:', metrics.mean_absolute_error(z_test_0[:,i], test_pred))
    print('Mean Squared Error:', mse)
    print('Root Mean Squared Error:', np.sqrt(mse))
    print('Spearman Correlation:', spearmanr(z_test_0[:,i], test_pred)[0])
    print('_____________________________________________\n\n')