In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn import metrics
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.neural_network import MLPRegressor
from math import sqrt
from sklearn import model_selection
from scipy.stats import spearmanr

In [2]:
# Read Vectors Used as Input Features
inp_vectors = pd.read_csv("rand_forest.csv")
# Read Vectors Used as Target Features
target_data = pd.read_csv('properties.csv')
# Prepare to Read Train and Testing Masks as Lists
train_masks = []
test_masks = []

In [3]:
# Seperate the Names of Cofs from Dataframe
# Use for future alignment of Target and Input DB order
db_cofs = inp_vectors[['cof']]
# Create Single Dimentional Array
db_cofs = np.squeeze(db_cofs.values)

In [4]:
# Seperate the Training Features from Names
db_traindata = inp_vectors[['ASA_m^2/g','Density','LS','B','O','C','H',
                        'Si','N','S','Ni','Zn','Cu','Co','F','P','Cl','V','Br']]
# Convert to Arrays
db_traindata = db_traindata.values

In [5]:
# Read in The Train and Test Masks from Seperate pkl Files
# Stored in the './splits' directory
for i in range(10):
    # Read in Object from pkl
    obj = pd.read_pickle(r'splits/split_run_{}.pkl'.format(i))
    # Seperate Out Masks as Keys
    train_masks.append(obj['masks']['train'])
    test_masks.append(obj['masks']['test'])

In [8]:
obj['ids_graphs_cold']

[46,
 47,
 291,
 398,
 221,
 256,
 374,
 134,
 368,
 58,
 166,
 33,
 297,
 424,
 300,
 350,
 364,
 519,
 279,
 8,
 425,
 207,
 365,
 434,
 70,
 287,
 14,
 173,
 447,
 171,
 205,
 212,
 0,
 565,
 67,
 121,
 520,
 310,
 399,
 528,
 223,
 525,
 513,
 487,
 418,
 154,
 75,
 337,
 347,
 352,
 187,
 511,
 11,
 180,
 174,
 80,
 542,
 193,
 293,
 538,
 254,
 159,
 438,
 127,
 318,
 20,
 392,
 431,
 2,
 23,
 334,
 198,
 483,
 469,
 59,
 240,
 367,
 401,
 234,
 415,
 162,
 247,
 54,
 465,
 73,
 541,
 29,
 32,
 170,
 138,
 504,
 564,
 277,
 131,
 239,
 326,
 168,
 201,
 194,
 21,
 420,
 93,
 429,
 6,
 83,
 558,
 42,
 543,
 147,
 467,
 151,
 98,
 257,
 441]

In [14]:
inp_vectors_cold = inp_vectors.values[obj['ids_graphs_cold']]
#X_train, X_test, Y_train, Y_test -> better naming convention


In [27]:
inp_vectors_warm = inp_vectors.values[obj['ids_graphs_warm']]

In [29]:
inp_vectors_warm.shape

(452, 20)

First, remove the values in target that don't have valid graphs

In [19]:
# Convert Names from Targets and Inputs into lists
# Then use set Overlap in order to determine values we don't need
# Some are not present in splits because of Graph Bugs
list1 = np.squeeze(target_data[['name']].values)
list2 = db_cofs
# valid_list is the list of overlapping (and thus valid) cofs
valid_list = list(set(list1).intersection(list2))

In [20]:
# Extract the targets from the Matrix Factorization paper
target_data = target_data[['name','h2o_henry', 'h2s_henry', 'xe_henry', 'kr_henry', 'co2_0.001bar', 'o2_5bar', 'o2_140bar', 'co2_30bar', 'n2_0.001bar', 'n2_30bar', 'h2_77K_5bar', 'h2_77K_100bar', 
            'h2_298K_5bar', 'h2_298K_100bar', 'ch4_65bar', 'ch4_5.8bar']]
# Convert to Arrays
target_data = target_data.values

seperation of warm and cold graphs

In [24]:
target_data_cold = target_data[obj['ids_graphs_cold']]

In [30]:
target_data_warm = target_data[obj['ids_graphs_warm']]

In [31]:
target_data_warm.shape

(452, 17)

In [32]:
# Define a list called removal, this will hold indeces we need to remove
removal = []
# Sift through all of the target data
for i,cof in enumerate(target_data):
    # If the name is not valid, then add the index to the removal list
    if cof[0] not in valid_list:
        removal.append(i)

In [33]:
# Now use the numpy delete function in order to remove values at those indeces
# Now all of the target data is what we would like to use!
target_data = np.delete(target_data, removal, 0)

Then, use the targets to produce an ordered set of input vectors

In [34]:
# Make an empty list called sorted_cofs, which will hold indeces that we would like
# to reorganize our list of training data to. 
sorted_cofs = []
for name in list2:
    # Find the index that we need to shift the current value to, based on the target organization
    sorted_cofs.append(np.where(target_data[:,0] == name))
# Make sure the dimentions are right
sorted_cofs = np.squeeze(sorted_cofs)

In [35]:
# Now that we are organized, remove the names from the targets
target_data = target_data[:,1:]

In [36]:
# Shift the training data using the new indeces
db_traindata = db_traindata[sorted_cofs]

NOW you would want to make cold and warm batches

In [41]:
db_traindata_cold = db_traindata[obj['ids_graphs_cold']]
db_traindata_warm = db_traindata[obj['ids_graphs_warm']]

In [42]:
target_data_cold = target_data[obj['ids_graphs_cold']] 
target_data_warm = target_data[obj['ids_graphs_warm']]

In [48]:
training_mask = obj['masks']['train']

In [58]:
data = np.multiply(train_masks[1],target_data_warm)[:,1]

In [59]:
data[data != 0]

array([9.428440000000001e-05, 0.000114672, 3.49334e-05,
       3.3294200000000004e-05, 2.7062899999999998e-05, 5.55429e-05,
       4.73722e-05, 0.0014552, 9.35926e-05, 0.000168302,
       2.0753899999999998e-05, 1.14166e-05, 4.2636e-05, 1.18783e-05,
       3.29302e-05, 1.87006e-05, 5.19729e-05, 4.18379e-05, 4.89429e-05,
       1.7079e-05, 8.031399999999999e-05, 4.3941400000000006e-05,
       2.57802e-05, 3.5383400000000004e-05, 0.00010005100000000001,
       6.09999e-05, 4.79526e-05, 2.6399699999999997e-05,
       3.2759400000000004e-05, 6.144439999999999e-05, 3.99435e-05,
       4.48509e-05, 3.84352e-05, 2.09692e-05, 2.41475e-05, 8.04017e-05,
       4.31768e-05, 0.00132955, 0.000117988, 0.000134237,
       2.6611900000000002e-05, 1.62172e-05, 3.71527e-05, 2.52684e-05,
       2.25688e-05, 4.46037e-05, 2.01106e-05, 5.2531700000000004e-05,
       3.64921e-05, 4.02127e-05, 9.794940000000001e-05, 4.43404e-05,
       3.92503e-05, 2.26667e-05, 2.7729200000000002e-05, 3.06251e-05,
       3.65

Seperate Training and Testing Sets TODO - write more asserts in final draft

In [81]:
# Now prepare empty arrays that will hold the training set, and testing set, for each target and each split
train_inputs = np.empty((10,16), dtype=object)
test_inputs = np.empty((10,16), dtype=object)
train_outputs = np.empty((10,16), dtype=object)
test_outputs = np.empty((10,16), dtype=object)

In [82]:
for i in range(10):
    for j in range(16):
        train_ones = np.where(train_masks[i][:,j] == 1)[0]
        train_inputs = np.array([db_traindata_warm[i] for i in train_ones],dtype=np.float64)
        train_outputs = np.multiply(train_masks[i],target_data_warm)[:,j]
        train_outputs = train_outputs[train_outputs != 0]

In [83]:
for i in range(10):
    for j in range(16):
        test_inputs = db_traindata_cold
        test_outputs = target_data_cold[:,j]

In [84]:
test_inputs.shape

(114, 19)

In [86]:
test_outputs

array([3.8474982722326, 2.7859601515990002, 5.0505034785015,
       4.5589461404299, 4.0637463902337005, 3.1214879603067005,
       2.5139844068395, 2.9967296885762, 3.1718324877059, 3.8030312922664,
       6.333711910192701, 2.2688428990166, 3.8246274921351997,
       7.2997999250009, 2.5265275973688, 2.9794002185602997,
       2.1684461820015, 3.5402912375002002, 3.6724612389813,
       2.6892693971816004, 3.0572384067628997, 1.3046221095407,
       3.8135024157846003, 4.6175211170326, 2.7478683863385998,
       3.2982030877078996, 3.785015742887, 1.6776968134673,
       3.9202134219472997, 3.3078260395607995, 3.5877602298357005,
       3.7913155336097004, 3.0427115741508994, 4.821859897581599,
       3.2221231316053, 3.2046239541657995, 4.5017060724121,
       0.8979819117370799, 3.1070862435096003, 5.663239339727401,
       3.1571453204301, 3.3476726420402003, 3.3025931276775005,
       3.6843639567653, 5.4835640700034, 4.2734565253485,
       6.1679205396709005, 2.5372885967363996

In [76]:
train_inputs.shape

(221, 19)

In [77]:
train_outputs.shape

(221,)

Standardize Targets

In [87]:
# Each index, since correlated to a target, will have a mean, and absolute deviation
means = np.empty((10,16), dtype=np.float64)
stdevs = np.empty((10,16), dtype=np.float64)

In [88]:
# Calculate the mean and stdevs, then push into the corresponding slot
for i in range(10):
    for j in range(16):
        means[i][j] = train_outputs[i][j].mean()
        stdevs[i][j] = np.std(train_outputs[i][j])

TypeError: 'float' object is not subscriptable

In [None]:
# Create arrays to store the standardized (z-score) version of the training and testing outputs
train_outputs_z = np.empty((10,16), dtype=object)
test_outputs_z = np.empty((10,16), dtype=object)

In [None]:
# Basic Standardization Procedure, dimentions of outputs do not change
for i in range(10):
    for j in range(16):
        train_outputs_z[i][j] = (train_outputs[i][j] - means[i][j]) / stdevs[i][j]
        test_outputs_z[i][j] = (test_outputs[i][j] - means[i][j]) / stdevs[i][j]

Implement Random Forest Model

In [None]:
# Create a variable of the target names for ease of printing
target_name = ['h2o_henry', 'h2s_henry', 'xe_henry', 'kr_henry', 'co2_0.001bar', 'o2_5bar', 'o2_140bar', 'co2_30bar', 'n2_0.001bar', 'n2_30bar', 'h2_77K_5bar', 'h2_77K_100bar', 
            'h2_298K_5bar', 'h2_298K_100bar', 'ch4_65bar', 'ch4_5.8bar']

In [None]:
# Get ready to store all of the performance metric for each test that is run
MAEs = np.empty((10,16), dtype=np.float64)
SPRs = np.empty((10,16), dtype=np.float64)
MSEs = np.empty((10,16), dtype=np.float64)
RMSEs = np.empty((10,16), dtype=np.float64)

In [None]:
# Now run all 10 x 16 tests in sequence, using the test and train splits we made
for i in range(10):
    # Print the epoch to get some idea of what is going on
    print("Epoch: ", i)
    for j in range(16):
        # Instanciate the model, baseline model taken from another project (for now)
        Random_Forest = ExtraTreesRegressor(n_estimators = 200, random_state = 0, criterion = "mae", bootstrap = True, warm_start = True)
        # Fit the model using the inputs and zscored outputs
        Random_Forest.fit(train_inputs[i][j], train_outputs_z[i][j])
        # Make predictions using test inputs
        test_preds = Random_Forest.predict(test_inputs[i][j])
        # Find the metrics
        MAEs[i][j] = metrics.mean_absolute_error(test_outputs_z[i][j], test_preds)
        SPRs[i][j] = spearmanr(test_outputs_z[i][j], test_preds)[0]
        MSEs[i][j] = metrics.mean_squared_error(test_outputs_z[i][j], test_preds)
        RMSEs[i][j] = np.sqrt(metrics.mean_squared_error(test_preds, test_outputs_z[i][j]))

In [None]:
# Take averages over the 10 runs for each target
MAEs = MAEs.mean(axis=0)
SPRs = SPRs.mean(axis=0)
MSEs = MSEs.mean(axis=0)
RMSEs = RMSEs.mean(axis=0)

In [None]:
# Tell the user what is going on
for i in range(16):
    print("Printing Details for Target: ", target_name[i])
    print("---------------------------")
    print("Average MAE: ", MAEs[i])
    print("Average MSE: ", MSEs[i])
    print("Average SPR: ", SPRs[i])
    print("Average RMSE: ", RMSEs[i])
    print("________________________________________________\n\n")
    
print("Overall Averages")
print("****************************************")
print("MAE: ", MAEs.mean())
print("MSE: ", MSEs.mean())
print("SPR: ", SPRs.mean())
print("RMSE: ", RMSEs.mean())
print("***************************************:)")