In [2]:
import os
import torch
import pandas as pd
from polymerlearn.utils import get_IV_add, GraphDataset

# Load the tabular data. The directory can be overridden without editing the
# notebook by setting the POLYMERGNN_DATA_DIR environment variable; when it is
# unset, the original author's local path is used so existing runs are unchanged.
data_dir = os.environ.get(
    'POLYMERGNN_DATA_DIR',
    '/Users/owenqueen/Desktop/eastman_project-confidential/Eastman_Project/PolymerGNN/dataset',
)
data = pd.read_csv(os.path.join(data_dir, 'pub_data.csv'))

In [3]:
# Additional per-sample inputs produced by get_IV_add from the loaded table;
# they are handed to the dataset through `add_features` below.
add = get_IV_add(data)

# Dataset over the table rows, targeting the 'IV' column.
# NOTE(review): test_size=0.2 presumably holds out 20% of samples for testing,
# and no random seed is set here -- confirm how GraphDataset performs the split.
dataset = GraphDataset(
    data=data,
    structure_dir='../Structures/AG/xyz',
    Y_target=['IV'],
    test_size=0.2,
    add_features=add,
)

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [4]:
from polymerlearn.models.gnn import PolymerGNN_IV
from polymerlearn.utils import CV_eval

# Constructor arguments shared by every model built during cross-validation.
#   input_feat      -- number of input features on each node; don't change this
#   hidden_channels -- intermediate dimensions used in the model; can be tuned
#   num_additional  -- number of additional resin properties included in the
#                      prediction; corresponds to the number in get_IV_add
model_kwargs = {
    'input_feat': 6,
    'hidden_channels': 32,
    'num_additional': 4
}

# NOTE(review): this instance is not passed to CV_eval below (CV_eval receives
# the class and its kwargs instead); it is kept so the cell's state is unchanged.
model = PolymerGNN_IV(**model_kwargs)

# The optimizer is passed as a class, the loss as an instance.
optimizer_gen = torch.optim.AdamW
criterion = torch.nn.MSELoss()

# Run the cross-validated evaluation and keep the state dicts it returns
# (save_state_dicts=True) for the explanation cells further down.
cv_results = CV_eval(
    dataset=dataset,
    model_generator=PolymerGNN_IV,
    model_generator_kwargs=model_kwargs,
    optimizer_generator=optimizer_gen,
    optimizer_kwargs={'lr': 0.0001, 'weight_decay':0.01},
    criterion=criterion,
    epochs=800,
    batch_size=64,
    verbose=1,
    save_state_dicts=True,
)
all_predictions, all_y, all_inds, state_dicts = cv_results

Fold: 1 	 Epoch: 0,                     	 Train r2: -1.5719 	 Train Loss: 5.6361
Fold: 1 	 Epoch: 50,                     	 Train r2: 0.4221 	 Train Loss: 0.9526
Fold: 1 	 Epoch: 100,                     	 Train r2: 0.6710 	 Train Loss: 1.0138
Fold: 1 	 Epoch: 150,                     	 Train r2: 0.6960 	 Train Loss: 0.7551
Fold: 1 	 Epoch: 200,                     	 Train r2: 0.7715 	 Train Loss: 0.9338
Fold: 1 	 Epoch: 250,                     	 Train r2: 0.8129 	 Train Loss: 0.7847
Fold: 1 	 Epoch: 300,                     	 Train r2: 0.7876 	 Train Loss: 0.5539
Fold: 1 	 Epoch: 350,                     	 Train r2: 0.8261 	 Train Loss: 0.7636
Fold: 1 	 Epoch: 400,                     	 Train r2: 0.8693 	 Train Loss: 0.5521
Fold: 1 	 Epoch: 450,                     	 Train r2: 0.8327 	 Train Loss: 0.6537
Fold: 1 	 Epoch: 500,                     	 Train r2: 0.8442 	 Train Loss: 0.5207
Fold: 1 	 Epoch: 550,                     	 Train r2: 0.8974 	 Train Loss: 0.2355
Fold: 1 	 Epoch: 6

In [2]:
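# Optional: uncomment the lines below to save the cross-validation results and
# model weights, and/or to reload them from a previous trial instead of retraining.
# Note: torch.load unpickles its input, so only load files you created yourself.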

# torch.save((all_predictions, all_y, all_inds, state_dicts), open('CV_tup.pt', 'wb')) # Save state dicts to load later
# torch.save(dataset, 'dataset.pt')

# all_predictions, all_y, all_inds, state_dicts = torch.load('CV_tup.pt')
# dataset = torch.load('dataset.pt')

In [5]:
from polymerlearn.explain import PolymerGNN_EXPLAIN, PolymerGNNExplainer

# Constructor arguments for the explanation model. These repeat the values used
# for training in the earlier cell, so this cell can also run on its own after
# `state_dicts` and `dataset` have been reloaded from disk.
model_kwargs = {
    'input_feat': 6,         # How many input features on each node; don't change this
    'hidden_channels': 32,   # How many intermediate dimensions to use in model
                            # Can change this ^^
    'num_additional': 4      # How many additional resin properties to include in the prediction
                            # Corresponds to the number in get_IV_add
}

# One explanation output per entry of `state_dicts`, in the same order.
exps = []

for sd in state_dicts:

    # Fresh explanation model that receives the trained weights.
    # NOTE(review): assumes PolymerGNN_EXPLAIN has the same parameter layout as
    # the trained PolymerGNN_IV so that load_state_dict succeeds -- confirm.
    mexplain = PolymerGNN_EXPLAIN(**model_kwargs)
    mexplain.load_state_dict(sd) # Load weights from trained model over to explaining one

    explainer = PolymerGNNExplainer(mexplain)

    # agg_exps (defined in a later cell) reads index 0 of this output as the
    # per-sample dicts with 'A' / 'G' entries and index 3 as the per-feature lists.
    exp_out = explainer.get_testing_explanation(dataset)

    exps.append(exp_out)

  torch.tensor(self.add_test).float().to(self.device)
  torch.tensor(add_test).float())


{'A': tensor([1.1564, 0.0000]), 'G': tensor([1.6978, 0.0000, 0.9692, 0.0000]), 'add': tensor([ 2.1333, -0.1153, -0.1453,  0.0000]), 'table_ind': 5}
['IPA', 'TPA']
['HDO', '1,4-CHDM', 'TMCD', 'TCDDM']
{'A': tensor([0.5982, 0.0000]), 'G': tensor([0.4637, 0.2488, 0.3581]), 'add': tensor([ 0.8977, -0.0667, -0.2231,  0.0000]), 'table_ind': 10}
['IPA', 'TPA']
['1,4-CHDM', 'TMCD', 'MPD']
{'A': tensor([0.9456, 0.0000]), 'G': tensor([0.7417, 0.4074, 0.6968]), 'add': tensor([ 1.8868, -0.0742, -0.2069,  0.0000]), 'table_ind': 11}
['IPA', 'TPA']
['1,4-CHDM', 'TMCD', 'MPD']
{'A': tensor([0.8647, 0.0000]), 'G': tensor([0.4790, 0.3352, 0.4193, 0.3675]), 'add': tensor([ 1.5352, -0.1210, -0.2751,  0.0172]), 'table_ind': 12}
['IPA', 'TPA']
['1,4-CHDM', 'TMCD', 'TMP', 'MPD']
{'A': tensor([0.0757, 0.3400, 0.0000]), 'G': tensor([0.1842, 0.0703, 0.4527]), 'add': tensor([ 0.5585, -0.0377, -0.2155,  0.0075]), 'table_ind': 28}
['1,4-CHDA', 'IPA', 'TPA']
['NPG', 'TMCD', 'TMP']
{'A': tensor([0.0923, 0.4145, 0.00

In [6]:
def agg_exps(exp_list, add_data_keys = ['Mw', 'AN', 'OHN', '%TMP']):
    """Flatten a list of explanation outputs into pooled score lists.

    Args:
        exp_list: Sequence of explanation outputs. For each entry, index 0 is
            a sequence of per-sample dicts holding tensors under 'A' and 'G',
            and index 3 maps every name in ``add_data_keys`` to an iterable of
            per-sample scores.
        add_data_keys: Names of the additional features to collect from
            index 3 of each entry.

    Returns:
        Tuple ``(acid_scores, glycol_scores, agg_addkeys)``. The first two are
        lists with one float per sample, the sum over that sample's 'A'
        (respectively 'G') tensor. ``agg_addkeys`` maps each additional-feature
        name to the concatenation of its scores over all entries.
    """
    agg_addkeys = {key: [] for key in add_data_keys}
    acid_scores, glycol_scores = [], []

    for fold_exp in exp_list:
        # Concatenate this entry's additional-feature scores onto the pools.
        for key in add_data_keys:
            agg_addkeys[key].extend(fold_exp[3][key])

        # Reduce each sample's acid / glycol attributions to a single total.
        for sample in fold_exp[0]:
            acid_scores.append(torch.sum(sample['A']).item())
            glycol_scores.append(torch.sum(sample['G']).item())

    return acid_scores, glycol_scores, agg_addkeys


In [7]:
# Pool the explanation outputs in `exps` into flat per-sample lists:
# summed acid scores, summed glycol scores, and a dict of additional-feature scores.

acid_scores, glycol_scores, agg_addkeys = agg_exps(exps)

In [8]:
import numpy as np
# Mean absolute attribution for each additional feature.
for k, v in agg_addkeys.items():
    if k != '%TMP':
        m = np.mean(np.abs(v))
    else:
        # For %TMP, average only over entries whose magnitude exceeds 1e-9.
        v = np.abs(v)
        m = np.mean(v[v > 1e-9])
    print(f'{k} mean importance: {m}')

# The acid / glycol totals are averaged as-is (signed, no absolute value).
print('Acid embedding mean importance:', np.mean(acid_scores))
print('Glycol embedding mean importance', np.mean(glycol_scores))

Mw mean importance: 1.2528209559619428
AN mean importance: 0.08861688307867249
OHN mean importance: 0.36916257832199334
%TMP mean importance: 0.03391442998227748
Acid embedding mean importance: 0.5863616717047989
Glycol embedding mean importance 1.0000917118042707


In [10]:
# Score sources, in the order their values are stacked for every sample.
name_list = ['A', 'G', 'Mw', 'AN', 'OHN', '%TMP']

# For each source: its rank in every sample (1 = largest absolute score).
avg_scores = {name: [] for name in name_list}

for i in range(len(acid_scores)):
    # Scores of sample i, laid out in name_list order.
    sample_scores = [acid_scores[i], glycol_scores[i]]
    sample_scores += [agg_addkeys[name][i] for name in name_list[2:]]

    # Positions ordered from largest to smallest absolute score.
    sort_inds = list(np.argsort(np.abs(sample_scores)))[::-1]

    # sort_inds is a permutation of the positions, so every source receives
    # exactly one rank per sample.
    for rank, position in enumerate(sort_inds, start=1):
        avg_scores[name_list[position]].append(rank)

# Report the mean rank of each source over all samples.
for k, v in avg_scores.items():
    print(f'{k} : {np.mean(v):.4f}')
    

A : 3.0500
G : 1.8208
Mw : 1.4125
AN : 5.0417
OHN : 3.7167
%TMP : 5.9583


In [14]:
import numpy as np
import matplotlib.pyplot as plt

fig = plt.figure(dpi=150, figsize=(4, 5))

# One box per entry of name_list: acid, glycol, then the additional features
# in the iteration order of agg_addkeys.
all_data = [acid_scores, glycol_scores]

for k, v in agg_addkeys.items():
    # For %TMP keep only entries whose magnitude exceeds 1e-9; other features
    # are plotted unfiltered. Values stay signed here.
    m = np.array(v)[np.abs(v) > 1e-9] if k == '%TMP' else v
    print(f'{k} mean importance: {np.mean(m)}')

    all_data.append(m)

print('Acid embedding mean importance:', np.mean(acid_scores))
print('Glycol embedding mean importance', np.mean(glycol_scores))


# Draw on the figure's axes explicitly instead of through the pyplot state machine.
ax = fig.gca()
n_boxes = len(name_list)
ax.hlines(0, xmin=0, xmax=n_boxes + 1, colors = 'black', linestyles='dashed')
ax.boxplot(all_data)
ax.set_ylabel('Attribution')
ax.set_xticks(list(range(1, n_boxes + 1)))
ax.set_xticklabels(name_list)

plt.show()

Mw mean importance: 1.2528209559619428
AN mean importance: -0.08745921769501971
OHN mean importance: -0.36916257832199334
%TMP mean importance: 0.031037314794957638
Acid embedding mean importance: 0.5863616717047989
Glycol embedding mean importance 1.0000917118042707


TypeError: use() missing 1 required positional argument: 'style'

<Figure size 600x750 with 0 Axes>

In [7]:
# Summarize importance scores:
from polymerlearn.utils.graph_prep import get_AG_info

# Per-sample totals of the acid / glycol attributions.
acid_scores, glycol_scores = [], []

# Per-sample attributions for positions 0..3 of each sample's 'add' tensor.
mw_scores, an_scores, ohn_scores, tmp_scores = [], [], [], []

# Monomer names come from the column headers with their first character dropped.
# NOTE(review): the column ranges 20:33 and 34:46 are hard-coded positions in
# data_mask -- confirm they still match the table layout.
acid_names = pd.Series([col[1:] for col in data_mask.columns[20:33].tolist()])
glycol_names = pd.Series([col[1:] for col in data_mask.columns[34:46].tolist()])
acids, glycols, _, _ = get_AG_info(data_mask)

# Attribution values grouped by monomer name.
acid_key = {name: [] for name in acid_names}
glycol_key = {name: [] for name in glycol_names}

for exp in exp_summary:

    # 'table_ind' selects this sample's entry in `acids` / `glycols`.
    df_ind = exp['table_ind']

    for pos, acid in enumerate(acids[df_ind]):
        acid_key[acid].append(exp['A'][pos].item())

    for pos, glycol in enumerate(glycols[df_ind]):
        glycol_key[glycol].append(exp['G'][pos].item())

    acid_scores.append(torch.sum(exp['A']).item())
    glycol_scores.append(torch.sum(exp['G']).item())

    # Break down individual scores:
    for pos, bucket in enumerate((mw_scores, an_scores, ohn_scores, tmp_scores)):
        bucket.append(exp['add'][pos].item())

for values in (acid_scores, glycol_scores,
               mw_scores, an_scores, ohn_scores, tmp_scores,
               acid_key, glycol_key):
    print(values)


[-0.5220816731452942, -0.871830940246582, -1.372873067855835, -0.7051889300346375, -1.479537844657898, -0.8235787749290466, -0.49047982692718506, -0.4644251763820648, -1.7646515369415283, -1.5553362369537354, -1.144914150238037, -2.20985746383667, -1.4301812648773193, -0.5704778432846069, -0.9277279376983643, -0.6290164589881897, -0.811713457107544, -0.4857237935066223, -1.2340717315673828, -0.7832549214363098, -2.0962419509887695, -0.3163483738899231, -0.6890739798545837, -1.1152818202972412, -0.6931972503662109, -0.519429087638855, -0.70504230260849, -0.5535339117050171, -0.7099688053131104, -0.5150319337844849, -1.429985761642456, -0.777635931968689, -0.45038893818855286, -0.8560777902603149, -0.5053524971008301, -0.9263220429420471, -0.41691508889198303, -1.2905853986740112, -1.724608302116394, -1.264465093612671, -1.064122200012207, -0.8285700678825378, -0.9279764890670776, -0.7006344199180603, -2.206166982650757, -0.6776403188705444, -2.2430825233459473, -0.9685702323913574, -2.1