# WC prediction with MOF constituents

## Module Import

In [None]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] ="0"
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
import tensorflow.keras as keras

## Data Load

In [None]:
# Seed Python's RNG so the shuffle below -- and therefore the positional
# train/test split taken later -- is identical on every run.
random.seed(2020)

mof_names = []
wc_values = []

csv_path = "MOF_wc_data.csv"
# Keep the file open only while reading; parsing happens after it is closed.
with open(csv_path, "r") as f:
    # Drop blank lines (e.g. a trailing empty line at end of file); they would
    # otherwise break the two-field unpacking below with a ValueError.
    lines = [line for line in f if line.strip()]

random.shuffle(lines)
for line in lines:
    # Each row is "<mof_name>,<wc_value>"; float() ignores the trailing newline.
    mof_name, wc_value = line.split(",")

    mof_names.append(mof_name)
    wc_values.append(float(wc_value))

In [None]:
# Sanity check: show the first 10 MOF names (in shuffled order).
print(mof_names[:10])

In [None]:
# Sanity check: show the WC values paired with the 10 names printed above.
print(wc_values[:10])

## String-to-integer mapping

In [None]:
# Collect every distinct "+"-separated component that appears in any MOF name.
vocabulary = {
    component
    for name in mof_names
    for component in name.split("+")
}

In [None]:
# Give each vocabulary word an integer id starting at 1 (0 is the fill value
# used for unused slots when names are encoded).
# Iterating a set of strings directly yields an order that can change between
# interpreter runs (string hash randomization), which would silently change the
# word -> id mapping; sorting first makes the mapping reproducible.
word2index = {word: i for i, word in enumerate(sorted(vocabulary), start=1)}

In [None]:
# Inspect the full word -> integer id table.
print(word2index)

In [None]:
def _encode_name(name):
    """Return the 7-slot integer encoding of one "+"-separated MOF name.

    Slots beyond the number of components stay 0. A name with more than
    7 components raises IndexError.
    """
    row = [0] * 7
    for slot, token in enumerate(name.split("+")):
        row[slot] = word2index[token]
    return row


# One encoded row per MOF name, in the same order as mof_names.
x_data = np.array([_encode_name(name) for name in mof_names])

In [None]:
# Inspect the integer-encoded inputs (one row per MOF name).
print(x_data)

## Data normalization and reshape

In [None]:
# Arrange the WC values as a single column, then scale them down by 100.
wc_column = np.array(wc_values).reshape(-1, 1)
y_data = wc_column / 100.0

In [None]:
# Inspect the scaled targets (WC / 100, one value per row).
print(y_data)

In [None]:
# Positional split: the first 55000 rows are used for training and whatever
# remains is held out for testing. The rows were shuffled when loaded.
train_size = 55000

x_train, x_test = x_data[:train_size], x_data[train_size:]
y_train, y_test = y_data[:train_size], y_data[train_size:]

## Build and train model

In [None]:
import tensorflow.keras as keras

# Regression network over the 7-slot integer encoding:
#   - Embedding: one 16-dim vector per id; input_dim is len(vocabulary)+1
#     because word ids start at 1 and 0 is the fill value.
#   - Flatten: concatenates the 7 embeddings into one feature vector.
#   - Two ReLU hidden layers, then a single linear output for scaled WC.
model = keras.models.Sequential([
    keras.layers.Embedding(input_dim=len(vocabulary)+1, output_dim=16, input_length=7),
    keras.layers.Flatten(),
    keras.layers.Dense(64, activation="relu"),
    keras.layers.Dense(32, activation="relu"),
    keras.layers.Dense(1),
])

# Mean-squared error on the scaled targets, optimized with Adam.
model.compile(optimizer="adam", loss="mse")
model.fit(x_train, y_train, batch_size=32, epochs=4)

In [None]:
# Run the trained model on the held-out rows.
y_pred = model(x_test)

# Parity plot: predictions on x, true values on y, both multiplied by 100 to
# undo the /100 target scaling. Points on the dashed diagonal are exact.
fig, ax = plt.subplots(figsize=(8, 8))
ax.scatter(y_pred*100, y_test*100, s=0.5, alpha=0.5, color="blue")
ax.plot([0, 200], [0, 200], color="black", ls="--")

ax.set_xlabel("Prediction (cc/cc)", fontsize=30)
ax.set_ylabel("Real WC (cc/cc)", fontsize=30)

ax.set_xlim([0, 200])
ax.set_ylim([0, 200])

plt.show()