# Machine Learning CO<sub>2</sub> Working Capacity of MOFs

## 0. Import packages

In [None]:
# import standard scientific libraries
import os
import math
import numpy as np
import pandas as pd

# import ML models from scikit-learn
# from sklearn.linear_model import LinearRegression
# from sklearn.kernel_ridge import KernelRidge
# from sklearn.gaussian_process import GaussianProcessRegressor
# from sklearn import svm
# from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_absolute_error

## 1. Import the data

In [None]:
# Show every column when a DataFrame is rendered. The fully qualified key is
# required: the bare pattern 'max_columns' also matches
# 'styler.render.max_columns' on recent pandas and raises OptionError.
pd.set_option("display.max_columns", None)
pd.set_option("display.precision", 8)

# Directory that holds the competition CSV files.
dataset = "./"

In [None]:
# Load the training table from the data directory configured in `dataset`
# instead of relying on a path hard-coded relative to the working directory.
train = pd.read_csv(os.path.join(dataset, "train_easy2use.csv"))
# (rows, columns) of the loaded table
train.shape

In [None]:
# Render the training DataFrame as the cell output.
train

In [None]:
# Select the rows whose surface_area value is negative.
train[train["surface_area"] < 0]

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Map each column's positional index to its column name.
feat = dict(enumerate(train.columns.values))
feat

## 2. Clean data

In [None]:
# Flag the columns that contain at least one NaN
# (axis=0 reduces over rows, giving one boolean per column).
train.isnull().any(axis=0)

## 3. Prepare training and test sets

Prepare input
- x_train = train input
- y_train = train output
- x_test = test input
- y_true = test output (ground truth used to score the predictions)

In [None]:
# Fraction of the rows used for training; the remainder is held out.
ratio = 0.8
# Use `ratio` itself so that changing it actually changes the split
# (the fraction was previously hard-coded a second time).
train_size = math.floor(train.shape[0] * ratio)
test_size = train.shape[0] - train_size
print("dataset size:", train.shape[0])
print("train size:", train_size)
print("test size:", test_size)

In [None]:
# Rebuild the {position: column name} lookup for the training table.
feat = {position: column for position, column in enumerate(train.columns.values)}
feat

In [None]:
# Model inputs: every column except the identifier and the target, as float32.
_inputs = train.drop(columns=['MOFname', 'CO2_working_capacity']).astype(np.float32)
# Regression target as float32.
_target = train['CO2_working_capacity'].astype(np.float32)

# The first `train_size` rows form the training split ...
x_train = _inputs.iloc[:train_size]
y_train = _target.iloc[:train_size]
# ... and the following `test_size` rows form the held-out split.
x_test = _inputs.iloc[train_size:train_size + test_size]
y_true = _target.iloc[train_size:train_size + test_size]

In [None]:
# Inspect the training targets (the CO2_working_capacity column).
y_train

## 4. Neural network

### 4.1: Feed forward neural network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [None]:
# 1. Create the model with the Sequential API, passing the layer stack up front.
model = Sequential([
    Dense(16, input_dim=(x_train.shape[1]), activation='relu'),  # input
    Dense(32, activation='relu'),                                # hidden 1
    Dense(32, activation='relu'),                                # hidden 2
    Dense(1, activation='linear'),                               # output
])

In [None]:
# 2. compile model
# MAE is the training loss. MSE is tracked as an additional regression metric;
# "accuracy" is a classification metric and carries no meaning for a
# continuous target. compile() returns None, so its result is not stored.
model.compile(loss=tf.keras.losses.mae,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=["mse"])
# 3. train model
# Pass the held-out split as validation data so that history.history also
# records 'val_loss', which the loss-curve cell reads. The validation data is
# only evaluated at the end of each epoch and is never used for weight updates.
history = model.fit(x_train, y_train,
                    validation_data=(x_test, y_true),
                    epochs=300, batch_size=64)

<details>
<summary> <font color='green'>Click here for some more information about the hyperparameters of the neural network</font></summary>
We use MAE as a loss function in the neural network but we use LMAE as a metric in our competition. Is this reasonable?
</details>

In [None]:
# Predict the target for the held-out inputs.
y_pred = model.predict(x_test)
y_pred

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Plot the per-epoch loss recorded by model.fit().
history_dict = history.history
loss_values = history_dict['loss']
# 'val_loss' is only present when fit() was given validation data, so look it
# up defensively instead of raising KeyError.
val_loss_values = history_dict.get('val_loss')

plt.plot(loss_values, 'b', label='training loss')
if val_loss_values is not None:
    plt.plot(val_loss_values, 'r', label='validation loss')
plt.xlabel('epoch')
plt.ylabel('loss')
# Without a legend the curve labels are never shown.
plt.legend()
plt.show()

In [None]:
# Competition metric: log10 of the mean absolute error on the held-out split.
# scikit-learn's signature is (y_true, y_pred); MAE is symmetric, so the value
# is unchanged, but the call now follows the documented argument order.
log_mae = np.log10(mean_absolute_error(y_true, y_pred))
log_mae

### 4.2: Customization

In [None]:
import tensorflow.keras.backend as kb
import tensorflow as tf

# Define a new loss: a function that receives the actual values and the
# predicted values and returns the LMAE (log10 of the mean absolute error).
def custom_loss(y_actual, y_pred):
    """Return log10 of the mean absolute error between targets and predictions.

    Args:
        y_actual: tensor of ground-truth values for the batch.
        y_pred: tensor of model predictions for the batch.

    Returns:
        A scalar tensor holding log10(MAE + eps).
    """
    # Flatten both tensors so that a (batch,) target and a (batch, 1)
    # prediction are compared element-wise instead of being broadcast.
    y_pred = tf.reshape(y_pred, [-1])
    y_actual = tf.reshape(tf.cast(y_actual, y_pred.dtype), [-1])
    # reduce_mean divides by the real number of elements at run time; the
    # static `shape[0]` may be None when Keras traces the loss inside fit().
    mae = tf.reduce_mean(tf.abs(y_actual - y_pred))
    # A small offset keeps the logarithm finite if the batch error is exactly 0.
    eps = 1e-7
    return tf.experimental.numpy.log10(mae + eps)

In [None]:
# Compact 12-24-12 feed-forward regressor, built from a layer list.
# NOTE: `model` is reassigned by the next model-definition cell.
model = Sequential([
    Dense(12, input_dim=(x_train.shape[1]), activation='relu'),  # input
    Dense(24, activation='relu'),                                # hidden 1
    Dense(12, activation='relu'),                                # hidden 2
    Dense(1, activation='linear'),                               # output
])

In [None]:
# Feed-forward regressor trained with the custom LMAE loss.
# The input width must equal the number of feature columns in x_train; the
# previous hard-coded input_shape=[1] does not match the training features.
model = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(32, activation=tf.nn.relu, input_shape=[x_train.shape[1]]),
        tf.keras.layers.Dense(32, activation=tf.nn.relu),
        tf.keras.layers.Dense(1),  # single linear output: predicted working capacity
    ]
)

In [None]:
# Optimise the LMAE loss directly and report plain MAE alongside it, which is
# easier to read than its logarithm. "accuracy" is a classification metric and
# is not meaningful for this regression target.
model.compile(loss=custom_loss, optimizer='adam', metrics=['mae'])
model.fit(x_train, y_train, epochs=100, batch_size=64)

In [None]:
# predict on the held-out inputs
y_pred = model.predict(x_test)
# evaluate error: log10 of the mean absolute error, with the arguments in
# scikit-learn's documented (y_true, y_pred) order (the value is unchanged
# because MAE is symmetric)
log_mae = np.log10(mean_absolute_error(y_true, y_pred))
log_mae

## 5. Predicting CO<sub>2</sub> WC 

### Import pretest and preprocessing

Today we will prepare a submission file for the pretest set for phase 1 (Development).

In [None]:
# Load the pretest table from the data directory configured in `dataset`
# instead of relying on a path hard-coded relative to the working directory.
test = pd.read_csv(os.path.join(dataset, "test_easy2use.csv"))
# (rows, columns) of the loaded table
test.shape

In [None]:
# Render the pretest DataFrame as the cell output.
test

### Let's predict and create a submission file

Join the [Codalab competition](https://competitions.codalab.org/competitions/34540) for this course!

Create a `submission.csv` with your predictions to join the competition and upload it to the competition site.

In [None]:
# Prepare the pretest features the same way as the training features:
# drop the identifier column and cast to float32 before predicting.
pred = model.predict(test.drop(columns=['MOFname']).astype(np.float32))
pred

In [None]:
# Assemble the submission table: ids are the consecutive integers
# 68614..85613 (inclusive) as strings, paired with the single prediction column.
submission = pd.DataFrame(
    {
        "id": list(map(str, range(68614, 85613 + 1))),
        "CO2_working_capacity [mL/g]": pred[:, 0],
    }
)

# Write the table without the DataFrame index.
submission.to_csv("submission.csv", index=False)

In [None]:
# Render the submission table as the cell output.
submission