In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#%matplotlib widget

from voxel import *
from mol_tools import *
from ML_utils import *

from sklearn.model_selection import train_test_split, KFold

In [2]:
def drop_duplicates(df):
    df = df.drop_duplicates(subset="compound", keep="first")
    print("Number of molecules", df.shape[0])
    return df

In [3]:
def get_all_elements(df):
      return np.unique(np.hstack(np.array(df.species)))

In [4]:
#df = pd.read_pickle("data/icsd_221_cp5_3_species.pkl")
#df = pd.read_pickle("data/all_221_cp5_3_species_oxides.pkl")
#df = pd.read_pickle("data/all_3_species_oxides.pkl")

df1 = pd.read_pickle(f"data/all_3_species_oxides.pkl")
df2 = pd.read_pickle(f"data/all_sup4_species_oxides.pkl")
df = df1.append(df2)

df = drop_duplicates(df)

Number of molecules 10772


In [5]:
elem_counts = pd.Series(np.hstack(np.array(df.species))).value_counts()
low_freq = elem_counts[elem_counts < 50].index.values
low_freq_mask = df.species.apply(lambda s: len(np.setdiff1d(s, low_freq)) < len(s))
df = df[~low_freq_mask]

In [6]:
elements = get_all_elements(df)
elements

array(['Ag', 'Al', 'As', 'Au', 'B', 'Ba', 'Be', 'Bi', 'Br', 'C', 'Ca',
       'Cd', 'Ce', 'Cl', 'Co', 'Cr', 'Cs', 'Cu', 'Dy', 'Er', 'Eu', 'F',
       'Fe', 'Ga', 'Gd', 'Ge', 'H', 'Hf', 'Hg', 'Ho', 'I', 'In', 'Ir',
       'K', 'La', 'Li', 'Mg', 'Mn', 'Mo', 'N', 'Na', 'Nb', 'Nd', 'Ni',
       'O', 'Os', 'P', 'Pb', 'Pd', 'Pr', 'Pt', 'Rb', 'Re', 'Rh', 'Ru',
       'S', 'Sb', 'Sc', 'Se', 'Si', 'Sn', 'Sr', 'Ta', 'Tb', 'Tc', 'Te',
       'Ti', 'Tl', 'U', 'V', 'W', 'Xe', 'Y', 'Yb', 'Zn', 'Zr'],
      dtype='<U2')

In [7]:
nchannel = len(elements)
nchannel

76

In [8]:
np.random.seed(0)
train_df, test_df = train_test_split(df, test_size=0.1)

In [15]:
def make_simple_descriptor(mol, elements, one_hot=True):
    descriptor = np.zeros((len(elements),))
    
    for i, (element, n_atoms) in enumerate(zip(mol.species, mol.composition)):
        try:
            j = np.where(elements == element)[0][0] # get channel for element
            if one_hot:
                descriptor[j] = 1
            else:
                descriptor[j] = n_atoms
        except:
            print(element, mol.compound, mol.composition)
            
    return descriptor

In [16]:
df.loc[0]

Unnamed: 0,auid,aurl,compound,composition,species,natoms,spacegroup,pearson_symbol,geometry,positions_fractional,positions_cartesian,enthalpy_atom,enthalpy_formation_atom
0,aflow:b281fdee92d2d2b2,aflowlib.duke.edu:AFLOWDATA/LIB3_RAW/AgOOs_pv/...,Ag1O3Os1,"[1, 3, 1]","[Ag, O, Os]",5,221,cP5,"[3.9184369, 3.9184369, 3.9184369, 90.0, 90.0, ...","[[0.0, 0.0, 0.0], [0.0, 0.5, 0.5], [0.5, 0.0, ...","[[0.0, 0.0, 0.0], [0.0, 1.95922, 1.95922], [1....",-5.34058,0.427456
0,aflow:8dc9ef747342f953,aflowlib.duke.edu:AFLOWDATA/ICSD_WEB/HEX/Ag5Hg...,Ag10Hg2O12Sb2,"[10, 2, 12, 2]","[Ag, Hg, O, Sb]",26,163,hP26,"[6.053494, 6.054145, 12.56352, 89.99686, 90.00...","[[4.5797788e-06, 3.8849732e-06, 0.12767818], [...","[[-2e-05, 7e-05, 1.60409], [3.02663, -5.24223,...",-3.36361,


In [17]:
make_simple_descriptor(df.loc[0], elements)

['Ag', 'O', 'Os'] 0         Ag1O3Os1
0    Ag10Hg2O12Sb2
Name: compound, dtype: object 0         [1, 3, 1]
0    [10, 2, 12, 2]
Name: composition, dtype: object
['Ag', 'Hg', 'O', 'Sb'] 0         Ag1O3Os1
0    Ag10Hg2O12Sb2
Name: compound, dtype: object 0         [1, 3, 1]
0    [10, 2, 12, 2]
Name: composition, dtype: object


array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0.])

In [None]:
seed_everything()
train_df, test_df = train_test_split(df, test_size=0.1)

In [None]:
print(train_df.shape)
print(test_df.shape)

In [None]:
y = df.enthalpy_atom# * df.natoms
y_train = train_df.enthalpy_atom# * train_df.natoms
y_test = test_df.enthalpy_atom# * test_df.natoms

## Zero R

In [None]:
y_pred_train = np.mean(y_train)
np.mean((y_pred_train - y_train)**2)

In [None]:
y_test_pred = np.mean(y_train)
np.mean((y_test_pred - y_test)**2)

In [None]:
seed_everything()
all_ys = []
all_y_hats = []
all_ys_train = []
all_y_hats_train = []

for train_indices, test_indices in KFold(n_splits=5, shuffle=True).split(df):
    
    cv_train_df = df.iloc[train_indices]
    cv_test_df = df.iloc[test_indices]
    
    
    cv_y_train = y.iloc[train_indices] # cv_train_df.enthalpy_atom
    cv_y_test = y.iloc[test_indices] # cv_test_df.enthalpy_atom
    
    cv_y_test_pred = np.mean(cv_y_train)
    
    ys = cv_y_test
    y_hats = np.full_like(cv_y_test, cv_y_test_pred)
    
    y_hats_train = np.full_like(cv_y_train, cv_y_test_pred)
    print("Train:", np.mean((y_hats_train - cv_y_train)**2),
          "Test:", np.mean((y_hats - cv_y_test)**2))
    
    
    all_ys.append(ys)
    all_y_hats.append(y_hats)
    
    all_ys_train.append(cv_y_train)
    all_y_hats_train.append(y_hats_train)
    
print("Avg Train: ", np.mean((np.hstack(all_ys_train) - np.hstack(all_y_hats_train))**2), end=" ")
print("Avg Test: ", np.mean((np.hstack(all_ys) - np.hstack(all_y_hats))**2))

## Ridge

In [None]:
one_hot=True

In [None]:
X_train = np.vstack(train_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
X_test = np.vstack(test_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))

In [None]:
from  sklearn.linear_model import RidgeCV

In [None]:
rcv = RidgeCV()
rcv.fit(X_train, y_train)

In [None]:
y_pred_train = rcv.predict(X_train)
np.mean((y_pred_train - y_train)**2)

In [None]:
plt.scatter(y_train, y_pred_train, alpha=0.1)
plt.plot([y.min(), y.max()], [y.min(), y.max()], c="red")
plt.show()

In [None]:
y_test_pred = rcv.predict(X_test)
np.mean((y_test_pred - y_test)**2)

In [None]:
plt.scatter(y_test, y_test_pred, alpha=0.1)
plt.plot([y.min(), y.max()], [y.min(), y.max()], c="red")
plt.show()

In [None]:
seed_everything()
all_ys = []
all_y_hats = []
all_ys_train = []
all_y_hats_train = []

for train_indices, test_indices in KFold(n_splits=5, shuffle=True).split(df):
    
    cv_train_df = df.iloc[train_indices]
    cv_test_df = df.iloc[test_indices]
    
    
    cv_X_train = np.vstack(cv_train_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
    cv_X_test = np.vstack(cv_test_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
    cv_y_train = y.iloc[train_indices] # cv_train_df.enthalpy_atom
    cv_y_test = y.iloc[test_indices] # cv_test_df.enthalpy_atom
    
    rcv = RidgeCV()
    rcv.fit(cv_X_train, cv_y_train)
    y_hats = rcv.predict(cv_X_test)
    
    y_hats_train = rcv.predict(cv_X_train)
    print("Train:", np.mean((y_hats_train - cv_y_train)**2),
          "Test:", np.mean((y_hats - cv_y_test)**2))
    
    all_ys.append(cv_y_test)
    all_y_hats.append(y_hats)
    
    all_ys_train.append(cv_y_train)
    all_y_hats_train.append(y_hats_train)

    
print("Avg Train: ", np.mean((np.hstack(all_ys_train) - np.hstack(all_y_hats_train))**2), end=" ")
print("Avg Test: ", np.mean((np.hstack(all_ys) - np.hstack(all_y_hats))**2))

## MLP

In [None]:
one_hot = False

In [None]:
X_train = np.vstack(train_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
X_test = np.vstack(test_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))

In [None]:
from sklearn.neural_network import MLPRegressor

In [None]:
mlp = MLPRegressor(random_state=0, max_iter=5000)
mlp.fit(X_train, y_train)

In [None]:
y_pred_train = mlp.predict(X_train)
np.mean((y_pred_train - y_train)**2)

In [None]:
plt.scatter(y_train, y_pred_train, alpha=0.1)
plt.plot([y.min(), y.max()], [y.min(), y.max()], c="red")
plt.show()

In [None]:
y_test_pred = mlp.predict(X_test)
np.mean((y_test_pred - y_test)**2)

In [None]:
plt.scatter(y_test, y_test_pred, alpha=0.1)
plt.plot([y.min(), y.max()], [y.min(), y.max()], c="red")
plt.show()

In [None]:
seed_everything()
all_ys = []
all_y_hats = []
all_ys_train = []
all_y_hats_train = []

for train_indices, test_indices in KFold(n_splits=5, shuffle=True).split(df):
    
    cv_train_df = df.iloc[train_indices]
    cv_test_df = df.iloc[test_indices]
    
    
    cv_X_train = np.vstack(cv_train_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
    cv_X_test = np.vstack(cv_test_df.apply(lambda mol: make_simple_descriptor(mol, elements, one_hot=one_hot), axis=1))
    cv_y_train = y.iloc[train_indices] # cv_train_df.enthalpy_atom
    cv_y_test = y.iloc[test_indices] # cv_test_df.enthalpy_atom
    
    mlp = MLPRegressor(random_state=0, max_iter=5000)
    mlp.fit(cv_X_train, cv_y_train)
    y_hats = mlp.predict(cv_X_test)
    
    y_hats_train = mlp.predict(cv_X_train)
    print("Train:", np.mean((y_hats_train - cv_y_train)**2),
          "Test:", np.mean((y_hats - cv_y_test)**2))
    
    all_ys.append(cv_y_test)
    all_y_hats.append(y_hats)
    
    all_ys_train.append(cv_y_train)
    all_y_hats_train.append(y_hats_train)
    

print("Avg Train: ", np.mean((np.hstack(all_ys_train) - np.hstack(all_y_hats_train))**2), end=" ")
print("Avg Test: ", np.mean((np.hstack(all_ys) - np.hstack(all_y_hats))**2))