In [1]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np
import random


from ase.cell import Cell
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix, MBTR

In [2]:
# Root directory of the dataset. train.csv, test.csv and the per-structure
# train/<id>/geometry.xyz files are all read relative to this path.
DATA_PATH = './nomad2018-predict-transparent-conductors'


In [3]:
# Load the training table and show it as the cell output.
train_csv_path = f'{DATA_PATH}/train.csv'
train_data = pd.read_csv(train_csv_path)
train_data


Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2396,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,0.0381,3.7349
2396,2397,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,0.0670,3.4915
2397,2398,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,0.0906,3.2750
2398,2399,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,0.2566,1.3915


In [4]:
# Load the test table and show it as the cell output.
test_csv_path = f'{DATA_PATH}/test.csv'
test_data = pd.read_csv(test_csv_path)
test_data



Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
0,1,33,80.0,0.1875,0.4688,0.3438,10.5381,9.0141,9.6361,89.9997,90.0003,90.0006
1,2,33,80.0,0.7500,0.2500,0.0000,9.8938,8.5014,9.1298,90.0038,90.0023,90.0015
2,3,167,30.0,0.6667,0.1667,0.1667,4.9811,4.9808,13.4799,89.9900,90.0109,120.0014
3,4,12,80.0,0.5625,0.4375,0.0000,24.3370,6.0091,5.7620,89.9995,103.8581,90.0002
4,5,12,80.0,0.1875,0.5000,0.3125,24.6443,6.2906,6.1589,90.0000,104.5929,90.0001
...,...,...,...,...,...,...,...,...,...,...,...,...
595,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001
596,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004
597,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857
598,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007


# Visualising the molecule

In [5]:
def get_xyz_data(filename, noise_scale=0.001):
    """Parse a ``geometry.xyz`` file into atom records and lattice vectors.

    Only lines whose first whitespace-separated token is ``atom`` or
    ``lattice_vector`` are used; every other line (blank lines, comments,
    unknown keywords) is skipped.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    noise_scale : float, optional
        Upper bound of the uniform jitter in ``[0, noise_scale)`` that is
        added independently to each atom coordinate. The default keeps the
        previously hard-coded ``0.001``; pass ``0`` to get the coordinates
        exactly as written in the file.

    Returns
    -------
    pos_data : list of [numpy.ndarray, str]
        One ``[position, symbol]`` pair per ``atom`` line, in file order.
    lat_data : numpy.ndarray
        One row per ``lattice_vector`` line, in file order.
    """
    pos_data = []
    lat_data = []
    with open(filename) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            x = line.split()
            if not x:
                # Blank line: split() returns [], so x[0] would raise IndexError.
                continue
            if x[0] == 'atom':
                # Jitter is drawn only for atom lines. It comes from the global
                # `random` state, so call random.seed(...) beforehand when
                # reproducible coordinates are needed.
                noise = np.array(
                    [random.random() * noise_scale for _ in range(3)])
                pos_data.append([np.array(x[1:4], dtype=float) + noise, x[4]])
            elif x[0] == 'lattice_vector':
                lat_data.append(np.array(x[1:4], dtype=float))
    return pos_data, np.array(lat_data)


# Parse one training structure (id 10) and render it as a quick visual check.
xyzdata = get_xyz_data(f'{DATA_PATH}/train/10/geometry.xyz')
pos_data = xyzdata[0]
atoms = []
for pos, sym in pos_data:
    atoms.append(ase.Atom(symbol=sym, position=pos))
mol = ase.Atoms(atoms)
# NOTE: bonds are not computed automatically here (unlike in jmol),
# so the rendering does not look the same as it does in jmol.
view_molecule(mol, viewer='x3d')


# Adding new features to the training data
- CoulombMatrix
- SineMatrix
- EwaldSumMatrix

In [6]:
# Sanity check: `+` on two NumPy arrays adds element-wise (it does not concatenate).
np.array([1,2,3])+np.array([1,2,3])

array([2, 4, 6])

In [7]:
def get_xyz_data(filename, noise_scale=0.001):
    '''Parse a ``geometry.xyz`` file into atom records and lattice vectors.

    The function only returns the parsed data; it does not build a molecule
    itself. The lattice vectors are returned alongside the atom records so
    that a caller can attach them to a structure it creates.

    Only lines whose first whitespace-separated token is ``atom`` or
    ``lattice_vector`` are used; every other line (blank lines, comments,
    unknown keywords) is skipped.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.
    noise_scale : float, optional
        Upper bound of the uniform jitter in ``[0, noise_scale)`` that is
        added independently to each atom coordinate. The default keeps the
        previously hard-coded ``0.001``; pass ``0`` to get the coordinates
        exactly as written in the file.

    Returns
    -------
    pos_data : list of [numpy.ndarray, str]
        One ``[position, symbol]`` pair per ``atom`` line, in file order.
    lat_data : numpy.ndarray
        One row per ``lattice_vector`` line, in file order.
    '''
    pos_data = []
    lat_data = []
    with open(filename) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            x = line.split()
            if not x:
                # Blank line: split() returns [], so x[0] would raise IndexError.
                continue
            if x[0] == 'atom':
                # Jitter is drawn only for atom lines. It comes from the global
                # `random` state, so call random.seed(...) beforehand when
                # reproducible coordinates are needed.
                noise = np.array(
                    [random.random() * noise_scale for _ in range(3)])
                pos_data.append([np.array(x[1:4], dtype=float) + noise, x[4]])
            elif x[0] == 'lattice_vector':
                lat_data.append(np.array(x[1:4], dtype=float))
    return pos_data, np.array(lat_data)


# Smoke test: parse the geometry file of training structure 10 and display the result.
get_xyz_data(f'{DATA_PATH}/train/10/geometry.xyz')


([[array([-0.0543326 , -0.02404889,  5.9484366 ]), 'Ga'],
  [array([-0.16424224, -0.07213884, 17.84425242]), 'Al'],
  [array([4.59979677, 2.74209944, 6.00232658]), 'Ga'],
  [array([ 4.48959484,  2.69347477, 17.89801032]), 'Ga'],
  [array([3.07153627, 0.01597498, 5.97783499]), 'Ga'],
  [array([ 2.96155913, -0.03317135, 17.87302465]), 'Ga'],
  [array([1.47430535, 2.70284211, 5.97303468]), 'Ga'],
  [array([ 1.36429075,  2.65438222, 17.86898084]), 'Al'],
  [array([0.00079433, 0.00072862, 0.00050627]), 'Ga'],
  [array([-0.10913479, -0.04766758, 11.89620726]), 'Ga'],
  [array([4.65432806, 2.76640647, 0.05351241]), 'Ga'],
  [array([ 4.54533771,  2.71769307, 11.94964877]), 'Al'],
  [array([3.12590775, 0.03969516, 0.0295076 ]), 'Ga'],
  [array([ 3.01664162e+00, -8.93523950e-03,  1.19258510e+01]), 'Al'],
  [array([1.52867924, 2.72714222, 0.02445334]), 'Al'],
  [array([ 1.41915919,  2.67829504, 11.92102504]), 'Ga'],
  [array([-0.04997939,  1.79299332,  2.98100553]), 'Ga'],
  [array([-0.15920206, 

In [9]:
# Largest atom count over all training structures; passed as `n_atoms_max`
# to the descriptor constructors in the following cells.
n_atoms_max = int(train_data['number_of_total_atoms'].max())
n_atoms_max

80

In [10]:
# get_xyz_data adds jitter drawn from the global `random` generator. Seeding
# it here makes `molecules` -- and every descriptor derived from it --
# identical each time this cell is run on the same files.
NOISE_SEED = 0
random.seed(NOISE_SEED)

molecules = []
for idx in train_data['id']:
    path = f'{DATA_PATH}/train/{idx}/geometry.xyz'
    pos_data, lattice_vector_data = get_xyz_data(path)
    # Build the structure from the parsed (position, symbol) pairs ...
    atoms = [ase.Atom(symbol=sym, position=pos) for pos, sym in pos_data]
    mol = ase.Atoms(atoms)
    # ... and attach the unit cell spanned by the parsed lattice vectors.
    # NOTE(review): periodic boundary flags are not set here; confirm whether
    # the periodic descriptors used later need mol.set_pbc(True).
    mol.set_cell(Cell(lattice_vector_data))
    molecules.append(mol)

In [11]:
# Coulomb-matrix descriptor for every structure in `molecules`.
# n_jobs=4 matches the SineMatrix and EwaldSumMatrix cells so that all three
# descriptors are computed with the same degree of parallelism.
coulomb_matrices = CoulombMatrix(
    n_atoms_max=n_atoms_max,
    permutation="sorted_l2",
).create(molecules, n_jobs=4)


In [12]:
# Configure the sine-matrix descriptor, then evaluate it for every structure
# in `molecules` with n_jobs=4.
sine_descriptor = SineMatrix(n_atoms_max=n_atoms_max, permutation="sorted_l2")
sine_matrices = sine_descriptor.create(molecules, n_jobs=4)


In [13]:
# Configure the Ewald-sum-matrix descriptor, then evaluate it for every
# structure in `molecules` with n_jobs=4.
ewald_descriptor = EwaldSumMatrix(n_atoms_max=n_atoms_max, permutation="sorted_l2")
ewald_sum_matrices = ewald_descriptor.create(molecules, n_jobs=4)


In [14]:
# Number of Coulomb-matrix rows produced; this should equal len(molecules).
len(coulomb_matrices)

2400

In [15]:
# Show the training table as it is before the descriptor columns are added.
train_data

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2396,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,0.0381,3.7349
2396,2397,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,0.0670,3.4915
2397,2398,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,0.0906,3.2750
2398,2399,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,0.2566,1.3915


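Each descriptor row is converted to a plain Python list before assignment so that it can be written to a CSV file. If the descriptors are only used in memory (and never written to disk), `list(coulomb_matrices)` can be assigned directly instead.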

In [16]:
def _as_float_lists(matrix):
    """Return one list of built-in Python floats per row of `matrix`."""
    # ndarray.tolist() converts NumPy scalars to built-in floats. If the
    # elements stayed NumPy scalars, the list text written by to_csv would
    # contain "np.float64(...)" under NumPy >= 2, which float() cannot parse
    # when the file is read back.
    return [np.asarray(row, dtype=float).tolist() for row in matrix]


# One list-valued cell per training row for each descriptor.
train_data['CoulombMatrix'] = _as_float_lists(coulomb_matrices)
train_data['SineMatrix'] = _as_float_lists(sine_matrices)
train_data['EwaldSumMatrix'] = _as_float_lists(ewald_sum_matrices)


## Write data to a file and test i/o

In [17]:
# Persist the augmented training table; index=False keeps the row index out of the file.
train_data.to_csv(f'{DATA_PATH}/train_extrainfo.csv', index=False)

In [18]:
def custom_converter(entry):
    """Parse the text of a list-valued CSV cell into a 1-D float array.

    Parameters
    ----------
    entry : str
        Cell text of the form ``"[v0, v1, ...]"``. Surrounding whitespace is
        ignored, and values written as ``np.float64(v)`` are accepted as well.

    Returns
    -------
    numpy.ndarray
        Float array with one element per comma-separated value; an empty
        array for ``"[]"`` or an empty cell.

    Raises
    ------
    ValueError
        If a value cannot be converted with ``float()``.
    """
    # Drop the enclosing brackets (first and last character after trimming).
    inner = entry.strip()[1:-1]
    if not inner.strip():
        # "".split(',') yields [''], and float('') raises, so an empty list
        # has to be handled before splitting.
        return np.array([], dtype=float)

    wrapper = 'np.float64('
    values = []
    for token in inner.split(','):
        token = token.strip()
        # NumPy >= 2 writes float64 scalars inside a list as "np.float64(1.0)".
        if token.startswith(wrapper) and token.endswith(')'):
            token = token[len(wrapper):-1]
        values.append(float(token))
    return np.array(values)

In [19]:
# Read the augmented table back, decoding each of the three descriptor
# columns with custom_converter.
descriptor_columns = ['CoulombMatrix', 'SineMatrix', 'EwaldSumMatrix']
d = pd.read_csv(
    f'{DATA_PATH}/train_extrainfo.csv',
    converters={column: custom_converter for column in descriptor_columns},
)

In [20]:
# Round-trip check: show the first CoulombMatrix entry as decoded by custom_converter.
d['CoulombMatrix'][0]

array([121.1169616 ,  75.93246627, 122.12105516, ...,   5.98180385,
         5.76648083,  22.95621302])