In [1]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import ase
import numpy as np


from ase.build import molecule
from dscribe.descriptors import CoulombMatrix, SineMatrix, EwaldSumMatrix

In [2]:
# Training table; per the displayed columns it also carries
# formation_energy_ev_natom and bandgap_energy_ev.
train_data = pd.read_csv(
    './nomad2018-predict-transparent-conductors/train.csv'
)
train_data

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2396,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,0.0381,3.7349
2396,2397,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,0.0670,3.4915
2397,2398,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,0.0906,3.2750
2398,2399,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,0.2566,1.3915


In [3]:
# Test table, read from the competition's test.csv and displayed below.
test_data = pd.read_csv('./nomad2018-predict-transparent-conductors/test.csv')
test_data


Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
0,1,33,80.0,0.1875,0.4688,0.3438,10.5381,9.0141,9.6361,89.9997,90.0003,90.0006
1,2,33,80.0,0.7500,0.2500,0.0000,9.8938,8.5014,9.1298,90.0038,90.0023,90.0015
2,3,167,30.0,0.6667,0.1667,0.1667,4.9811,4.9808,13.4799,89.9900,90.0109,120.0014
3,4,12,80.0,0.5625,0.4375,0.0000,24.3370,6.0091,5.7620,89.9995,103.8581,90.0002
4,5,12,80.0,0.1875,0.5000,0.3125,24.6443,6.2906,6.1589,90.0000,104.5929,90.0001
...,...,...,...,...,...,...,...,...,...,...,...,...
595,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001
596,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004
597,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857
598,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007


In [4]:
# Distinct values of the spacegroup column in the training table.
train_data.loc[:, 'spacegroup'].unique()

array([ 33, 194, 227, 167, 206,  12], dtype=int64)

In [5]:
def get_xyz_data(filename):
    """Parse atom positions and lattice vectors from a geometry file.

    Only lines whose first whitespace-separated token is ``atom`` or
    ``lattice_vector`` are used; every other line, including blank or
    whitespace-only lines, is skipped.

    Parameters
    ----------
    filename : str or path-like
        Path of the geometry file to read.

    Returns
    -------
    pos_data : list
        One ``[position, symbol]`` entry per ``atom`` line. ``position`` is a
        float numpy array built from the three tokens following the keyword
        and ``symbol`` is the fifth token, kept as a string.
    lat_data : numpy.ndarray
        The ``lattice_vector`` rows (three floats each) stacked in file order.
    """
    pos_data = []
    lat_data = []
    with open(filename) as f:
        # Iterate the file lazily instead of materialising it with readlines().
        for line in f:
            x = line.split()
            # A blank line splits to an empty list, so indexing x[0] would
            # raise IndexError; skip such lines.
            if not x:
                continue
            if x[0] == 'atom':
                pos_data.append([np.array(x[1:4], dtype=float), x[4]])
            elif x[0] == 'lattice_vector':
                lat_data.append(np.array(x[1:4], dtype=float))
    return pos_data, np.array(lat_data)


# Try the parser on one training structure (folder 10) and show the result.
get_xyz_data(
    './nomad2018-predict-transparent-conductors/train/10/geometry.xyz')


([[array([-0.05487074, -0.02429496,  5.9479779 ]), 'Ga'],
  [array([-0.16461223, -0.07288488, 17.84393368]), 'Al'],
  [array([4.59925077, 2.74195764, 6.00133319]), 'Ga'],
  [array([ 4.48950929,  2.69336772, 17.89728898]), 'Ga'],
  [array([3.07074593, 0.01523894, 5.9770041 ]), 'Ga'],
  [array([ 2.96100444, -0.03335098, 17.87295989]), 'Ga'],
  [array([1.4736341 , 2.70242374, 5.97230698]), 'Ga'],
  [array([ 1.36389262,  2.65383382, 17.86826277]), 'Al'],
  [array([0., 0., 0.]), 'Ga'],
  [array([-0.10974148, -0.04858992, 11.89595579]), 'Ga'],
  [array([4.65412151, 2.7662526 , 0.05335529]), 'Ga'],
  [array([ 4.54438003,  2.71766268, 11.94931108]), 'Al'],
  [array([3.12561667, 0.0395339 , 0.0290262 ]), 'Ga'],
  [array([ 3.01587519e+00, -9.05602000e-03,  1.19249820e+01]), 'Al'],
  [array([1.52850484, 2.7267187 , 0.02432909]), 'Al'],
  [array([ 1.41876336,  2.67812878, 11.92028488]), 'Ga'],
  [array([-0.05030208,  1.79230656,  2.98053228]), 'Ga'],
  [array([-0.16004356,  1.74371664, 14.87648807

In [6]:
# Parsed (positions, lattice vectors) of training structure 1.
xyzdata = get_xyz_data('./nomad2018-predict-transparent-conductors/train/1/geometry.xyz')

In [7]:
# First element of the parser result: the list of [position, symbol] entries.
pos_data = xyzdata[0]
# One ase.Atom per parsed entry.
atoms = [
    ase.Atom(symbol=element, position=coords)
    for coords, element in pos_data
]

In [8]:
# Assemble the individual atoms into a single ase.Atoms object.
# NOTE(review): only symbols/positions are passed, so the recorded repr shows
# pbc=False and the parsed lattice vectors (xyzdata[1]) are unused here —
# confirm that a non-periodic structure is intended.
mol = ase.Atoms(symbols=atoms)
mol

Atoms(symbols='Al20Ga12O48', pbc=False)

In [9]:
# Render the structure inline with the x3d viewer.
# Unlike Jmol, bonds are not computed automatically here, so the picture
# does not look the same as a Jmol rendering of the structure.
view_molecule(mol, viewer='x3d')  

## feature descriptors

In [10]:
# Parse the geometry files of training structures 1, 2 and 3 (in that order).
xyzdata_1, xyzdata_2, xyzdata_3 = map(
    get_xyz_data,
    (
        './nomad2018-predict-transparent-conductors/train/1/geometry.xyz',
        './nomad2018-predict-transparent-conductors/train/2/geometry.xyz',
        './nomad2018-predict-transparent-conductors/train/3/geometry.xyz',
    ),
)

In [11]:
# Keep only the [position, symbol] lists (element 0 of each parser result).
pos_data_1, pos_data_2, pos_data_3 = (
    parsed[0] for parsed in (xyzdata_1, xyzdata_2, xyzdata_3)
)

In [12]:
# Build the ase.Atoms object for structure 1 from its parsed entries.
atoms_1 = [
    ase.Atom(symbol=element, position=coords)
    for coords, element in pos_data_1
]
mol_1 = ase.Atoms(symbols=atoms_1)
mol_1

Atoms(symbols='Al20Ga12O48', pbc=False)

In [13]:
# Per-atom objects for structures 2 and 3 ...
atoms_2 = [
    ase.Atom(symbol=element, position=coords)
    for coords, element in pos_data_2
]
atoms_3 = [
    ase.Atom(symbol=element, position=coords)
    for coords, element in pos_data_3
]
# ... and the corresponding ase.Atoms collections.
mol_2 = ase.Atoms(symbols=atoms_2)
mol_3 = ase.Atoms(symbols=atoms_3)

In [14]:
# Structures to featurise.
samples = [mol_1, mol_2, mol_3]

# Coulomb-matrix descriptor sized for up to 80 atoms, rows/columns ordered by
# L2 norm. The recorded outputs show one flat row of 6400 (= 80 * 80) values
# per structure, with trailing zeros for the third sample.
cm_desc = CoulombMatrix(n_atoms_max=80, permutation="sorted_l2")

# Descriptor of a single structure.
coulomb_matrix = cm_desc.create(samples[0])

# Descriptors of all samples, computed once with 3 parallel jobs. The earlier
# serial create(samples) call was dropped: its result was immediately
# overwritten by this one, so it only doubled the work.
coulomb_matrices = cm_desc.create(samples, n_jobs=3)

In [15]:
# Display the descriptors computed for the three samples (one row each).
coulomb_matrices

array([[121.10109816,  75.93020649, 122.09373049, ...,   5.9815775 ,
          5.76657693,  22.9555197 ],
       [106.31634632, 155.7382364 , 139.21880573, ...,   2.89524711,
          2.81373206,   2.95362712],
       [ 76.85202206,  82.65821557,  36.0064848 , ...,   0.        ,
          0.        ,   0.        ]])

In [16]:
# Length of the descriptor row of the third sample.
len(coulomb_matrices[2])

6400

In [17]:
# Length of the descriptor row of the first sample, for comparison with the third.
len(coulomb_matrices[0])

6400