In [14]:
import pandas as pd
from ase.visualize import view as view_molecule
from ase.io import read as read_molecule
import numpy as np

In [2]:
# Load the labelled training split and display it as the cell output.
train_data = pd.read_csv(
    './nomad2018-predict-transparent-conductors/train.csv'
)
train_data

Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree,formation_energy_ev_natom,bandgap_energy_ev
0,1,33,80.0,0.6250,0.3750,0.0000,9.9523,8.5513,9.1775,90.0026,90.0023,90.0017,0.0680,3.4387
1,2,194,80.0,0.6250,0.3750,0.0000,6.1840,6.1838,23.6287,90.0186,89.9980,120.0025,0.2490,2.9210
2,3,227,40.0,0.8125,0.1875,0.0000,9.7510,5.6595,13.9630,90.9688,91.1228,30.5185,0.1821,2.7438
3,4,167,30.0,0.7500,0.0000,0.2500,5.0036,5.0034,13.5318,89.9888,90.0119,120.0017,0.2172,3.3492
4,5,194,80.0,0.0000,0.6250,0.3750,6.6614,6.6612,24.5813,89.9960,90.0006,119.9893,0.0505,1.3793
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,2396,33,40.0,0.7500,0.2500,0.0000,4.9469,8.5014,9.1298,90.0038,90.0023,90.0015,0.0381,3.7349
2396,2397,167,30.0,0.4167,0.5833,0.0000,4.9566,4.9562,13.4178,89.9938,90.0075,120.0007,0.0670,3.4915
2397,2398,206,80.0,0.4375,0.5625,0.0000,9.2204,9.2200,9.2199,90.0047,90.0046,89.9954,0.0906,3.2750
2398,2399,33,80.0,0.3125,0.1875,0.5000,10.6529,9.0954,9.7210,90.0015,89.9996,90.0004,0.2566,1.3915


In [9]:
# Load the test split and display it as the cell output.
test_data = pd.read_csv('./nomad2018-predict-transparent-conductors/test.csv')
test_data


Unnamed: 0,id,spacegroup,number_of_total_atoms,percent_atom_al,percent_atom_ga,percent_atom_in,lattice_vector_1_ang,lattice_vector_2_ang,lattice_vector_3_ang,lattice_angle_alpha_degree,lattice_angle_beta_degree,lattice_angle_gamma_degree
0,1,33,80.0,0.1875,0.4688,0.3438,10.5381,9.0141,9.6361,89.9997,90.0003,90.0006
1,2,33,80.0,0.7500,0.2500,0.0000,9.8938,8.5014,9.1298,90.0038,90.0023,90.0015
2,3,167,30.0,0.6667,0.1667,0.1667,4.9811,4.9808,13.4799,89.9900,90.0109,120.0014
3,4,12,80.0,0.5625,0.4375,0.0000,24.3370,6.0091,5.7620,89.9995,103.8581,90.0002
4,5,12,80.0,0.1875,0.5000,0.3125,24.6443,6.2906,6.1589,90.0000,104.5929,90.0001
...,...,...,...,...,...,...,...,...,...,...,...,...
595,596,12,80.0,0.0000,0.5938,0.4062,24.8145,6.3964,6.2933,90.0002,104.7733,90.0001
596,597,33,40.0,0.1250,0.0000,0.8750,5.5783,9.4849,10.1107,90.0008,89.9967,90.0004
597,598,194,80.0,0.0000,0.2500,0.7500,6.9377,6.9372,25.0641,90.0072,89.9880,119.9857
598,599,33,40.0,0.6250,0.0000,0.3750,5.1841,8.8659,9.4956,90.0041,90.0009,90.0007


Predict two things:
- Formation energy (`formation_energy_ev_natom`)
- Bandgap energy (`bandgap_energy_ev`)

Visualization tools:
- https://pymol.org/2/ is paid
- Jmol is open source

# Data meaning
- `spacegroup`: a combination of a Bravais lattice with different planes and rotations (symmetry operations); the number of such combinations is finite -> 230 space groups
    - each space group is unique, symmetry included
    - [Bravais lattices](https://www.wikiwand.com/en/Bravais_lattice): the fundamental unit/arrangement of a crystal (can be cubic, hexagonal, etc.)
        - 14 in total: crystal systems + centering options
    - space group numbers are mapped according to this list: https://www.wikiwand.com/en/List_of_space_groups

In [3]:
# Distinct space-group numbers that occur in the training split.
pd.unique(train_data['spacegroup'])

array([ 33, 194, 227, 167, 206,  12])

In [4]:
# Print every column label of the training frame, one per line.
for column_name in train_data.columns:
    print(column_name)

id
spacegroup
number_of_total_atoms
percent_atom_al
percent_atom_ga
percent_atom_in
lattice_vector_1_ang
lattice_vector_2_ang
lattice_vector_3_ang
lattice_angle_alpha_degree
lattice_angle_beta_degree
lattice_angle_gamma_degree
formation_energy_ev_natom
bandgap_energy_ev


In [17]:
def get_xyz_data(filename):
    """Parse a ``geometry.xyz``-style text file into atoms and lattice vectors.

    Each line is split on whitespace and dispatched on its first token:
    ``atom`` lines contribute an atom entry, ``lattice_vector`` lines
    contribute a lattice row. Every other line (including blank ones) is
    ignored.

    Parameters
    ----------
    filename : str or path-like
        Path of the file to read.

    Returns
    -------
    pos_data : list
        One ``[coords, symbol]`` pair per ``atom`` line, in file order, where
        ``coords`` is a float array built from tokens 1-3 and ``symbol`` is
        token 4 as a string.
    lat_data : numpy.ndarray
        Float array stacking tokens 1-3 of every ``lattice_vector`` line, in
        file order.
    """
    pos_data = []
    lat_data = []
    with open(filename) as f:
        # Iterate the handle directly; no need to materialise readlines().
        for line in f:
            x = line.split()
            # A blank or whitespace-only line splits to [], so x[0] would
            # raise IndexError -- skip such lines.
            if not x:
                continue
            if x[0] == 'atom':
                pos_data.append([np.array(x[1:4], dtype=float), x[4]])
            elif x[0] == 'lattice_vector':
                lat_data.append(np.array(x[1:4], dtype=float))
    return pos_data, np.array(lat_data)


# Try the parser on the geometry file of training sample 1.
get_xyz_data(
    './nomad2018-predict-transparent-conductors/train/1/geometry.xyz'
)


([[array([1.60888794, 7.27641622, 6.38315519]), 'Ga'],
  [array([6.5849318 , 7.2527921 , 6.36582623]), 'Al'],
  [array([3.43575841, 1.25977584, 1.79461265]), 'Al'],
  [array([8.41180227, 1.23615172, 1.77728368]), 'Ga'],
  [array([0.95588031, 2.99892522, 1.80135131]), 'Ga'],
  [array([5.93192417, 2.9753011 , 1.78402234]), 'Al'],
  [array([4.08876604, 5.53726684, 6.37641653]), 'Al'],
  [array([9.0648099 , 5.51364272, 6.35908757]), 'Al'],
  [array([ 0.91723977,  5.62244433, -0.00937511]), 'Al'],
  [array([ 5.89328363,  5.59882021, -0.02670407]), 'Ga'],
  [array([4.11494027, 2.90993936, 4.57123193]), 'Ga'],
  [array([9.09098413, 2.88631524, 4.55390297]), 'Al'],
  [array([1.6195002 , 1.34833524, 4.58165063]), 'Al'],
  [array([6.59554406, 1.32471112, 4.56432166]), 'Ga'],
  [array([3.44432022, 7.19371437, 9.15764532]), 'Al'],
  [array([8.42036408, 7.17009025, 9.14031635]), 'Ga'],
  [array([0.86879359, 0.26222265, 9.1650644 ]), 'Ga'],
  [array([5.84483745, 0.23859853, 9.14773544]), 'Al'],
  [a