# Practice with Crystal Structure Prediction
## Christian Jorgensen
## January 12, 2024

### This notebook includes me playing around with the data from Rosy's paper "A data-driven interpretation of the stability of organic molecular crystals" from 2023.

In [9]:
from ase.io import read
from ase.visualize import view

Our goal is to predict crystal lattice energy given molecular structure. Rosy has provided 3 files as part of her work:

'all_crystals.xyz' gives the relaxed geometries and properties for 2,707 organic crystals.

'all_relaxed_molecules.xyz' gives the relaxed geometries and properties for 3,242 organic molecules.

'all_relaxed_molecules_tagged' gives the same information as 'all_relaxed_molecules.xyz', along with the structural motifs the molecules have.

The xyz files can be loaded using ASE (Atomic Simulation Environment). Loading these creates 'Atoms' objects, which can be interacted with in a lot of interesting ways.

In [12]:
crystal = read('all_crystals.xyz') #automatically loads last file, can add idx.

In [15]:
view(crystal) #Creates popup containing 3D crystal geometry

<Popen: returncode: None args: ['C:\\Users\\cajch\\anaconda3\\python.exe', '...>

In [18]:
crystal.positions #Get atom positions

array([[ 2.05236 ,  7.65926 ,  1.48623 ],
       [ 6.20346 ,  1.14229 ,  6.3228  ],
       [ 1.50045 ,  6.87725 ,  4.53659 ],
       [ 6.75538 ,  1.92431 ,  3.27243 ],
       [ 5.70339 ,  6.79393 ,  3.69392 ],
       [ 2.55243 ,  2.00762 ,  4.1151  ],
       [ 4.66855 ,  5.95589 ,  3.83304 ],
       [ 3.58727 ,  2.84566 ,  3.97598 ],
       [ 4.68612 ,  5.29936 ,  4.62709 ],
       [ 3.5697  ,  3.5022  ,  3.18193 ],
       [ 3.9846  ,  5.8303  ,  3.07756 ],
       [ 4.27123 ,  2.97125 ,  4.73146 ],
       [ 3.2199  ,  4.77618 ,  1.84675 ],
       [ 5.03592 ,  4.02538 ,  5.96227 ],
       [ 1.20008 ,  3.85813 ,  1.28671 ],
       [ 7.05575 ,  4.94342 ,  6.52231 ],
       [ 1.37902 ,  3.07002 ,  1.95651 ],
       [ 6.8768  ,  5.73153 ,  5.85252 ],
       [ 5.37453 ,  4.05421 ,  1.03182 ],
       [ 2.88129 ,  4.74735 ,  6.7772  ],
       [ 5.67111 ,  7.71002 ,  2.47348 ],
       [ 2.58472 ,  1.09153 ,  5.33554 ],
       [ 4.99862 ,  0.640493,  2.85803 ],
       [ 3.2572  ,  8.16106 ,  4.9

In [22]:
crystal.numbers #Get atomic numbers of atoms in the crystal

array([16, 16,  8,  8,  6,  6,  7,  7,  1,  1,  1,  1,  8,  8,  7,  7,  1,
        1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6])

In [25]:
crystal.get_global_number_of_atoms() #Get number of atoms in the crystal

34

In [27]:
crystal.get_chemical_symbols() #Get list of chemical symbols

['S',
 'S',
 'O',
 'O',
 'C',
 'C',
 'N',
 'N',
 'H',
 'H',
 'H',
 'H',
 'O',
 'O',
 'N',
 'N',
 'H',
 'H',
 'H',
 'H',
 'C',
 'C',
 'H',
 'H',
 'H',
 'H',
 'C',
 'C',
 'H',
 'H',
 'H',
 'H',
 'C',
 'C']

In [28]:
crystal.get_chemical_formula() #Get chemical formula

'C8H16N4O4S2'

In [34]:
crystal.symbols #Get string of chemical symbols

Symbols('S2O2C2N2H4O2N2H4C2H4C2H4C2')

In [35]:
crystal.info #Get important crystal info that's available as part of the file

{'dft_energy_ryd': -522.550312,
 'dft_cohesive_energy_relaxed_configs_ryd': -0.3732642399999122,
 'mol_indices': 3241,
 'crystal_idx': 2821,
 'CCDC_ID': 'QOCNAX01'}

In [19]:
crystal.arrays #Get a bunch of information

{'numbers': array([16, 16,  8,  8,  6,  6,  7,  7,  1,  1,  1,  1,  8,  8,  7,  7,  1,
         1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6]),
 'positions': array([[ 2.05236 ,  7.65926 ,  1.48623 ],
        [ 6.20346 ,  1.14229 ,  6.3228  ],
        [ 1.50045 ,  6.87725 ,  4.53659 ],
        [ 6.75538 ,  1.92431 ,  3.27243 ],
        [ 5.70339 ,  6.79393 ,  3.69392 ],
        [ 2.55243 ,  2.00762 ,  4.1151  ],
        [ 4.66855 ,  5.95589 ,  3.83304 ],
        [ 3.58727 ,  2.84566 ,  3.97598 ],
        [ 4.68612 ,  5.29936 ,  4.62709 ],
        [ 3.5697  ,  3.5022  ,  3.18193 ],
        [ 3.9846  ,  5.8303  ,  3.07756 ],
        [ 4.27123 ,  2.97125 ,  4.73146 ],
        [ 3.2199  ,  4.77618 ,  1.84675 ],
        [ 5.03592 ,  4.02538 ,  5.96227 ],
        [ 1.20008 ,  3.85813 ,  1.28671 ],
        [ 7.05575 ,  4.94342 ,  6.52231 ],
        [ 1.37902 ,  3.07002 ,  1.95651 ],
        [ 6.8768  ,  5.73153 ,  5.85252 ],
        [ 5.37453 ,  4.05421 ,  1.03182 ],
     

In [40]:
crystal.todict()

{'numbers': array([16, 16,  8,  8,  6,  6,  7,  7,  1,  1,  1,  1,  8,  8,  7,  7,  1,
         1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6,  1,  1,  1,  1,  6,  6]),
 'positions': array([[ 2.05236 ,  7.65926 ,  1.48623 ],
        [ 6.20346 ,  1.14229 ,  6.3228  ],
        [ 1.50045 ,  6.87725 ,  4.53659 ],
        [ 6.75538 ,  1.92431 ,  3.27243 ],
        [ 5.70339 ,  6.79393 ,  3.69392 ],
        [ 2.55243 ,  2.00762 ,  4.1151  ],
        [ 4.66855 ,  5.95589 ,  3.83304 ],
        [ 3.58727 ,  2.84566 ,  3.97598 ],
        [ 4.68612 ,  5.29936 ,  4.62709 ],
        [ 3.5697  ,  3.5022  ,  3.18193 ],
        [ 3.9846  ,  5.8303  ,  3.07756 ],
        [ 4.27123 ,  2.97125 ,  4.73146 ],
        [ 3.2199  ,  4.77618 ,  1.84675 ],
        [ 5.03592 ,  4.02538 ,  5.96227 ],
        [ 1.20008 ,  3.85813 ,  1.28671 ],
        [ 7.05575 ,  4.94342 ,  6.52231 ],
        [ 1.37902 ,  3.07002 ,  1.95651 ],
        [ 6.8768  ,  5.73153 ,  5.85252 ],
        [ 5.37453 ,  4.05421 ,  1.03182 ],
     

# Extracting Information

### From the crystals file, need to get total energy, lattice energy, crystal index, and corresponding molecule index

In [63]:
from tqdm import trange

total_energies, lattice_energies, crystal_indices, corr_mol_indices = [], [], [], []

for i in trange(0, 2707): #missing crystal indices belong to test set?
    
    crystal = read('all_crystals.xyz', i)
    
    total_energies.append(crystal.info['dft_energy_ryd'])
    lattice_energies.append(crystal.info['dft_cohesive_energy_relaxed_configs_ryd'])
    crystal_indices.append(crystal.info['crystal_idx'])
    corr_mol_indices.append(crystal.info['mol_indices'])
    

100%|██████████| 2707/2707 [08:04<00:00,  5.58it/s]


In [64]:
len(total_energies)

2707

### From the molecules file, need to get atom positions and atom identities

In [68]:
atom_pos, atom_symbols, mol_indices, corr_crystal_indices = [], [], [], []
for i in trange(3242):

    molecule = read('all_relaxed_molecules.xyz', i)
    
    atom_pos.append(molecule.arrays['positions'])
    atom_symbols.append(molecule.arrays['numbers'])
    mol_indices.append(molecule.info['molecule_idx'])
    corr_crystal_indices.append(molecule.info['crystal_idx'])

100%|██████████| 3242/3242 [07:48<00:00,  6.92it/s]
