### Program written by Pablo Sánchez-Palencia, 2022
Built-up on the basis of the code previously written by Scott Midgley.

Scope: To ingest VASP energies from .csv format, cluster correlation functions from the corresponding .txt and generate Coulomb matrix eigenspectrum (CME) from POSCAR structure files. Output  saved as .pkl file, ready for machine learning models.

In [1]:
# Import modules
import pandas as pd
import numpy as np
from pymatgen.io.ase import AseAtomsAdaptor as AAA
from matminer.featurizers import structure as sf
from dscribe.descriptors import CoulombMatrix
from ase.io import read
from numpy.linalg import eig
import time
import os

In [2]:
# Read DFT derived energies from .csv file to data frame.
energies = pd.read_csv('../../repository_data/vasp-energies.csv', header=None)
energies.columns = ['tag','inv','SCF', 'BGE']

In [3]:
# Add predicted HSE-bandgaps, mixing formation energies and composition 
energies['HSE-corr']=1.08248836*energies['BGE']-0.04680858*energies['inv']+1.044979437918063   #Coefficients come from hse-data.py

ge=-381.21959448; sn=-357.66903362
energies['MixE']=energies['SCF']-1/3*ge-2/3*sn

In [4]:
#List of structures with DFT data to import in files
with open('../../repository_data/gga_structures_list.txt', "r") as obj_file: 
    file_check = obj_file.read().splitlines() 

In [5]:
# Iterate over structures in structure directory, generating CME for each configuration.
dirs=os.listdir('../../repository_data/structure_files')
files=[dirs[int(strct)] for strct in file_check] 

In [6]:
energies['tag']=file_check

In [7]:
#read correlation matrix file
with open('../../repository_data/correlation_matrix.txt', "r") as obj_file: 
    corr_mat=[]
    for line in obj_file: 
        corr_mat.append([int(x) for x in line.split()[1:]])
    corr_mat=np.array(corr_mat)
energies['CorrM']=list(corr_mat[list(map(int,file_check))])

In [8]:
cm_dscribe_list = []; cm_matminer_list=[]
cm_ds = CoulombMatrix(n_atoms_max=56,permutation="eigenspectrum")
cm_mm = sf.CoulombMatrix(flatten=True)
start_time = time.time()
for i,f in enumerate(files[:]):
    struct = read('../../repository_data/structure_files/'+f)
    struct.set_pbc([True,True,True])
    dscribe_matrix = cm_ds.create([struct])
    dscribe_matrix=np.real(dscribe_matrix)
    cm_dscribe_list.append(dscribe_matrix)
        
    struct = AAA.get_structure(struct)
    matminer_matrix = cm_mm.fit([struct])
    featurized_structure = matminer_matrix.featurize(struct)
    #featurized_structure=featurized_structure[0][:24,:24]    ###Change to featurized_matrix in the appending and flatten=False
    #featurized_matrix,v=eig(featurized_structure)
    #featurized_matrix=np.real(featurized_matrix)
    cm_matminer_list.append(np.sort(featurized_structure)[::-1])
    
    if i%200==0: print("ITER CHECKER: Structure",str(i).zfill(4)," charged")
    
print('Number of matrices read: ', len(cm_dscribe_list))
print("--- %s minutes ---" % ((time.time() - start_time)/60))

  zeros[: len(eigs)] = eigs


ITER CHECKER: Structure 0000  charged
ITER CHECKER: Structure 0200  charged
ITER CHECKER: Structure 0400  charged
ITER CHECKER: Structure 0600  charged
ITER CHECKER: Structure 0800  charged
ITER CHECKER: Structure 1000  charged
Number of matrices read:  1013
--- 4.387697279453278 minutes ---


In [9]:
# Add CME's to data frame with DFT energies. 
ener = energies.iloc[:len(cm_dscribe_list)]
ener["Coulomb_ds"] = cm_dscribe_list
ener["Coulomb_mm"] = cm_matminer_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ener["Coulomb_ds"] = cm_dscribe_list
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ener["Coulomb_mm"] = cm_matminer_list


In [10]:
# Shuffle data frame (optional).
ener = ener.sample(frac=1,random_state=38)

In [11]:
# Save data frame to .pkl file.
ener.to_pickle('../input_data_cm.pkl')