# Data Preprocessing 
## Swapping atom positions for featurizing

In [10]:
import pymatgen
from pymatgen.analysis.local_env import VoronoiNN

In [11]:
import numpy as np
from numpy import genfromtxt
from os import listdir
import time 
import pandas as pd
import random
import torch
import sklearn
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d, Axes3D 
from torch.autograd import Variable

# Compound names handled by this notebook. The same names are used as the
# sub-folder names under data_xyz/ in the find_xyz_filenames calls below.
# NOTE(review): `comp` itself is not referenced in the cells visible here.
comp = ['CoS','CuS','MnS','MoS','RuS','ScS','TiS','VS']


####################################################
#loading all xyz file names 
def find_xyz_filenames(path_to_dir, suffix=".xyz"):
    """Return the names of the entries in ``path_to_dir`` that end with ``suffix``.

    Only bare entry names are returned (no directory prefix), in the order
    in which ``os.listdir`` reports them.
    """
    matches = []
    for entry in listdir(path_to_dir):
        if entry.endswith(suffix):
            matches.append(entry)
    return matches

# One list of .xyz file names per compound folder under data_xyz/.
# The unpacking targets are in the same order as the folder names.
(CoS_filenames, CuS_filenames, MnS_filenames, MoS_filenames,
 RuS_filenames, ScS_filenames, TiS_filenames, VS_filenames) = [
    find_xyz_filenames('data_xyz/' + compound)
    for compound in ('CoS', 'CuS', 'MnS', 'MoS', 'RuS', 'ScS', 'TiS', 'VS')
]


#####################################################
#loading the excel file containing the adsorption site and Gibbs free energy
def find_xlsx_filenames( path_to_dir, suffix=".xlsx" ):
    """Return the sorted names of the entries in ``path_to_dir`` ending with ``suffix``.

    Only bare entry names are returned (no directory prefix).

    ``os.listdir`` reports entries in arbitrary order, and the caller picks
    the first element of the result as the workbook to load. Sorting makes
    that choice reproducible when the directory holds more than one
    matching entry.
    """
    return sorted(
        filename for filename in listdir(path_to_dir) if filename.endswith(suffix)
    )

xlsx_filenames = find_xlsx_filenames('data')
if not xlsx_filenames:
    # Fail with a clear message instead of an IndexError on xlsx_filenames[0].
    raise FileNotFoundError("no .xlsx file found in 'data'")


######################################################
# Loading the per-compound data from the first workbook found in data/.
# For every sheet, column 0 is stored as delta_g_<compound> and column 1 as
# ads_<compound> (the workbook holds the Gibbs free energy and the
# adsorption site, per the comment above find_xlsx_filenames).
path ='data/'+xlsx_filenames[0]

# Read all eight sheets with one read_excel call instead of re-opening the
# workbook once per compound. Passing a list of sheet positions returns a
# dict that maps each position to its DataFrame.
sheets = pd.read_excel(path, sheet_name=[0, 1, 2, 3, 4, 5, 6, 7])

# NOTE(review): the sheet positions do not follow the order of `comp`
# (sheet 3 is TiS, sheet 4 is RuS, sheet 5 is MoS, sheet 6 is ScS) --
# confirm this matches the sheet order of the workbook.
df = sheets[0]
delta_g_CoS = df[df.columns[0]]
ads_CoS = df[df.columns[1]]

df = sheets[1]
delta_g_CuS = df[df.columns[0]]
ads_CuS = df[df.columns[1]]

df = sheets[2]
delta_g_MnS = df[df.columns[0]]
ads_MnS = df[df.columns[1]]

df = sheets[3]
delta_g_TiS = df[df.columns[0]]
ads_TiS = df[df.columns[1]]

df = sheets[4]
delta_g_RuS = df[df.columns[0]]
ads_RuS = df[df.columns[1]]

df = sheets[5]
delta_g_MoS = df[df.columns[0]]
ads_MoS = df[df.columns[1]]

df = sheets[6]
delta_g_ScS = df[df.columns[0]]
ads_ScS = df[df.columns[1]]

# `df` is left bound to the last sheet, as in the original cell.
df = sheets[7]
delta_g_VS = df[df.columns[0]]
ads_VS = df[df.columns[1]]

In [12]:
###atom swapping

import numpy as np
from ase.io import read,write

# Number of times swappingdata() iterates over the full set of structures;
# every pass writes one more copy of each swapped structure.
# Original author's note: the repeats are for creating a Gaussian distribution
# of the data while featurizing (that step is not part of this cell).
repeat_no = 30

# Atom swapping for all xyz files,
# using the ads_* columns (read with pandas) and the *_filenames lists (built with listdir) from the first cell


def swappingdata(filenames,ads, stringname,repeat_no):
    """Swap atom 0 with atom ``ads[i]`` in every structure and write the copies.

    For each ``i`` in ``range(len(filenames))`` the structure
    ``data_xyz/<stringname>/<i>.xyz`` is read and repeated 2x2x1. Atom 0 and
    atom ``ads[i]`` then exchange both chemical symbol and position. The
    result is written ``repeat_no`` times to
    ``swapped_data_for_featurizing/<stringname>/<r>.xyz``, where
    ``r = k * len(filenames) + i`` for repeat ``k``. This is the same
    numbering a repeat-major loop over all files would produce.

    Parameters
    ----------
    filenames : sized collection
        Only its length is used; input files are addressed as ``<i>.xyz``.
    ads : indexable (e.g. pandas Series or list)
        ``ads[i]`` is the index of the atom swapped with atom 0 in
        structure ``i``.
        NOTE(review): presumably the 0-based adsorption-site index --
        confirm against the workbook.
    stringname : str
        Compound name, used as the sub-folder name for input and output.
    repeat_no : int
        Number of copies written per structure.

    Raises
    ------
    ValueError
        If ``ads[i]`` is not an integral value (e.g. NaN or 2.5).
    """
    import os  # local import: the file itself only imports `listdir` from os

    out_dir = 'swapped_data_for_featurizing/' + stringname
    # Writing fails if the target folder is missing, so create it up front.
    os.makedirs(out_dir, exist_ok=True)

    n_files = len(filenames)
    index1 = 0
    for i in range(n_files):
        # pandas may hand back a float (e.g. 12.0) when the column contains
        # blanks; accept integral values only, never truncate silently.
        site = ads[i]
        index2 = int(site)
        if index2 != site:
            raise ValueError(
                'adsorption index for ' + stringname + '/' + str(i)
                + '.xyz is not an integer: ' + repr(site)
            )

        wire = read('data_xyz/' + stringname + '/' + str(i) + '.xyz')  # reading data from file
        # creating symmetric copies required for calculation of the
        # coordination number using Voronoi
        wire = wire.repeat((2, 2, 1))

        # swapping the atom symbols
        CS = wire.get_chemical_symbols()
        CS[index1], CS[index2] = CS[index2], CS[index1]
        wire.set_chemical_symbols(CS)

        # swapping atom positions (the fancy-indexed right-hand side is a
        # copy, so the two rows are exchanged safely)
        wire.positions[[index1, index2]] = wire.positions[[index2, index1]]

        # The swapped structure is identical for every repeat, so it is built
        # once and only the write is repeated.
        for k in range(repeat_no):
            wire.write(out_dir + '/' + str(k * n_files + i) + '.xyz')
            
# Run the atom swap for every compound, in the same order as before.
for _name, _files, _sites in (
    ('CoS', CoS_filenames, ads_CoS),
    ('CuS', CuS_filenames, ads_CuS),
    ('MnS', MnS_filenames, ads_MnS),
    ('MoS', MoS_filenames, ads_MoS),
    ('RuS', RuS_filenames, ads_RuS),
    ('ScS', ScS_filenames, ads_ScS),
    ('TiS', TiS_filenames, ads_TiS),
    ('VS', VS_filenames, ads_VS),
):
    swappingdata(_files, _sites, _name, repeat_no)

