In [2]:
import pandas as pd
import numpy as np


def dmp_to_dat(dmp, dat, tot_lns, lns_per_time, bad_lns, cols, tsteps_on=False, first_tstep=None, last_tstep=None,
               tstep_size=None):
    """Extract the data lines of a framed dump file into a space-separated table.

    The dump is treated as consecutive frames of ``lns_per_time`` lines. The
    first ``bad_lns`` lines of every frame are skipped; the remaining lines are
    parsed as space-separated values.

    Parameters
    ----------
    dmp : str or path-like
        Dump file to read.
    dat : str or path-like
        Destination of the resulting table (space-separated, no index column).
    tot_lns : int
        Upper bound used when building the skip list: only line indices below
        ``tot_lns - 1`` are candidates for skipping.
    lns_per_time : int
        Number of lines in one frame.
    bad_lns : int
        Number of leading lines to drop from every frame.
    cols : dict
        Maps the position of a column in the file to the name it gets in the
        table.
    tsteps_on : bool, optional
        When true, a generated ``"timestep"`` column is placed first.
    first_tstep, last_tstep, tstep_size : number, optional
        First value, last value and spacing of the generated timesteps.
        Required when ``tsteps_on`` is true.

    Returns
    -------
    pandas.DataFrame
        The table that was written to ``dat``.

    Raises
    ------
    TypeError
        If ``tsteps_on`` is true and any timestep argument is ``None``.
    ValueError
        Raised by pandas if the number of generated timesteps differs from the
        number of rows read.
    """
    if tsteps_on and any(arg is None for arg in (first_tstep, last_tstep, tstep_size)):
        # Fail before any I/O with a message that names the missing arguments.
        raise TypeError(
            "dmp_to_dat() requires first_tstep, last_tstep and tstep_size when tsteps_on=True"
        )

    # A line is skipped when its position inside its frame is one of the first
    # `bad_lns` positions; range membership avoids rebuilding a list per line.
    frame_header = range(bad_lns)
    toskip = [idx for idx in range(tot_lns - 1) if idx % lns_per_time in frame_header]

    data = pd.read_csv(dmp, sep=" ", skiprows=toskip, usecols=list(cols.keys()),
                       names=list(cols.values()), engine='python')

    if tsteps_on:
        # Evenly spaced timesteps from first to last, rounded to integers.
        n_steps = int((last_tstep - first_tstep) / tstep_size) + 1
        data["timestep"] = np.rint(np.linspace(first_tstep, last_tstep, n_steps)).astype(np.int64)
        # Select "timestep" by name so it ends up first even when a column of
        # that name was already present (and therefore not appended last).
        data = data[["timestep"] + [name for name in data.columns if name != "timestep"]]

    # Write the table to the output file and hand it back to the caller.
    data.to_csv(dat, sep=' ', index=False)
    return data


def thermo_to_dat(thermo, dat, header_len, last_ln, tot_len, cols, first_tstep, last_tstep, tstep_size):
    """Extract a fixed-width thermo table from a log file into a space-separated table.

    Parameters
    ----------
    thermo : str or path-like
        Log file to read; the table is parsed as fixed-width text.
    dat : str or path-like
        Destination of the resulting table (space-separated, no index column).
    header_len : int
        Number of lines to skip at the top of the file.
    last_ln, tot_len : int
        Together they determine the footer: the last ``tot_len - last_ln - 1``
        lines of the file are dropped.
    cols : dict
        Maps the position of a column in the file to the name it gets in the
        table. All selected columns are read as ``float``.
    first_tstep, last_tstep, tstep_size : number
        First value, last value and spacing of the generated ``"timestep"``
        column, which is placed first in the table.

    Returns
    -------
    pandas.DataFrame
        The table that was written to ``dat``.

    Raises
    ------
    ValueError
        Raised by pandas if the number of generated timesteps differs from the
        number of rows read.
    """
    # Read only the table body: skip the header lines and the trailing footer.
    data = pd.read_fwf(
        thermo,
        skiprows=header_len,
        skipfooter=tot_len - last_ln - 1,
        usecols=list(cols.keys()),
        names=list(cols.values()),
        engine='python',
        dtype=float,
    )

    # Evenly spaced timesteps from first to last, rounded to integers.
    n_steps = int((last_tstep - first_tstep) / tstep_size) + 1
    data["timestep"] = np.rint(np.linspace(first_tstep, last_tstep, n_steps)).astype(np.int64)
    # Select "timestep" by name so it ends up first even when a column of that
    # name was already present (and therefore not appended last).
    data = data[["timestep"] + [name for name in data.columns if name != "timestep"]]

    # Write the table to the output file and hand it back to the caller.
    data.to_csv(dat, sep=' ', index=False)
    return data

In [4]:
def main():
    """Convert the ebind14 coordinate dump and thermo log to .dat tables and print both."""
    # Both conversions use the same timestep grid.
    tstep_grid = dict(first_tstep=10000, last_tstep=5010000, tstep_size=5000)

    # dmp to dat
    coords_table = dmp_to_dat(
        "ebind/ebind14/coords.dmp",
        "ebind/data/ebind14_coords.dat",
        tot_lns=10011,
        lns_per_time=10,
        bad_lns=9,
        cols={0: "c_coords"},
        tsteps_on=True,
        **tstep_grid,
    )
    print("coords", coords_table)

    # thermo to dat
    thermo_columns = {
        1: "Temp",
        2: "E_pair",
        3: "E_mol",
        4: "TotEng",
        5: "Press",
    }
    thermo_table = thermo_to_dat(
        "ebind/ebind14/log.lammps",
        "ebind/data/ebind14_thermo.dat",
        header_len=229,
        last_ln=1230,
        tot_len=1262,
        cols=thermo_columns,
        **tstep_grid,
    )
    print("thermo", thermo_table)

# Entry point: run both conversions when this code is executed as the main program.
if __name__ == "__main__":
    main()

coords       timestep  c_coords
0        10000        50
1        15000        73
2        20000        90
3        25000       103
4        30000       121
...        ...       ...
996    4990000       390
997    4995000       390
998    5000000       390
999    5005000       390
1000   5010000       390

[1001 rows x 2 columns]
thermo       timestep       Temp      E_pair       E_mol      TotEng      Press
0        10000  141.98708  -945.44873   636.40922   960.67007  -1.286340
1        15000  300.74389 -1181.19350  1262.98190  2771.16960  16.769213
2        20000  296.74201 -1432.72890  1271.81350  2492.67940  -1.149765
3        25000  300.13820 -1604.21530  1275.51340  2355.26310   1.778833
4        30000  297.96761 -1870.53680  1244.38890  2038.40660 -13.339153
...        ...        ...         ...         ...         ...        ...
996    4990000  296.77043 -5924.87910  1158.54930 -2112.48090  -2.658935
997    4995000  296.25498 -5927.44690  1164.50850 -2113.69890  31.503354
998 