Skip to content

Commit

Permalink
Merge 5359459 into cdba5ee
Browse files Browse the repository at this point in the history
  • Loading branch information
njzjz committed Apr 13, 2019
2 parents cdba5ee + 5359459 commit c91b47c
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 38 deletions.
8 changes: 0 additions & 8 deletions .whitesource

This file was deleted.

50 changes: 35 additions & 15 deletions mddatasetbuilder/datasetbuilder.py
Expand Up @@ -47,14 +47,14 @@ def __init__(
dumpfilename="dump.reaxc", dataset_name="md", cutoff=5,
stepinterval=1, n_clusters=10000,
qmkeywords="%nproc=4\n#mn15/6-31g(d,p)", nproc=None, pbc=True,
fragment=True):
fragment=True, errorfilename=None, errorlimit=0.):
"""Init the builder."""
print(__doc__)
print(f"Author:{__author__} Email:{__email__}")
atomname = np.array(
atomname) if atomname else np.array(["C", "H", "O"])
self.crddetector = Detect.gettype('dump')(
filename=dumpfilename, atomname=atomname, pbc=pbc)
filename=dumpfilename, atomname=atomname, pbc=pbc, errorfilename=errorfilename, errorlimit=errorlimit)
if bondfilename is None:
self.bonddetector = self.crddetector
else:
Expand All @@ -77,6 +77,7 @@ def __init__(
symbol, atomic_numbers[symbol]**2.4/2), atomname))
self._nstructure = 0
self.bondtyperestore = {}
self.errorfilename = errorfilename

def builddataset(self, writegjf=True):
"""Build a dataset."""
Expand Down Expand Up @@ -114,8 +115,9 @@ def _readtimestepsbond(self):
with Pool(self.nproc, maxtasksperchild=10000) as pool:
semaphore = Semaphore(360)
results = pool.imap_unordered(
self.bonddetector.readatombondtype, self._produce(semaphore,
enumerate(self.lineiter(self.bonddetector)), None),
self.bonddetector.readatombondtype,
self._produce(semaphore, enumerate(zip(self.lineiter(self.bonddetector), self.erroriter(
)) if self.errorfilename is not None else self.lineiter(self.bonddetector)), (self.errorfilename is not None)),
100)
nstep = 0
for d, step in tqdm(
Expand Down Expand Up @@ -192,7 +194,7 @@ def _writestepmatrix(self, item):
(step, _), _ = item
results = []
if step in self.dstep:
step_atoms = self.crddetector.readcrd(item)
step_atoms, _ = self.crddetector.readcrd(item)
for atoma in self.dstep[step]:
# atom ID starts from 1
distances = step_atoms.get_distances(
Expand Down Expand Up @@ -323,7 +325,8 @@ def _writestepxyzfile(self, item):
results = 0
if step in self.dstep:
if len(lines) == 2:
step_atoms = self.crddetector.readcrd(((step, lines[0]), None))
step_atoms, _ = self.crddetector.readcrd(
((step, lines[0]), None))
molecules = self.bonddetector.readmolecule(lines[1])
else:
molecules, step_atoms = self.bonddetector.readmolecule(lines)
Expand Down Expand Up @@ -405,20 +408,31 @@ def bytestolist(cls, x):
return pickle.loads(cls._decompress(x, isbytes=True))

def lineiter(self, detector):
f = open(detector.filename)
it = itertools.islice(itertools.zip_longest(
*[f] * detector.steplinenum), 0, None, self.stepinterval)
for line in it:
yield line
f.close()
fns = [detector.filename] if isinstance(
detector.filename, str) else detector.filename
for fn in fns:
with open(fn) as f:
it = itertools.islice(itertools.zip_longest(
*[f] * detector.steplinenum), 0, None, self.stepinterval)
for line in it:
yield line

def erroriter(self):
fns = [self.errorfilename] if isinstance(
self.errorfilename, str) else self.errorfilename
for fn in fns:
with open(fn) as f:
it = itertools.islice(f, 1, None)
for line in it:
yield line


def _commandline():
parser = argparse.ArgumentParser(description='MDDatasetBuilder')
parser.add_argument('-d', '--dumpfile',
parser.add_argument('-d', '--dumpfile', nargs='*',
help='Input dump file, e.g. dump.reaxc', required=True)
parser.add_argument(
'-b', '--bondfile', help='Input bond file, e.g. bonds.reaxc')
'-b', '--bondfile', nargs='*', help='Input bond file, e.g. bonds.reaxc')
parser.add_argument('-a', '--atomname',
help='Atomic names in the trajectory, e.g. C H O',
nargs='*', required=True)
Expand All @@ -439,9 +453,15 @@ def _commandline():
default="%nproc=4\n#mn15/6-31g**")
parser.add_argument(
'-n', '--name', help='Dataset name (default is md)', default="md")
parser.add_argument(
'--errorfile', help='Error file generated by modified DeePMD', nargs='*')
parser.add_argument(
'-e', '--errorlimit', help='Error Limit', type=float, default=0.)
args = parser.parse_args()
DatasetBuilder(
atomname=args.atomname, bondfilename=args.bondfile,
dumpfilename=args.dumpfile, dataset_name=args.name, cutoff=args.cutoff,
stepinterval=args.interval, n_clusters=args.size,
qmkeywords=args.qmkeywords, nproc=args.nproc).builddataset()
qmkeywords=args.qmkeywords, nproc=args.nproc,
errorfilename=args.errorfile, errorlimit=args.errorlimit
).builddataset()
39 changes: 24 additions & 15 deletions mddatasetbuilder/detect.py
Expand Up @@ -11,10 +11,12 @@


class Detect(metaclass=ABCMeta):
def __init__(self, filename, atomname, pbc):
def __init__(self, filename, atomname, pbc, errorlimit=None, errorfilename=None):
self.filename = filename
self.atomname = atomname
self.pbc = pbc
self.errorlimit = errorlimit
self.errorfilename = errorfilename
self.steplinenum = self._readN()

@abstractmethod
Expand Down Expand Up @@ -49,7 +51,7 @@ class DetectBond(Detect):
def _readN(self):
"""Read bondfile N, which should be at very beginning."""
# copy from reacnetgenerator on 2018-12-15
with open(self.filename) as f:
with open(self.filename if isinstance(self.filename, str) else self.filename[0]) as f:
iscompleted = False
for index, line in enumerate(f):
if line.startswith("#"):
Expand Down Expand Up @@ -102,7 +104,7 @@ class DetectDump(Detect):
def _readN(self):
# copy from reacnetgenerator on 2018-12-15
iscompleted = False
with open(self.filename) as f:
with open(self.filename if isinstance(self.filename, str) else self.filename[0]) as f:
for index, line in enumerate(f):
if line.startswith("ITEM:"):
linecontent = self.LineType.linecontent(line)
Expand All @@ -126,18 +128,25 @@ def _readN(self):
return steplinenum

def readatombondtype(self, item):
(step, _), _ = item
(step, lines), needlerror = item
if needlerror:
trajline, errorline = lines
item = (step, trajline), None
lerror = np.fromstring(errorline, dtype=float, sep=' ')[7:]
d = defaultdict(list)
step_atoms = self.readcrd(item)
step_atoms, ids = self.readcrd(item)
if needlerror:
lerror = [x for (y, x) in sorted(zip(ids, lerror))]
level = self._crd2bond(step_atoms, readlevel=True)
for i, (n, l) in enumerate(zip(self.atomnames, level)):
# Note that atom id starts from 1
d[pickle.dumps((n, sorted(l)))].append(i+1)
if not needlerror or lerror[i] > self.errorlimit:
# Note that atom id starts from 1
d[pickle.dumps((n, sorted(l)))].append(i+1)
return d, step

def readmolecule(self, lines):
bond = [None]*self._N
step_atoms = self.readcrd(((None, lines), None))
step_atoms, _ = self.readcrd(((None, lines), None))
bond = self._crd2bond(step_atoms, readlevel=False)
molecules = connectmolecule(bond)
# return atoms as well
Expand Down Expand Up @@ -184,25 +193,25 @@ def readcrd(self, item):
(_, lines), _ = item
boxsize = []
step_atoms = []
ids = []
for line in lines:
if line:
if line.startswith("ITEM:"):
linecontent = self.LineType.linecontent(line)
else:
if linecontent == self.LineType.ATOMS:
s = line.split()
step_atoms.append(
(int(s[0]),
Atom(
self.atomname[int(s[1]) - 1],
tuple(map(float, s[2: 5])))))
ids.append(int(s[0]))
step_atoms.append(Atom(
self.atomname[int(s[1]) - 1],
tuple(map(float, s[2: 5]))))
elif linecontent == self.LineType.BOX:
s = line.split()
boxsize.append(float(s[1])-float(s[0]))
# sort by ID
_, step_atoms = zip(*sorted(step_atoms, key=lambda a: a[0]))
step_atoms = [x for (y, x) in sorted(zip(ids, step_atoms))]
step_atoms = Atoms(step_atoms, cell=boxsize, pbc=self.pbc)
return step_atoms
return step_atoms, ids

class LineType(Enum):
"""Line type in the LAMMPS dump files."""
Expand Down

0 comments on commit c91b47c

Please sign in to comment.