Merge dda6016 into f967ad8
njzjz committed Apr 13, 2019
2 parents f967ad8 + dda6016 commit 99992fe
Showing 7 changed files with 363 additions and 228 deletions.
17 changes: 8 additions & 9 deletions .travis.yml
@@ -1,11 +1,10 @@
language: python
python: 3.7
dist: xenial
sudo: true
cache: pip

language: c
os: linux
env: PYTHON_VERSION=3.7 PIP_DEPENDENCIES="tox-conda tox-travis" CONDA_CHANNEL_PRIORITY=True CONDA_CHANNELS="conda-forge"
cache:
directories:
- $HOME/.cache/pip
install:
- pip install tox-travis

- git clone --depth 1 git://github.com/astropy/ci-helpers.git && source ci-helpers/travis/setup_conda.sh
script:
- tox
- tox -e py37
16 changes: 0 additions & 16 deletions appveyor.yml

This file was deleted.

241 changes: 69 additions & 172 deletions mddatasetbuilder/datasetbuilder.py
@@ -22,15 +22,14 @@
import numpy as np
import pybase64
import lz4.frame
from ase import Atom, Atoms
from ase.data import atomic_numbers
from ase.io import write as write_xyz
from pkg_resources import DistributionNotFound, get_distribution
from sklearn import preprocessing
from sklearn.cluster import MiniBatchKMeans
from tqdm import tqdm

from .dps import dps as connectmolecule
from .detect import Detect

try:
__version__ = get_distribution(__name__).version
@@ -39,25 +38,31 @@
__version__ = ''


class DatasetBuilder(object):
class DatasetBuilder:
"""Dataset Builder."""

def __init__(
self, atomname=None,
clusteratom=None, bondfilename="bonds.reaxc",
clusteratom=None, bondfilename=None,
dumpfilename="dump.reaxc", dataset_name="md", cutoff=5,
stepinterval=1, n_clusters=10000,
qmkeywords="%nproc=4\n#mn15/6-31g(d,p)", nproc=None, pbc=True,
fragment=True):
"""Init the builder."""
print(__doc__)
print(f"Author:{__author__} Email:{__email__}")
self.dumpfilename = dumpfilename
self.bondfilename = bondfilename
atomname = np.array(
atomname) if atomname else np.array(["C", "H", "O"])
self.crddetector = Detect.gettype('dump')(
filename=dumpfilename, atomname=atomname, pbc=pbc)
if bondfilename is None:
self.bonddetector = self.crddetector
else:
self.bonddetector = Detect.gettype('bond')(
filename=bondfilename, atomname=atomname, pbc=pbc)

self.dataset_dir = f"dataset_{dataset_name}"
self.xyzfilename = dataset_name
self.atomname = np.array(
atomname) if atomname else np.array(["C", "H", "O"])
self.clusteratom = clusteratom if clusteratom else atomname
self.atombondtype = []
self.stepinterval = stepinterval
@@ -67,12 +72,10 @@ def __init__(
self.writegjf = True
self.gjfdir = f'{self.dataset_dir}_gjf'
self.qmkeywords = qmkeywords
self.pbc = pbc
self.fragment = fragment
self._coulumbdiag = dict(map(lambda symbol: (
symbol, atomic_numbers[symbol]**2.4/2), atomname))
self._nstructure = 0
self.atomnames = None
self.bondtyperestore = {}
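
With bondfilename now defaulting to None, the bond detector simply falls back to the dump detector, so a builder can be driven from a dump file alone. A minimal usage sketch under that assumption (the import path and the builddataset call follow the signatures visible in this diff; everything else is left at its defaults):

from mddatasetbuilder.datasetbuilder import DatasetBuilder

# Bonds are inferred from the dump file because bondfilename is omitted.
builder = DatasetBuilder(
    atomname=["C", "H", "O"],       # species order matching the dump atom types
    dumpfilename="dump.reaxc",
    dataset_name="md",
)
builder.builddataset(writegjf=True)  # also write Gaussian input files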

def builddataset(self, writegjf=True):
@@ -84,7 +87,6 @@ def builddataset(self, writegjf=True):
if runstep == 0:
self._readtimestepsbond()
elif runstep == 1:
self.steplinenum = self._readlammpscrdN()
with open(os.path.join(self.trajatom_dir, 'chooseatoms'), 'wb') as f:
for bondtype in self.atombondtype:
self._writecoulumbmatrix(bondtype, f)
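
The _coulumbdiag table built in __init__ precomputes 0.5 * Z**2.4, the diagonal of the standard Coulomb-matrix descriptor that _writecoulumbmatrix uses to fingerprint the selected atoms before MiniBatchKMeans clustering. A self-contained sketch of that descriptor for a whole toy molecule (illustrative geometry; the builder itself works per selected atom, apparently restricted to its cutoff neighbourhood):

import itertools
import numpy as np
from ase.data import atomic_numbers

def coulomb_matrix(symbols, positions):
    # Standard Coulomb matrix: 0.5 * Z_i**2.4 on the diagonal (the values
    # cached in _coulumbdiag), Z_i * Z_j / |r_i - r_j| off the diagonal.
    z = np.array([atomic_numbers[s] for s in symbols], dtype=float)
    r = np.asarray(positions, dtype=float)
    n = len(z)
    m = np.empty((n, n))
    for i, j in itertools.product(range(n), repeat=2):
        if i == j:
            m[i, j] = 0.5 * z[i] ** 2.4
        else:
            m[i, j] = z[i] * z[j] / np.linalg.norm(r[i] - r[j])
    return m

# Toy water-like geometry (coordinates in angstroms, purely illustrative).
print(coulomb_matrix(["O", "H", "H"],
                     [[0.00, 0.00, 0.0], [0.96, 0.00, 0.0], [-0.24, 0.93, 0.0]]))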
@@ -105,30 +107,34 @@ def _produce(cls, semaphore, producelist, parameter):
semaphore.acquire()
yield item, parameter

def _readlammpscrdstep(self, item):
(_, lines), _ = item
boxsize = []
step_atoms = []
for line in lines:
if line:
if line.startswith("ITEM:"):
linecontent = 4 if line.startswith("ITEM: TIMESTEP") else (3 if line.startswith(
"ITEM: ATOMS") else (1 if line.startswith("ITEM: NUMBER OF ATOMS") else 2))
else:
if linecontent == 3:
s = line.split()
step_atoms.append(
(int(s[0]),
Atom(
self.atomname[int(s[1]) - 1],
tuple(map(float, s[2: 5])))))
elif linecontent == 2:
s = line.split()
boxsize.append(float(s[1])-float(s[0]))
# sort by ID
_, step_atoms = zip(*sorted(step_atoms, key=lambda a: a[0]))
step_atoms = Atoms(step_atoms, cell=boxsize, pbc=self.pbc)
return step_atoms
def _readtimestepsbond(self):
# added on 2018-12-15
stepatomfiles = {}
self._mkdir(self.trajatom_dir)
with Pool(self.nproc, maxtasksperchild=10000) as pool:
semaphore = Semaphore(360)
results = pool.imap_unordered(
self.bonddetector.readatombondtype, self._produce(semaphore,
enumerate(self.lineiter(self.bonddetector)), None),
100)
nstep = 0
for d, step in tqdm(
results, desc="Read trajectory", unit="timestep"):
for bondtypebytes, atomids in d.items():
bondtype = self._bondtype(bondtypebytes)
if bondtype not in self.atombondtype:
self.atombondtype.append(bondtype)
stepatomfiles[bondtype] = open(os.path.join(
self.trajatom_dir, f'stepatom.{bondtype}'), 'wb')
stepatomfiles[bondtype].write(
self.listtobytes([step, atomids]))
semaphore.release()
nstep += 1
pool.close()
self._nstep = nstep
for stepatomfile in stepatomfiles.values():
stepatomfile.close()
pool.join()
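
The rewritten _readtimestepsbond keeps the producer-throttling pattern used throughout the class: _produce acquires the semaphore before yielding each timestep chunk into imap_unordered, and the consuming loop releases it after handling the corresponding result, so only a bounded number of chunks is ever buffered in the pool's task queue. A stand-alone sketch of that pattern with dummy work (all names and sizes here are illustrative):

from multiprocessing import Pool, Semaphore

def work(item):
    index, payload = item
    return index, payload * 2          # stand-in for parsing one timestep

def produce(semaphore, iterable):
    for item in iterable:
        semaphore.acquire()            # block while too many items are in flight
        yield item

if __name__ == "__main__":
    semaphore = Semaphore(360)         # same bound the builder uses
    items = enumerate(range(10000))    # stand-in for per-timestep line chunks
    with Pool(4) as pool:
        results = pool.imap_unordered(work, produce(semaphore, items), 100)
        for index, doubled in results:
            semaphore.release()        # free one slot per consumed result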

def _writecoulumbmatrix(self, trajatomfilename, fc):
self.dstep = {}
@@ -143,18 +149,11 @@ def _writecoulumbmatrix(self, trajatomfilename, fc):
stepatom = np.zeros((n_atoms, 2), dtype=int)
feedvector = np.zeros((n_atoms, 0))
vector_elements = defaultdict(list)
with open(self.dumpfilename) as f, Pool(self.nproc, maxtasksperchild=10000) as pool:
with Pool(self.nproc, maxtasksperchild=10000) as pool:
semaphore = Semaphore(360)
results = pool.imap_unordered(
self._writestepmatrix, self._produce(
semaphore,
enumerate(
itertools.islice(
itertools.zip_longest(
*[f] * self.steplinenum),
0, None, self.stepinterval)),
None),
100)
self._writestepmatrix, self._produce(semaphore,
enumerate(self.lineiter(self.crddetector)), None), 100)
j = 0
for result in tqdm(
results, desc=trajatomfilename, total=self._nstep,
@@ -193,7 +192,7 @@ def _writestepmatrix(self, item):
(step, _), _ = item
results = []
if step in self.dstep:
step_atoms = self._readlammpscrdstep(item)
step_atoms = self.crddetector.readcrd(item)
for atoma in self.dstep[step]:
# atom ID starts from 1
distances = step_atoms.get_distances(
@@ -250,7 +249,7 @@ def _mkdir(cls, path):

def _writexyzfiles(self):
self.dstep = defaultdict(list)
with open(os.path.join(self.trajatom_dir, "chooseatoms"), 'rb') as fc, open(self.dumpfilename) as f, open(self.bondfilename) as fb, Pool(self.nproc, maxtasksperchild=10000) as pool, tqdm(desc="Write structures", unit="structure", total=self._nstructure) as pbar:
with open(os.path.join(self.trajatom_dir, "chooseatoms"), 'rb') as fc, Pool(self.nproc, maxtasksperchild=10000) as pool, tqdm(desc="Write structures", unit="structure", total=self._nstructure) as pbar:
semaphore = Semaphore(360)
typecounter = Counter()
for typefile, trajatomfilename in zip(fc, self.atombondtype):
@@ -271,21 +270,14 @@ def _writexyzfiles(self):
if self.writegjf:
for folder in foldernames:
self._mkdir(os.path.join(self.gjfdir, folder))
results = pool.imap_unordered(
self._writestepxyzfile, self._produce(
semaphore,
enumerate(
zip(
itertools.islice(
itertools.zip_longest(
*[f] * self.steplinenum),
0, None, self.stepinterval),
itertools.islice(
itertools.zip_longest(
*[fb] * self.bondsteplinenum),
0, None, self.stepinterval))),
None),
100)
crditer = self.lineiter(self.crddetector)
if self.crddetector is self.bonddetector:
lineiter = crditer
else:
bonditer = self.lineiter(self.bonddetector)
lineiter = zip(crditer, bonditer)
results = pool.imap_unordered(self._writestepxyzfile, self._produce(
semaphore, enumerate(lineiter), None), 100)
for result in results:
pbar.update(result)
semaphore.release()
@@ -327,11 +319,14 @@ def _convertgjf(self, gjffilename, takenatomidindex, atoms_whole):
f.write('\n'.join(buff))

def _writestepxyzfile(self, item):
(step, (dumplines, bondlines)), _ = item
(step, lines), _ = item
results = 0
if step in self.dstep:
step_atoms = self._readlammpscrdstep(((step, dumplines), None))
molecules = self._readlammpsbondstepmolecules(bondlines)
if len(lines) == 2:
step_atoms = self.crddetector.readcrd(((step, lines[0]), None))
molecules = self.bonddetector.readmolecule(lines[1])
else:
molecules, step_atoms = self.bonddetector.readmolecule(lines)
for atoma, trajatomfilename, itype, itotal in self.dstep[step]:
# update counter
folder = str(itotal//1000).zfill(self.foldermaxlength)
@@ -370,111 +365,6 @@ def _writestepxyzfile(self, item):
results += 1
return results

def _readlammpsbondN(self, f):
# copy from reacnetgenerator on 2018-12-15
iscompleted = False
for index, line in enumerate(f):
if line.startswith("#"):
if line.startswith("# Number of particles"):
if iscompleted:
stepbindex = index
break
else:
iscompleted = True
stepaindex = index
N = [int(s) for s in line.split() if s.isdigit()][0]
atomtype = np.zeros(N, dtype=np.int)
else:
s = line.split()
atomtype[int(s[0])-1] = int(s[1])
steplinenum = stepbindex-stepaindex
self._N = N
self.atomtype = atomtype
self.atomnames = self.atomname[self.atomtype-1]
return steplinenum

def _readlammpscrdN(self):
# copy from reacnetgenerator on 2018-12-15
with open(self.dumpfilename) as f:
iscompleted = False
for index, line in enumerate(f):
if line.startswith("ITEM:"):
linecontent = 4 if line.startswith("ITEM: TIMESTEP") else (3 if line.startswith(
"ITEM: ATOMS") else (1 if line.startswith("ITEM: NUMBER OF ATOMS") else 2))
else:
if linecontent == 1:
if iscompleted:
stepbindex = index
break
else:
iscompleted = True
stepaindex = index
steplinenum = stepbindex-stepaindex
return steplinenum

def _readlammpsbondstepmolecules(self, lines):
# copy from reacnetgenerator on 2018-12-15
bond = [None]*self._N
for line in lines:
if line:
if not line.startswith("#"):
s = line.split()
bond[int(s[0])-1] = map(lambda x: int(x) -
1, s[3:3+int(s[2])])
molecules = connectmolecule(bond)
return molecules

def _readlammpsbondstep(self, item):
# copy from reacnetgenerator on 2018-12-15
(step, lines), _ = item
d = defaultdict(list)
for line in lines:
if line:
if line[0] != "#":
s = line.split()
atombond = sorted(
map(lambda x: max(1, round(float(x))), s[4 + int(s[2]): 4 + 2 * int(s[2])]))
d[pickle.dumps((self.atomnames[int(s[0]) - 1],
atombond))].append(int(s[0]))
return d, step

def _readtimestepsbond(self):
# added on 2018-12-15
stepatomfiles = {}
self._mkdir(self.trajatom_dir)
with open(self.bondfilename) as f, Pool(self.nproc, maxtasksperchild=10000) as pool:
self.bondsteplinenum = self._readlammpsbondN(f)
f.seek(0)
semaphore = Semaphore(360)
results = pool.imap_unordered(
self._readlammpsbondstep, self._produce(
semaphore,
enumerate(
itertools.islice(
itertools.zip_longest(
*[f] * self.bondsteplinenum),
0, None, self.stepinterval)),
None),
100)
nstep = 0
for d, step in tqdm(
results, desc="Read trajectory", unit="timestep"):
for bondtypebytes, atomids in d.items():
bondtype = self._bondtype(bondtypebytes)
if bondtype not in self.atombondtype:
self.atombondtype.append(bondtype)
stepatomfiles[bondtype] = open(os.path.join(
self.trajatom_dir, f'stepatom.{bondtype}'), 'wb')
stepatomfiles[bondtype].write(
self.listtobytes([step, atomids]))
semaphore.release()
nstep += 1
pool.close()
self._nstep = nstep
for stepatomfile in stepatomfiles.values():
stepatomfile.close()
pool.join()

def _bondtype(self, typebytes):
if typebytes in self.bondtyperestore:
return self.bondtyperestore[typebytes]
@@ -514,14 +404,21 @@ def listtobytes(cls, x):
def bytestolist(cls, x):
return pickle.loads(cls._decompress(x, isbytes=True))
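
listtobytes and bytestolist round-trip Python objects through pickle and the _compress/_decompress helpers, which lie outside this hunk; the imports at the top of the file (lz4.frame plus pybase64) suggest lz4 compression with base64 framing. A rough, hypothetical equivalent of that round trip (not the project's actual helpers):

import pickle
import lz4.frame
import pybase64

def listtobytes_sketch(obj):
    # pickle -> lz4 -> base64, newline-terminated so records can be appended
    # to a file one per line.
    return pybase64.b64encode(lz4.frame.compress(pickle.dumps(obj))) + b"\n"

def bytestolist_sketch(line):
    return pickle.loads(lz4.frame.decompress(pybase64.b64decode(line.strip())))

assert bytestolist_sketch(listtobytes_sketch([3, [0, 5, 7]])) == [3, [0, 5, 7]]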

def lineiter(self, detector):
f = open(detector.filename)
it = itertools.islice(itertools.zip_longest(
*[f] * detector.steplinenum), 0, None, self.stepinterval)
for line in it:
yield line
f.close()
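
lineiter wraps the two idioms the old inline readers repeated: zip_longest(*[f] * steplinenum) groups the file into fixed-size per-timestep chunks, and islice(..., 0, None, stepinterval) keeps every stepinterval-th chunk. A toy demonstration of the same pattern on an in-memory file (frame layout purely illustrative):

import itertools
from io import StringIO

# Four "timesteps", three lines each (steplinenum = 3).
f = StringIO("".join(f"frame{i} line{j}\n" for i in range(4) for j in range(3)))
steplinenum, stepinterval = 3, 2

# zip_longest(*[f] * n) pulls n consecutive lines per iteration (the "grouper"
# idiom); islice with a step of stepinterval then skips whole timesteps.
frames = itertools.islice(
    itertools.zip_longest(*[f] * steplinenum), 0, None, stepinterval)
for lines in frames:
    print([line.strip() for line in lines])
# ['frame0 line0', 'frame0 line1', 'frame0 line2']
# ['frame2 line0', 'frame2 line1', 'frame2 line2']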


def _commandline():
parser = argparse.ArgumentParser(description='MDDatasetBuilder')
parser.add_argument('-d', '--dumpfile',
help='Input dump file, e.g. dump.reaxc', required=True)
parser.add_argument(
'-b', '--bondfile', help='Input bond file, e.g. bonds.reaxc',
required=True)
'-b', '--bondfile', help='Input bond file, e.g. bonds.reaxc')
parser.add_argument('-a', '--atomname',
help='Atomic names in the trajectory, e.g. C H O',
nargs='*', required=True)
