Refs #8587 Refactored loading functions to be faster with large files.

mantidproject · Jan 7, 2014 · 1ae5eaf · 1ae5eaf
1 parent feca0cb
commit 1ae5eaf
Showing 1 changed file with 181 additions and 96 deletions.
diff --git a/...Mantid/Framework/PythonInterface/plugins/algorithms/WorkflowAlgorithms/DensityOfStates.py b/...Mantid/Framework/PythonInterface/plugins/algorithms/WorkflowAlgorithms/DensityOfStates.py
@@ -11,7 +11,7 @@
 import re
 import os.path
 import math
- 
+
 class DensityOfStates(PythonAlgorithm):
 
 		def PyInit(self):
@@ -236,17 +236,21 @@ def computeRaman(self, freqs, intensities, weights):
 			self.computeDos(freqs, crossSections, weights)
 
 		def readDataFromFile(self, fname):
+			'''
+			Select the appropriate file parser and check data was successfully 
+			loaded from file.
 
-			with open(fname, 'r') as fhandle:
-				content = fhandle.read()
-				
+			@param fname - path to the file.
+			@return tuple of the frequencies, ir and raman intensities and weights
+			'''
 			ext = os.path.splitext(fname)[1]
 			self.fileType = ext
 
 			if ext == '.phonon':
-				freqs, irIntens, ramanIntens, weights = self.readPhononFile(content)
+				freqs, irIntens, ramanIntens, weights = self.parsePhononFile(fname)
+
 			elif ext == '.castep':
-				freqs, irIntens, ramanIntens, weights = self.readCastepFile(content)
+				freqs, irIntens, ramanIntens, weights = self.parseCastepFile(fname)
 
 			if ( len(freqs) == 0 ):
 				raise ValueError("Failed to load any frequencies from file.")
@@ -256,127 +260,208 @@ def readDataFromFile(self, fname):
 
 			return freqs, irIntens, ramanIntens, weights
 
-		def readCastepFile(self, content):
+		def parseBlockHeader(self, header_match, block_count):
+			"""
+			Parse the header of a block of frequencies and intensities
+
+			@param header_match - the regex match to the header
+			@param block_count - the count of blocks found so far 
+			@return weight for this block of values
+			"""
+			#found header block at start of frequencies
+			q1, q2, q3, weight = map(float, header_match.groups())
+			if block_count > 1 and sum([q1,q2,q3]) == 0 and self.specType != 'DOS':
+				weight = 0.0
+			return weight
+
+		def parsePhononFileHeader(self, f_handle):
 			'''
-			Read frequencies from <>.castep file
+			Read information from the header of a <>.phonon file
 
-			@param content - string of the data read from file
-			@return the frequencies, infra red and raman intensities and weights of frequency blocks
+			@param f_handle - handle to the file.
+			@return tuple of the number of ions and branches in the file
 			'''
-			#regex patterns to extract data the header and data information from each block
 
-			#Header regex. Looks for lines in the following format:
-			# +  q-pt=    1 (  0.000000  0.000000  0.000000)     1.0000000000              +
-			headerRegexStr = r" +\+ +q-pt= +\d+ \( *(?: *(%(s)s)) *(%(s)s) *(%(s)s)\) +(%(s)s) +\+" % {'s' : self.fnumber}
+			while True:
+				line = f_handle.readline()
 
-			#Data regex. Looks for lines in the following format:
- 			# +     1      -0.051481   a          0.0000000  N            0.0000000  N     + 
-			dataRegexStr = r" +\+ +\d+ +(%(s)s)(?: +\w)? *((?: *%(s)s?(?: +[YN])?){0,2}) *\+"% {'s': self.fnumber}
+				if not line:
+					raise IOError("Could not find any header information.")
 
-			# Find all of the frequency data blocks, also splits text into header and data groups 
-			freqBlockRegexStr = r" \+ \-+ \+\s+( +\+ +q-pt= +\d+ \( *(?: *%(s)s) *%(s)s *%(s)s\) +%(s)s +\+)((?:[\+\w\-\d\s\<\>\=\(\)\*\/\\]|\-?%(s)s|Frequency irrep\.)*) +\+" % {'s': self.fnumber}
+				if 'Number of ions' in line:
+					num_ions = int(line.strip().split()[-1])
+				elif 'Number of branches' in line:
+					num_branches = int(line.strip().split()[-1])
 
-			return self.parseFrequencies(content, freqBlockRegexStr, headerRegexStr, dataRegexStr)
+				if 'END header' in line: 
+					return num_ions, num_branches
 
-		def readPhononFile(self, content):
+		def parsePhononFile(self, fname):
 			'''
-			Read frequencies from <>.phonon file
+			Read frequencies from a <>.phonon file
 
-			@param content - string of the data read from file
+			@param fname - file path of the file to read
 			@return the frequencies, infra red and raman intensities and weights of frequency blocks
 			'''
-			#regex patterns to extract data the header and data information from each block
-
+
 			#Header regex. Looks for lines in the following format:
 			#     q-pt=    1    0.000000  0.000000  0.000000      1.0000000000    0.000000  0.000000  1.000000
 			headerRegexStr = r"^ +q-pt=\s+\d+ +(%(s)s) +(%(s)s) +(%(s)s) (?: *(%(s)s)){0,4}" % {'s': self.fnumber}
+			headerRegex = re.compile(headerRegexStr)
 
-			#Data regex. Looks for lines in the following format:
-			#       1      -0.051481              0.0000000               0.0000000  
-			dataRegexStr = r"\s+\d+ +(%(s)s)((?: +%(s)s){0,2})"% {'s': self.fnumber}
+			eigenvectors_regex = re.compile(r"\s*Mode\s+Ion\s+X\s+Y\s+Z\s*")
 
-			# Find all of the frequency data blocks, also splits text into header and data groups 
-			freqBlockRegexStr = r"( +q-pt=\s+\d+ +%(s)s +%(s)s +%(s)s (?: *%(s)s){0,4})((?:\s|%(s)s)*)Phonon Eigenvectors" % {'s': self.fnumber}
+			freqs, ir_intensities, raman_intensities, weights = [], [], [], []
+			with open(fname, 'r') as f_handle:
+				num_ions, num_branches = self.parsePhononFileHeader(f_handle)
+				block_count = 0
+				prog_reporter = Progress(self,start=0.0,end=1.0, nreports=1)
+
+				while True:
+					line = f_handle.readline()
+					#check we've reached the end of file
+					if not line: break
+
+					header_match = headerRegex.match(line)
+					if header_match:
+						#found header block at start of frequencies
+						block_count+=1
+						weight = self.parseBlockHeader(header_match, block_count)
+						weights.append(weight)
+						prog_reporter.setNumSteps(block_count+1)
+
+						#parse block of frequencies
+						block_freqs, block_ir, block_raman = [], [], []
+						for _ in xrange(num_branches):
+							line = f_handle.readline()
+							line_data = line.strip().split()[1:]
+
+							data_lists = (block_freqs, block_ir, block_raman)
+							for data_list, data_item in zip(data_lists, line_data):
+								if data_item:
+									data_list.append(float(data_item))
+
+						freqs.append(block_freqs)
+						ir_intensities.append(block_ir)
+						raman_intensities.append(block_raman)
+
+						prog_reporter.report("Reading intensities.")
+
+					#skip over eigenvectors
+					vector_match = eigenvectors_regex.match(line)
+					if vector_match:
+						for _ in xrange(num_ions*num_branches):
+							line = f_handle.readline()
+							if not line:
+								raise IOError("Could not parse file. Invalid file format.")
+
+			freqs = np.asarray(freqs) 
+			ir_intensities = np.asarray(ir_intensities)
+			raman_intensities = np.asarray(raman_intensities)
+			warray = np.repeat(weights, len(freqs[0]))
 
-			return self.parseFrequencies(content, freqBlockRegexStr, headerRegexStr, dataRegexStr)
+			return freqs, ir_intensities, raman_intensities, warray
 
-		def parseFrequencies(self, content, blockRegex, headerRegexStr, dataRegexStr):
+		def parseCastepFileHeader(self, f_handle):
 			'''
-			Extract frequencies, infra-red and raman intensities from the contents of a file.
+			Read information from the header of a <>.castep file
 
-			@param content - string of the data read from file
-			@param freqBlockRegexStr - regex to split the frequncy blocks
-			@param headerRegexStr - regex to extract the header from the block
-			@param dataRegexStr - regex to extract a line of data from the block
-			@return the frequencies, infra red and raman intensities and weights of frequency blocks
+			@param f_handle - handle to the file.
+			@return tuple of the number of ions and branches in the file
 			'''
 
-			headerRegex = re.compile(headerRegexStr)
-			dataRegex = re.compile(dataRegexStr)
+			num_species, num_ions = 0,0
+			while True:
+				line = f_handle.readline()
 
-			freqBlocks = re.findall(blockRegex, content)
-			nFreqBlocks = len(freqBlocks)
+				if not line:
+					raise IOError("Could not find any header information.")
 
-			if(nFreqBlocks <= 0): 
-				raise Exception('Error parsing file. Failed to find any frequency blocks.')
+				if 'Total number of ions in cell =' in line:
+					num_ions = int(line.strip().split()[-1])
+				elif 'Total number of species in cell = ' in line:
+					num_species = int(line.strip().split()[-1])
 
-			freqs, ir_intensities, raman_intensities, weights = [], [], [], []
+				if num_species > 0 and num_ions > 0:
+					num_branches = num_species*num_ions 
+					return num_ions, num_branches
 
-			#iterate over each block of frequencies and intensities
-			for i, freqBlock in enumerate(freqBlocks):
-				header, data = freqBlock
+		def parseCastepFile(self, fname):
+			'''
+			Read frequencies from a <>.castep file
 
-				#extract data from header
-				freqHeaderData = headerRegex.match(header)
-				freqHeaderData = map (float, freqHeaderData.groups())
-				q1, q2, q3, weight = freqHeaderData
+			@param fname - file path of the file to read
+			@return the frequencies, infra red and raman intensities and weights of frequency blocks
+			'''
+			#Header regex. Looks for lines in the following format:
+			# +  q-pt=    1 (  0.000000  0.000000  0.000000)     1.0000000000              +
+			headerRegexStr = r" +\+ +q-pt= +\d+ \( *(?: *(%(s)s)) *(%(s)s) *(%(s)s)\) +(%(s)s) +\+" % {'s' : self.fnumber}
+			headerRegex = re.compile(headerRegexStr)
 
-				if i > 0 and sum([q1,q2,q3]) == 0 and self.specType != 'DOS':
-					weight = 0.0
-
-				#extract the frequency and intensity data from this block
-				block_freqs, block_ir, block_raman = [], [], []
-				for freq, intensities in dataRegex.findall(data):
-					block_freqs.append(freq)
-
-					#reformat intensities into list
-					intensities = intensities.strip()
-
-					if self.fileType == '.castep':
-						#if using IR active we need to check which intensities were active
-						if self.specType == 'IR Active':
-
-							intensities = intensities.split()
-							ir, active = intensities[:2]
-
-							if active == 'N':
-								intensities[0] = '0.0'
-
-							intensities = ' '.join(intensities)
-
-					intensities = re.sub('[YN]', '', intensities)
-					intensities = intensities.split()
-
-					#append any intensities found on this line to our arrays
-					for value, arr in zip(intensities, (block_ir, block_raman)):
-						arr.append(value.strip())
-
-				#convert to numpy arrays and convert to floats
-				block_freqs = np.array(block_freqs).astype(np.float)
-				block_ir = np.array(block_ir).astype(np.float)
-				block_raman = np.array(block_raman).astype(np.float)
-
-				#append any columns found to the existing arrays
-				for column, arr in zip((block_freqs, block_ir, block_raman), (freqs, ir_intensities, raman_intensities)):
-					if len(column) > 0:
-						arr.append(column)
+			#Data regex. Looks for lines in the following format:
+ 			# +     1      -0.051481   a          0.0000000  N            0.0000000  N     + 
+			dataRegexStr = r" +\+ +\d+ +(%(s)s)(?: +\w)? *(%(s)s)? *([YN])? *(%(s)s)? *([YN])? *\+"% {'s': self.fnumber}
+			dataRegex = re.compile(dataRegexStr)
 
-				weights.append(weight)
-
-			#convert weights into list of weights
+			freqs, ir_intensities, raman_intensities, weights = [], [], [], []
+			with open(fname, 'r') as f_handle:
+				num_ions, num_branches = self.parseCastepFileHeader(f_handle)
+				block_count = 0
+				prog_reporter = Progress(self,start=0.0,end=1.0, nreports=1)
+
+				while True:
+					line = f_handle.readline()
+					#check we've reached the end of file
+					if not line: break
+
+					header_match = headerRegex.match(line)
+					if header_match:
+						#found header block at start of frequencies
+						block_count+=1
+						weight = self.parseBlockHeader(header_match, block_count)
+						weights.append(weight)
+						prog_reporter.setNumSteps(block_count+1)
+
+						#move file pointer forward to start of intensity data
+						while True:
+							line = f_handle.readline()
+							if not line: 
+								raise IOError("Could not parse frequency block. Invalid file format.")
+							if dataRegex.match(line): break
+
+						#parse block of frequencies
+						block_freqs, block_ir, block_raman = [], [], []
+						for _ in xrange(num_branches):
+							line_data = line.strip().split()[1:-1]
+							freq = line_data[1]
+							intensity_data = line_data[3:]
+
+							#remove non-active intensities from data
+							intensities = []
+							for value, active in zip(intensity_data[::2], intensity_data[1::2]):
+								if active == 'N':	value = 0.0
+								intensities.append(value)
+
+							#append data to block lists
+							data_lists = (block_freqs, block_ir, block_raman)
+							for data_list, data_item in zip(data_lists, [freq] + intensities):
+								if data_item:
+									data_list.append(float(data_item))
+
+							line = f_handle.readline()
+
+						freqs.append(block_freqs)
+						ir_intensities.append(block_ir)
+						raman_intensities.append(block_raman)
+
+						prog_reporter.report("Reading intensities.")
+
+			freqs = np.asarray(freqs) 
+			ir_intensities = np.asarray(ir_intensities)
+			raman_intensities = np.asarray(raman_intensities)
 			warray = np.repeat(weights, len(freqs[0]))
-			
-			return np.array(freqs), np.array(ir_intensities), np.array(raman_intensities), np.array(warray)
+
+			return freqs, ir_intensities, raman_intensities, warray
 
 # Register algorithm with Mantid
 AlgorithmFactory.subscribe(DensityOfStates)