Skip to content

Commit

Permalink
Added regular expression support to MagicDetector and MimeTypesReader…
Browse files Browse the repository at this point in the history
… for PDFs
  • Loading branch information
pmay committed Dec 5, 2011
1 parent 5b87b7f commit b8de9e7
Show file tree
Hide file tree
Showing 3 changed files with 150 additions and 30 deletions.
115 changes: 106 additions & 9 deletions tika-core/src/main/java/org/apache/tika/detect/MagicDetector.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,11 @@

import java.io.IOException;
import java.io.InputStream;
import java.nio.ByteBuffer;
import java.nio.CharBuffer;
import java.nio.charset.Charset;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
Expand All @@ -41,6 +46,11 @@ public class MagicDetector implements Detector {
*/
private final int length;

/**
* PSM: The pattern type. The value "regex" selects regular expression
* matching; any other value selects masked byte-wise pattern matching.
*/
private final String patternType;

/**
* The magic match pattern. If this byte pattern is equal to the
* possibly bit-masked bytes from the input stream, then the type
Expand Down Expand Up @@ -95,6 +105,70 @@ public MagicDetector(MediaType type, byte[] pattern, int offset) {
this(type, pattern, null, offset, offset);
}

/**
 * Creates a detector for input documents that meet the specified magic
 * match, additionally recording the pattern type so that regular
 * expression patterns can be told apart from plain byte patterns.
 *
 * @param type matching media type; must not be null
 * @param patternType pattern type; "regex" marks the pattern as a regular
 *        expression. A null value is stored as the empty string
 *        (i.e. not a regular expression)
 * @param pattern magic match pattern bytes; must not be null
 * @param mask bit mask applied to the pattern, or null for no masking.
 *        If it is shorter than the pattern, the missing positions are
 *        treated as fully set
 * @param offsetRangeBegin first offset of the range in which the pattern
 *        may start; must not be negative
 * @param offsetRangeEnd last offset of the range in which the pattern
 *        may start; must not be less than offsetRangeBegin
 * @throws IllegalArgumentException if the type or pattern is null, or if
 *         the offset range is invalid
 */
public MagicDetector(
        MediaType type, String patternType, byte[] pattern, byte[] mask,
        int offsetRangeBegin, int offsetRangeEnd) {
    if (type == null) {
        throw new IllegalArgumentException("Matching media type is null");
    } else if (pattern == null) {
        throw new IllegalArgumentException("Magic match pattern is null");
    } else if (offsetRangeBegin < 0
            || offsetRangeEnd < offsetRangeBegin) {
        throw new IllegalArgumentException(
                "Invalid offset range: ["
                + offsetRangeBegin + "," + offsetRangeEnd + "]");
    }

    this.type = type;

    // The length of a regular expression does not reflect how many input
    // bytes it may need to look at. Using a fixed 8K buffer for "regex"
    // patterns was tried, but it currently causes test failures, so the
    // length is derived from the pattern and mask for every pattern type.
    this.length = Math.max(pattern.length, mask != null ? mask.length : 0);

    this.mask = new byte[length];
    this.pattern = new byte[length];

    // Never store null: the pattern type is later compared with
    // equals("regex"), which would otherwise fail with a
    // NullPointerException. The empty string matches what the
    // constructor without a pattern type stores.
    this.patternType = patternType != null ? patternType : "";

    for (int i = 0; i < length; i++) {
        // Positions not covered by the supplied mask keep all bits (-1 == 0xFF)
        if (mask != null && i < mask.length) {
            this.mask[i] = mask[i];
        } else {
            this.mask[i] = -1;
        }

        // Store the pattern pre-masked; positions past its end are zero
        if (i < pattern.length) {
            this.pattern[i] = (byte) (pattern[i] & this.mask[i]);
        } else {
            this.pattern[i] = 0;
        }
    }

    this.offsetRangeBegin = offsetRangeBegin;
    this.offsetRangeEnd = offsetRangeEnd;

    // Build the string representation. Needs to be unique, as
    // these get compared. Compute now as may get compared a lot!
    // NOTE(review): concatenating the byte arrays embeds their default
    // toString() (type and identity hash), not their contents - confirm
    // this is intended before changing it, since the value is compared.
    this.asString = "Magic Detection for " + type.toString() +
            " looking for " + pattern.length +
            " bytes = " + this.pattern +
            " mask = " + this.mask;
}

/**
* Creates a detector for input documents that meet the specified
* magic match.
Expand All @@ -119,6 +193,7 @@ public MagicDetector(

this.mask = new byte[length];
this.pattern = new byte[length];
this.patternType = "";

for (int i = 0; i < length; i++) {
if (mask != null && i < mask.length) {
Expand Down Expand Up @@ -187,16 +262,38 @@ public MediaType detect(InputStream input, Metadata metadata)
if (offset < offsetRangeBegin + length) {
return MediaType.OCTET_STREAM;
}

// PSM: Introduce matching using Regular Expressions
if( this.patternType.equals("regex") ) {
// System.out.println("Matching on regular expression: "+new String(this.pattern));
Pattern p = Pattern.compile(new String(this.pattern));

// Loop until we've covered the entire offset range
for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
boolean match = true;
for (int j = 0; match && j < length; j++) {
match = (buffer[i + j] & mask[j]) == pattern[j];
}
if (match) {
return type;
}
ByteBuffer bb = ByteBuffer.wrap(buffer);
CharBuffer result = Charset.forName("ISO-8859-1").decode(bb);
Matcher m = p.matcher(result);
//Matcher m = p.matcher(new String(buffer));
boolean match = false;
for (int i=0; i<= offsetRangeEnd - offsetRangeBegin; i++){
// System.out.println("Buffer["+i+"]: "+(new String(buffer)).substring(i, i+length));

m.region(i, length+i);
match = m.lookingAt(); // match regular expression from start of region
}
if(match){
return type;
}
} else {
// System.out.println("Matching using non-regex ("+new String(this.pattern)+")");
// Loop until we've covered the entire offset range
for (int i = 0; i <= offsetRangeEnd - offsetRangeBegin; i++) {
boolean match = true;
for (int j = 0; match && j < length; j++) {
match = (buffer[i + j] & mask[j]) == pattern[j];
}
if (match) {
return type;
}
}
}

return MediaType.OCTET_STREAM;
Expand Down
22 changes: 18 additions & 4 deletions tika-core/src/main/java/org/apache/tika/mime/MimeTypesReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ private Clause readMatch(Element element, MediaType mediaType) throws MimeTypeEx
}

MagicDetector detector = new MagicDetector(
mediaType, patternBytes, maskBytes, start, end);
mediaType, type, patternBytes, maskBytes, start, end);
Clause clause = new MagicMatch(detector, length);

List<Clause> subClauses = readMatches(element, mediaType);
Expand Down Expand Up @@ -285,10 +285,14 @@ private byte[] decodeValue(String type, String value)
radix = 8;
}

if (type.equals("string") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
if (type.equals("string") || type.equals("regex") || type.equals("unicodeLE") || type.equals("unicodeBE")) {
decoded = decodeString(value, type);

} else if (type.equals("byte")) {

// }else if (type.equals("regex")) {
// // PSM: Added regex support
// decoded = decodeRegex(value);
//
}else if (type.equals("byte")) {
decoded = tmpVal.getBytes();

} else if (type.equals("host16") || type.equals("little16")) {
Expand Down Expand Up @@ -382,6 +386,16 @@ else if("unicodeBE".equals(type)) {
throw new MimeTypeException("Invalid string value: " + value, e);
}
}

/**
* Decodes the regular expression into a byte array
* @param value
* @return
* @throws MimeTypeException
*/
// private byte[] decodeRegex(String value) throws MimeTypeException {
//
// }

/** Read Element named root-XML. */
private void readRootXML(Element element, MimeType mimeType) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -300,7 +300,8 @@
<acronym>PDF</acronym>
<_comment>Portable Document Format</_comment>
<magic priority="50">
<match value="%PDF-" type="string" offset="0"/>
<!-- <match value="%PDF-" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-" type="regex" offset="0"/>
</magic>
<glob pattern="*.pdf"/>
</mime-type>
Expand All @@ -309,71 +310,79 @@
<acronym>PDF 1.0</acronym>
<_comment>Portable Document Format - Version 1.0</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.0" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.0" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.0" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.1">
<acronym>PDF1.1</acronym>
<_comment>Portable Document Format - Version 1.1</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.1" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.1" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.1" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.2">
<acronym>PDF 1.2</acronym>
<_comment>Portable Document Format - Version 1.2</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.2" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.2" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.2" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.3">
<acronym>PDF 1.3</acronym>
<_comment>Portable Document Format - Version 1.3</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.3" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.3" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.3" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.4">
<acronym>PDF 1.4</acronym>
<_comment>Portable Document Format - Version 1.4</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.4" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.4" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.4" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.5">
<acronym>PDF 1.5</acronym>
<_comment>Portable Document Format - Version 1.5</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.5" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.5" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.5" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.6">
<acronym>PDF 1.6</acronym>
<_comment>Portable Document Format - Version 1.6</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.6" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.6" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.6" type="regex" offset="0"/>
</magic>
</mime-type>

<mime-type type="application/pdf; version=1.7">
<acronym>PDF 1.7</acronym>
<_comment>Portable Document Format - Version 1.7</_comment>
<sub-class-of type="application/pdf"/>
<magic priority="50">
<match value="%PDF-1.7" type="string" offset="0"/>
<magic priority="55">
<!-- <match value="%PDF-1.7" type="string" offset="0"/> -->
<match value="(?s)\\A.{0,144}%PDF-1\\.7" type="regex" offset="0"/>
</magic>
</mime-type>

Expand Down

0 comments on commit b8de9e7

Please sign in to comment.