Skip to content

Commit

Permalink
Fixed up the overall-dist script.
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Apr 18, 2012
1 parent af08ee0 commit 860ddee
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 9 deletions.
37 changes: 29 additions & 8 deletions nanite-hadoop/src/main/python/overall-dist.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,22 +19,43 @@ def __iter__(self):
dst = {}
for row in tsv_file:
#print row
#
fmtS = row[0]
fmtT = row[1]
fmtD = row[2]
year = row[3]
count = row[4]
fmt = fmtD
if( fmt == "application/octet-stream" or fmt.startswith("application/x-ole2-compound-document-format") ):
fmt = fmtT
if( fmt.startswith("text/")):
fmt = fmtT
if( fmt == "null" ):
fmt = "application/octet-stream"
if( fmt == "application/octet-stream" ):# or fmt.startswith("application/x-puid-fmt-111") ):
fmt = fmtD
# Fall back on DROID if it has a version and tika does not:
if( fmtD.find('version=') != -1 and fmt.find("version=") == -1):
fmt = fmtD
# For unrecognised formats, or
if( fmt.startswith("text/plain") or fmt == "application/octet-stream" ):
fmt = fmtS
# Normalise, lower case and no space after the ;
fmt = fmt.lower()
fmt = fmt.strip();
fmt = fmt.rstrip(";");
(type, subtype, params) = mimeparse.parse_mime_type(fmt)
#fmt = fmt.lower()
fmt = fmt.strip()
fmt = fmt.rstrip(";")
if( fmt == "null" ):
fmt = "null/null"
if( fmt == "text" ):
fmt = "text/plain"
try:
(type, subtype, params) = mimeparse.parse_mime_type(fmt)
except:
print "ERROR: Could not parse: "+fmt
exit
fmt = type+"/"+subtype
# Add version, if required:
if False and params.has_key('version'):
v = params['version']
if not v.startswith('"'):
v = '"'+v+'"'
fmt = fmt+'; version='+v
if not fmt in dst:
dst[fmt] = 0
dst[fmt] += int(count)
Expand Down
24 changes: 24 additions & 0 deletions nanite-tika/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,30 @@ Ideas
-----

* Should normalise/limit the extended MIME type work, and create a new job that spews out ALL the properties Tika reports in some suitable form.
* DROID still returns double forms sometimes: 'application/rtf, text/rtf'. Note that application/rtf is a superset (according to http://www.iana.org/assignments/media-types/application/rtf).
* SAME for 'audio/vnd.rn-realaudio, audio/x-pn-realaudio', we'll go with the unregistered 'audio/vnd.rn-realaudio'.
* 'application/lotus123, application/vnd.lotus-1-2-3;', but Tika returns 'application/x-123'!
* 'application/vnd.lotus-1-2-3, application/x-123' from DROID!
* 'application/lwp, application/vnd.lotus-wordpro'
* 'image/vnd.microsoft.icon, image/x-icon', vnd again
* 'application/x-endnote-connect, application/x-endnote-connection', use the long one.
* Gah, sometimes contain NULL bytes. 'Digipath^@', 'Acrobat 4.05 Import Plug-in for Windows^@', 'Acrobat 3.0 Scan Plug-in^@', 'Document Project PDF Creator 0.2^@'. So, %s/<Ctrl-V><Ctrl-2>//g
* And some ^K, ^L, ^A, <92>
* Replace some ';' in parameters with ',' and '=' with ''.
* TODO Check allowed characters and enforce.
* Need to create MimeTypes consistently, so values are "enquoted" consistently.

Could not parse: text/html; charset: utf-8; charset=utf-8
Could not parse: image/jpeg; software="adobe photoshop cs3 (10.0x20061208 [20061208.beta.1251 2006/12/08:02:00:00 cutoff; m branch]) macintosh"
Could not parse: image/jpeg; software="adobe photoshop cs3 (10.0x20061208 [20061208.beta.1251 2006/12/08:02:00:00 cutoff; m branch]) macintosh"; hardware="canon eos 5d"
Could not parse: application/pdf; producer="itext by lowagie.com (r1.02b;p128)"; version=1.4
Could not parse: application/xhtml+xml; software="mozilla/4.0 (compatible; www.precedent.co.uk|webpilot v2.3.0.5; windows nt 5.0)"; encoding=iso-8859-1; charset=iso-8859-1

ERROR: Could not parse: text/html; software="mozilla/4.0 (compatible; xhtml-cms; windows nt 5.0) mshtml->xhtml-cms/2.2.1.1"; charset=us-ascii
ERROR: Could not parse: text/html; software="adobe photoshop(r) cs web photo gallery> <meta http-equiv="; charset=iso-8859-1
ERROR: Could not parse: text/html; software="mozilla/4.0 (compatible; xhtml-cms; windows nt 5.0) mshtml->xhtml-cms/2.3.0.0"; charset=us-ascii
ERROR: Could not parse: application/pdf; producer="itext by lowagie.com (r1.02b;p128)"; version=1.4
ERROR: Could not parse: application/xhtml+xml; software="mozilla/4.0 (compatible; www.precedent.co.uk|webpilot v2.3.0.5; windows nt 5.0)"; encoding=iso-8859-1; charset=iso-8859-1



Expand Down
13 changes: 13 additions & 0 deletions nanite-tika/src/etc/header.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Copyright (C) ${year} ${user.name} <${email}>

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,10 @@ public void parse(InputStream stream, ContentHandler handler,
tikaType.setHardware( metadata.get( Metadata.EQUIPMENT_MODEL));

// Return extended MIME Type:
metadata.set(EXT_MIME_TYPE, tikaType.toString());
if( tikaType != null ) {
metadata.set(EXT_MIME_TYPE, tikaType.toString());
// } else { \\ Set to application/octet-stream
}

// Other sources of modification time?
//md.get(Metadata.LAST_MODIFIED); //might be useful, as would any embedded version
Expand Down

0 comments on commit 860ddee

Please sign in to comment.