Skip to content

Commit

Permalink
Now produces basic corpus overview.
Browse files Browse the repository at this point in the history
  • Loading branch information
anjackson committed Nov 18, 2012
1 parent 1166691 commit bee5964
Show file tree
Hide file tree
Showing 5 changed files with 65 additions and 22 deletions.
7 changes: 1 addition & 6 deletions tools/coverage/pom.xml
Expand Up @@ -22,7 +22,7 @@
<configuration>
<archive>
<manifest>
<mainClass>org.opf_labs.fmts.CoverageAnalysis</mainClass>
<mainClass>org.opf_labs.fmts.CorpusCoverageAnalysis</mainClass>
</manifest>
</archive>
<descriptorRefs>
Expand Down Expand Up @@ -81,11 +81,6 @@
<type>jar</type>
<scope>compile</scope>
</dependency>
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>org.opf-labs.planets</groupId>
<artifactId>core-techreg</artifactId>
Expand Down
@@ -1,3 +1,18 @@
/**
* Copyright (C) 2012 Andrew Jackson <Andrew.Jackson@bl.uk>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
Expand All @@ -7,10 +22,7 @@
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Properties;

import org.apache.tika.Tika;
import org.opf_labs.fmts.fidget.IdentificationResult;
Expand All @@ -23,23 +35,21 @@
public class CorpusCoverageAnalysis {

private File root;
// NOTE: using Tika directly also picks up the custom signatures.
private Tika tika = new Tika();
private TikaSigTester tBase = TikaSigTester.justTika();
private TikaSigTester tCustom = TikaSigTester.justCustom();
private TikaSigTester tAll = TikaSigTester.vanilla();

public CorpusCoverageAnalysis(File file) throws IOException {
this.root = file.getCanonicalFile();
System.out.println("Tika Version: "+CoverageAnalysis.getComponentVersion("org.apache.tika", "tika-core"));
System.out.println("Fidget Version: "+CoverageAnalysis.getComponentVersion("org.opf-labs.fmt", "fidget"));
System.exit(1);

}

public void ident() throws FileNotFoundException, URISyntaxException {
public void ident() throws URISyntaxException, IOException {
this.ident(root);
}

private void ident( File folder ) throws FileNotFoundException, URISyntaxException {
private void ident( File folder ) throws URISyntaxException, IOException {
for( File f : folder.listFiles() ) {
if( !f.isHidden() && !f.getName().equals("tools") ) {
if( f.isDirectory() ) {
Expand All @@ -50,12 +60,14 @@ private void ident( File folder ) throws FileNotFoundException, URISyntaxExcepti
IdentificationResult ibs = tBase.identify( new FileInputStream(f) );
IdentificationResult iaf = tAll.identify(f);
IdentificationResult ias = tAll.identify( new FileInputStream(f) );
System.out.println("\""+ibf.getLocation().normalize().toASCIIString().replaceFirst(root.toURI().toASCIIString(), "")+"\""
+", \""+ibf.getMime()+"\""
+", \""+ibs.getMime()+"\""
+", \""+iaf.getMime()+"\""
+", \""+ias.getMime()+"\""
);
System.out.println("\""+iaf.getLocation().normalize().toASCIIString().replaceFirst(root.toURI().toASCIIString(), "")+"\""
// +", \""+tika.detect(f)+"\""
// +", \""+tika.detect( new FileInputStream(f) )+"\""
+", \""+ibf.getMime()+"\""
+", \""+ibs.getMime()+"\""
+", \""+iaf.getMime()+"\""
+", \""+ias.getMime()+"\""
);
}
}
}
Expand All @@ -67,8 +79,11 @@ private void ident( File folder ) throws FileNotFoundException, URISyntaxExcepti
* @throws IOException
*/
public static void main(String[] args) throws URISyntaxException, IOException {
CorpusCoverageAnalysis cca = new CorpusCoverageAnalysis(new File("../.."));
CorpusCoverageAnalysis cca = new CorpusCoverageAnalysis(new File(args[0]));
System.out.println("# Path, Tika, Tika (stream only), Fidget, Fidget (stream only)");
cca.ident();
System.out.println("# Fidget Version: "+CoverageAnalysis.getComponentVersion("org.opf-labs.fmt", "fidget"));
System.out.println("# Tika Version: "+CoverageAnalysis.getComponentVersion("org.apache.tika", "tika-core"));
}

}
Expand Up @@ -101,6 +101,9 @@ public static void main(String[] args) throws FileNotFoundException, URISyntaxEx
*/
public static String getComponentVersion(String groupId, String artifactId) {
InputStream r = CoverageAnalysis.class.getResourceAsStream( "/META-INF/maven/"+groupId+"/"+artifactId+"/pom.properties");
if( r == null ) {
return null;
}
Properties p = new Properties();
try {
p.load(r);
Expand Down
15 changes: 15 additions & 0 deletions tools/coverage/src/main/java/org/opf_labs/fmts/DroidTypes.java
@@ -1,3 +1,18 @@
/**
* Copyright (C) 2012 Andrew Jackson <Andrew.Jackson@bl.uk>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
Expand Down
15 changes: 15 additions & 0 deletions tools/coverage/src/main/java/org/opf_labs/fmts/TikaTypes.java
@@ -1,3 +1,18 @@
/**
* Copyright (C) 2012 Andrew Jackson <Andrew.Jackson@bl.uk>
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
*
*/
Expand Down

0 comments on commit bee5964

Please sign in to comment.