Skip to content
This repository has been archived by the owner on Jun 18, 2020. It is now read-only.

Commit

Permalink
aho-corasick for persons
Browse files Browse the repository at this point in the history
  • Loading branch information
builtofire committed Jul 4, 2016
1 parent 63b4f43 commit 0f30c3f
Show file tree
Hide file tree
Showing 30 changed files with 216,192 additions and 36 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ public abstract class AExpander implements Expander {
static public final String EXPKEY_OCREDTEXT="ocredText";

@Override
public void expand(IngestedFile file) {
final public void expand(IngestedFile file) {
file.getAppliedExpanders().add(name);
file.setCntExpanders(file.getCntExpanders()+1);
expandSuper(file);
Expand Down
68 changes: 68 additions & 0 deletions entitycore/src/main/java/org/occrp/entityman/AExtractor.java
Original file line number Diff line number Diff line change
@@ -1,8 +1,10 @@
package org.occrp.entityman;

import java.util.List;
import java.util.Map;

import org.occrp.entityman.Extractor;
import org.occrp.entityman.model.IngestedFile;
import org.occrp.entityman.model.entities.AEntity;
import org.occrp.entityman.utils.EntitymanUtils;

Expand All @@ -11,6 +13,10 @@ public abstract class AExtractor implements Extractor {

private String name;

protected List<AFilter> filters;

protected List<Enricher> enrichers;

/**
 * Returns the name configured for this extractor; the same value is
 * recorded on each file's applied-extractor list by {@code extract}.
 *
 * @return the extractor name, or {@code null} if none has been set
 */
public String getName() {
return name;
}
Expand All @@ -23,9 +29,71 @@ public void setName(String name) {

private int excerptRadius = 50;

/**
 * Runs the complete extraction pipeline for one file.
 * <p>
 * The extractor name is appended to the file's applied-extractor list and
 * the file's extractor counter is incremented. The subclass-specific
 * {@link #extractSuper(IngestedFile)} is then invoked, and its result is
 * passed through the configured enrichers and finally the configured
 * filters.
 *
 * @param file the ingested file to extract entities from
 * @return the entities after enrichment and filtering
 */
@Override
final public List<AEntity> extract(IngestedFile file) {
    file.getAppliedExtractors().add(name);
    file.setCntExtractors(file.getCntExtractors()+1);
    List<AEntity> entities = extractSuper(file);

    // String.valueOf(Object) maps a missing entry to the literal text "null",
    // which enrichers would then treat as real document content. Use an empty
    // source text instead when no simple text has been expanded.
    // NOTE(review): empty string chosen over null so enrichers never see a
    // null source; confirm this matches the Enricher implementations.
    Object simpleText = file.getExpandedData().get(AExpander.EXPKEY_SIMPLETEXT);
    String src = (simpleText == null) ? "" : simpleText.toString();

    entities = doEnrich(entities, src);
    entities = doFilter(entities);

    return entities;
}

/**
 * Extractor-specific extraction step. It is called by
 * {@code extract(IngestedFile)}, which afterwards applies the configured
 * enrichers and filters to the returned list.
 *
 * @param file the ingested file to extract entities from
 * @return the raw entities found by this extractor
 */
abstract public List<AEntity> extractSuper(IngestedFile file);

/**
 * Applies every configured filter in list order, feeding the output of
 * each filter into the next one.
 *
 * @param entities the entities to filter
 * @return the output of the last filter, or {@code entities} itself when
 *         no filter list is configured
 */
public List<AEntity> doFilter(List<AEntity> entities) {
    if (filters == null) {
        return entities;
    }

    List<AEntity> remaining = entities;
    for (Filter current : filters) {
        remaining = current.filter(remaining);
    }
    return remaining;
}

/**
 * Offers every entity to every configured enricher. Entities are visited
 * in list order and, for each entity, enrichers are invoked in list order.
 *
 * @param aes the entities to enrich
 * @param src the source text handed to each enricher
 * @return the same list instance that was passed in
 */
public List<AEntity> doEnrich(List<AEntity> aes, String src) {
    if (enrichers == null) {
        return aes;
    }

    for (AEntity entity : aes) {
        for (Enricher candidate : enrichers) {
            candidate.tryEnrich(entity, src);
        }
    }
    return aes;
}

/**
 * Delegates to {@code EntitymanUtils.findExcerpt}, supplying this
 * extractor's configured excerpt radius as the fourth argument.
 *
 * @param s     the text to take the excerpt from
 * @param start start position passed through to the utility
 * @param end   end position passed through to the utility
 * @return whatever the utility returns for the given arguments
 */
public String findExcerpt(String s, int start, int end) {
    final String excerpt = EntitymanUtils.findExcerpt(s, start, end, excerptRadius);
    return excerpt;
}

/**
 * Returns the filters applied by {@code doFilter}.
 *
 * @return the configured filter list, or {@code null} if none was set
 */
public List<AFilter> getFilters() {
return filters;
}

/**
 * Sets the filters applied by {@code doFilter}; the list is stored as-is,
 * without copying.
 *
 * @param filters the filter list to use, or {@code null} to disable filtering
 */
public void setFilters(List<AFilter> filters) {
this.filters = filters;
}

/**
 * Returns the radius passed to {@code EntitymanUtils.findExcerpt} by
 * {@code findExcerpt}. The field is initialised to 50.
 *
 * @return the configured excerpt radius
 */
public int getExcerptRadius() {
return excerptRadius;
}

/**
 * Sets the radius passed to {@code EntitymanUtils.findExcerpt} by
 * {@code findExcerpt}.
 *
 * @param excerptRadius the new excerpt radius
 */
public void setExcerptRadius(int excerptRadius) {
this.excerptRadius = excerptRadius;
}

/**
 * Returns the enrichers applied by {@code doEnrich}.
 *
 * @return the configured enricher list, or {@code null} if none was set
 */
public List<Enricher> getEnrichers() {
return enrichers;
}

/**
 * Sets the enrichers applied by {@code doEnrich}; the list is stored
 * as-is, without copying.
 *
 * @param enrichers the enricher list to use, or {@code null} to disable enrichment
 */
public void setEnrichers(List<Enricher> enrichers) {
this.enrichers = enrichers;
}

}

Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.occrp.entityman.glutton.filters;
package org.occrp.entityman;

public abstract class AFilter implements Filter {

Expand Down
7 changes: 7 additions & 0 deletions entitycore/src/main/java/org/occrp/entityman/Enricher.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
package org.occrp.entityman;

import org.occrp.entityman.model.entities.AEntity;

/**
 * Callback that is offered an extracted entity together with a source text.
 * {@code AExtractor.doEnrich} invokes it once per entity for every
 * configured enricher.
 */
public interface Enricher {

    /**
     * Offers one entity to this enricher.
     *
     * @param ae  the entity to inspect
     * @param src the source text supplied by the caller
     */
    void tryEnrich(AEntity ae, String src);
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
package org.occrp.entityman.glutton.filters;
package org.occrp.entityman;

import java.util.List;

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
package org.occrp.entityman;
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ public class Fact extends AEntity {

private long position = -1;

private long positionEnd = -1;

private Map<String,Object> data = new HashMap<String, Object>();

public String getEntity() {
Expand Down Expand Up @@ -55,6 +57,14 @@ public Map<String, Object> getData() {
return data;
}

/**
 * Returns the end position of this fact. The field is initialised to -1.
 * NOTE(review): presumably the end offset of the match in the source
 * text, complementing {@code position} — confirm against the extractors.
 *
 * @return the stored end position
 */
public long getPositionEnd() {
return positionEnd;
}

/**
 * Sets the end position of this fact.
 *
 * @param positionEnd the new end position
 */
public void setPositionEnd(long positionEnd) {
this.positionEnd = positionEnd;
}

/**
 * Replaces the data map of this fact; the given map is stored as-is,
 * without copying.
 *
 * @param data the new data map
 */
public void setData(Map<String, Object> data) {
this.data = data;
}
Expand Down
10 changes: 8 additions & 2 deletions glutton/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -118,10 +118,16 @@
<dependency>
<groupId>net.sourceforge.tess4j</groupId>
<artifactId>tess4j</artifactId>
<!-- <version>2.0.1</version> -->
<version>1.5.0</version>
<version>2.0.1</version>
<!-- <version>1.5.0</version> -->
</dependency>

<!-- https://mvnrepository.com/artifact/org.ahocorasick/ahocorasick -->
<dependency>
<groupId>org.ahocorasick</groupId>
<artifactId>ahocorasick</artifactId>
<version>0.3.0</version>
</dependency>

</dependencies>

Expand Down
61 changes: 54 additions & 7 deletions glutton/src/main/java/org/occrp/entityman/glutton/Gluttony.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import org.apache.logging.log4j.Logger;
import org.occrp.entityman.Expander;
import org.occrp.entityman.Extractor;
import org.occrp.entityman.glutton.filters.Filter;
import org.occrp.entityman.Filter;
import org.occrp.entityman.model.IngestedFile;
import org.occrp.entityman.model.entities.AEntity;
import org.springframework.beans.factory.annotation.Autowired;
Expand All @@ -31,10 +31,7 @@ public class Gluttony {
@Autowired
private List<Filter> filters;

public List<AEntity> call(IngestedFile file) throws Exception {

List<AEntity> res = new LinkedList<>();

public void doExpand(IngestedFile file) {
for (Expander e : expanders) {
try {
log.debug("{} expanding : {}",e.getName(),file.getFileUri());
Expand All @@ -43,7 +40,11 @@ public List<AEntity> call(IngestedFile file) throws Exception {
log.error("Failed expander : {}",e.getName(),ex);
}
}

}

public List<AEntity> doExtract(IngestedFile file) {
List<AEntity> res = new LinkedList<>();

for (Extractor e : extractors) {
try {
log.debug("{} expanding : {}",e.getName(),file.getFileUri());
Expand All @@ -53,7 +54,12 @@ public List<AEntity> call(IngestedFile file) throws Exception {
log.error("Failed expander : {}",e.getName(),ex);
}
}

return res;
}

public List<AEntity> doFilter(List<AEntity> entities) {
List<AEntity> res = entities;
if (filters!=null) {
for (Filter f : filters) {
try {
Expand All @@ -63,8 +69,49 @@ public List<AEntity> call(IngestedFile file) throws Exception {
log.error("Failed filter : {}",f.getFilterName(),ex);
}
}

}
return res;
}

public List<AEntity> call(IngestedFile file) throws Exception {

// List<AEntity> res = new LinkedList<>();

// for (Expander e : expanders) {
// try {
// log.debug("{} expanding : {}",e.getName(),file.getFileUri());
// e.expand(file);
// } catch (Exception ex) {
// log.error("Failed expander : {}",e.getName(),ex);
// }
// }

doExpand(file);

// for (Extractor e : extractors) {
// try {
// log.debug("{} expanding : {}",e.getName(),file.getFileUri());
// res.addAll(e.extract(file));
//
// } catch (Exception ex) {
// log.error("Failed expander : {}",e.getName(),ex);
// }
// }

List<AEntity> res = doExtract(file);

// if (filters!=null) {
// for (Filter f : filters) {
// try {
// log.debug("{} filtering entities : {}",f.getFilterName(),res.size());
// res = f.filter(res);
// } catch (Exception ex) {
// log.error("Failed filter : {}",f.getFilterName(),ex);
// }
// }
// }

res = doFilter(res);

//res = entityManager.merge(res);

Expand Down
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
package org.occrp.entityman.glutton;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

import org.occrp.entityman.Expander;
import org.occrp.entityman.Extractor;
import org.occrp.entityman.Filter;
import org.occrp.entityman.glutton.ets.AhoCorasickExtractor;
import org.occrp.entityman.glutton.ets.PersonNameEnricher;
import org.occrp.entityman.glutton.ets.PrefixPostfixListEnricher;
import org.occrp.entityman.glutton.ets.RegexpExtractor;
import org.occrp.entityman.glutton.ets.RestStanfordExtractor;
import org.occrp.entityman.glutton.ets.StanfordExtractor;
import org.occrp.entityman.glutton.expanders.OpenocrExpander;
import org.occrp.entityman.glutton.expanders.TikaExpander;
import org.occrp.entityman.glutton.filters.DictionaryBstFilter;
import org.occrp.entityman.glutton.filters.DictionaryFilter;
import org.occrp.entityman.glutton.filters.Filter;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.context.annotation.Bean;
Expand Down Expand Up @@ -124,6 +128,71 @@ public RestStanfordExtractor extractorStanfordSpanish() {
return stanfordExtractor;
}

@Value("${ace.dictionary.company.md}")
String aceDictionaryCompaniesMd;

@Value("${ace.prefixes.company.md}")
String acePrefixesCompanyMd;

/**
 * Builds the {@code AhoCorasickExtractor} bean named "aceCompaniesMd",
 * which produces "Company" entities keyed by "name". It is backed by the
 * dictionary configured under {@code ace.dictionary.company.md}, and gets
 * one prefix/postfix enricher and one dictionary filter, both fed from
 * the resource configured under {@code ace.prefixes.company.md}.
 *
 * @return the configured extractor
 */
@Bean
public AhoCorasickExtractor aceCompaniesMd() {
    final AhoCorasickExtractor extractor = new AhoCorasickExtractor();
    extractor.setDictionary(aceDictionaryCompaniesMd);
    extractor.setName("aceCompaniesMd");
    extractor.setEntityName("Company");
    extractor.setEntityKey("name");

    // Enricher working on the "name" field, driven by the prefixes resource.
    final PrefixPostfixListEnricher prefixEnricher = new PrefixPostfixListEnricher();
    prefixEnricher.setFieldName("name");
    prefixEnricher.setFile(acePrefixesCompanyMd);
    extractor.setEnrichers(Arrays.asList(prefixEnricher));

    // Filter on the "name" field of Company entities, whitelisting from the
    // same prefixes resource.
    final DictionaryFilter prefixFilter = new DictionaryFilter();
    prefixFilter.setEntityType("Company");
    prefixFilter.setFieldName("name");
    prefixFilter.setFilterName("Company prefix filter");
    prefixFilter.setWhitelistResource(acePrefixesCompanyMd);
    extractor.setFilters(Arrays.asList(prefixFilter));

    return extractor;
}

@Value("${ace.dictionary.persons}")
String aceDictionaryPersons;

/**
 * Builds the {@code PersonNameEnricher} bean, configured to work on the
 * "name" field.
 *
 * @return the configured enricher
 */
@Bean
public PersonNameEnricher pne() {
    final PersonNameEnricher nameEnricher = new PersonNameEnricher();
    nameEnricher.setFieldName("name");
    return nameEnricher;
}

/**
 * Builds the {@code AhoCorasickExtractor} bean named "acePersons", which
 * produces "Person" entities keyed by "name". It is backed by the
 * dictionary configured under {@code ace.dictionary.persons}, enriched by
 * the {@link #pne()} bean, and filtered by a {@code DictionaryBstFilter}
 * whose whitelist is that same persons dictionary.
 *
 * @return the configured extractor
 */
@Bean
public AhoCorasickExtractor acePerson() {
    final AhoCorasickExtractor extractor = new AhoCorasickExtractor();
    extractor.setDictionary(aceDictionaryPersons);
    extractor.setName("acePersons");
    extractor.setEntityName("Person");
    extractor.setEntityKey("name");

    // Reuse the person-name enricher bean.
    extractor.setEnrichers(Arrays.asList(pne()));

    // Filter on the "name" field of Person entities, whitelisting from the
    // persons dictionary.
    final DictionaryBstFilter nameFilter = new DictionaryBstFilter();
    nameFilter.setEntityType("Person");
    nameFilter.setFieldName("name");
    nameFilter.setFilterName("Person name filter");
    nameFilter.setWhitelistResource(aceDictionaryPersons);
    extractor.setFilters(Arrays.asList(nameFilter));

    return extractor;
}

@Bean
public List<Extractor> extractors() {
List<Extractor> extractors = new ArrayList<>();
Expand All @@ -133,6 +202,8 @@ public List<Extractor> extractors() {
extractors.add(extractorPersonIdno());
extractors.add(extractorStanford());
extractors.add(extractorStanfordSpanish());
extractors.add(aceCompaniesMd());
extractors.add(acePerson());

return extractors;
}
Expand Down
Loading

0 comments on commit 0f30c3f

Please sign in to comment.