Skip to content

Commit

Permalink
Use all annotations for activities
Browse files: browse the repository at this point in the history
  • Loading branch information
Barry Coughlan committed Mar 5, 2013
1 parent 0ce15d8 commit eabfec0
Show file tree
Hide file tree
Showing 3 changed files with 14 additions and 16 deletions.
Original file line number | Diff line number | Diff line change
Expand Up @@ -13,19 +13,14 @@ private static void usage() {
}

public static void main(String[] args) throws Exception {
// It is better to batch convert because Gate.init() takes a few seconds
// to run.
// Yes, this process is tied together with string.
// It is better to batch convert because Gate.init() takes a few seconds to run.

if (args.length <= 2) {
usage();
return;
}

String gappFile = args[0];
System.out.println(args[0]);
System.out.println(args[1]);
System.out.println(args[2]);
GATEAnnotator batchProcessApp = new GATEAnnotatorImpl();
batchProcessApp.loadGappFile(new File(gappFile));

Expand Down
5 changes: 3 additions & 2 deletions rdfconverters/ie2rdf/gatexmlgenerator/icbnltk.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -73,7 +73,8 @@ def remove_parents(icb, uris):

def remove_business_gunk(phrase):
gunk = ['wide range', 'company', 'companies', 'industry', 'international',
'services?', 'product(?!ion)s?', 'solutions', 'leading', 'sales', 'sector', 'integrated']
'services?', 'product(?!ion)s?', 'solutions', 'leading', 'sales', 'sector', 'integrated',
'largest', 'business', 'industr(y|ies)', 'systems?', 'innovat(e|ive)', 'markets?']
for g in gunk:
phrase = re.sub(g, '', phrase)
return phrase
Expand All @@ -98,7 +99,7 @@ def stem(word):


def tokenize_words(text):
words = tokenize.word_tokenize(text.lower())
words = tokenize.word_tokenize(text.lower().replace('-', ' '))
# Remove stopwords and punctuation
stop = stopwords.words("english") + list(";:'\".,&()")
words = (w for w in words if w not in stop)
Expand Down
18 changes: 10 additions & 8 deletions rdfconverters/ie2rdf/gatexmlgenerator/ie.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -111,7 +111,8 @@ def convert(document):
companyName = ""

# Activity - ------------------------------------------------------------------------
activities = {a: icbnltk.icb_matches(a) for a in activityList}
activitySet = set(activityList)
activities = icbnltk.icb_matches(' '.join(activitySet))


# Employee - number ----------------------------------------------------------------------
Expand Down Expand Up @@ -239,18 +240,19 @@ def make_element(tag_name, **attributes):

def make_annotation(value):
annotation = doc.createElement("annotation")
annotationText = doc.createTextNode(source_text)
annotationText = doc.createTextNode(value)
annotation.appendChild(annotationText)
return annotation

#Activity
if len(activities) > 0:
for source_text, matches in activities.items():
for icb_number, icb_activity in matches:
el = make_element("activity", label=icb_activity, id=icb_number)
annotation = make_annotation(source_text)
if activities is not None and len(activities) > 0:
for icb_number, icb_activity in activities:
el = make_element("activity", label=icb_activity, id=icb_number)

for activity in activitySet:
annotation = make_annotation(activity)
el.appendChild(annotation)
cp.appendChild(el)
cp.appendChild(el)

# Location
if locResult != "":
Expand Down

0 comments on commit eabfec0

Please sign in to comment.