In [88]:
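# Note: SemCor is *not* XML (attribute values are unquoted, e.g. pos=NN), so the tag files are parsed line by line with regexes instead of an XML parser.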

In [89]:
# Common supertype of every token kind read from a SemCor tag file.
abstract TaggedWord

# A token that carries a WordNet sense annotation on top of its POS tag.
# All fields are kept as text. The field order is the positional-constructor
# order, which parse_sense_annotated_word relies on when it splats its regex
# captures into the constructor.
immutable SenseAnnotatedWord{T<:AbstractString} <: TaggedWord
    pos::T      # value of the pos= attribute, e.g. "NN"
    lemma::T    # value of the lemma= attribute, e.g. "report"
    wnsn::T     # value of the wnsn= attribute; may hold several ';'-separated entries
    lexsn::T    # value of the lexsn= attribute, e.g. "1:10:00::"
    word::T     # text between <wf ...> and </wf>
end


# A token that only carries a POS tag (no lemma / sense information).
immutable PosTaggedWord{T<:AbstractString} <: TaggedWord
    pos::T      # value of the pos= attribute, e.g. "IN"
    word::T     # text between <wf ...> and </wf>
end


LoadError: LoadError: invalid redefinition of constant SenseAnnotatedWord
while loading In[89], in expression starting on line 2

In [154]:
# Scratch check of the lexsn capture group on text with a trailing attribute.
# The greedy `\d.*` first runs to the end of the text and then backtracks to the
# last ':', so the capture keeps the trailing "::" but drops " pn=group".
let pattern = r"lexsn=(\d.*:\d*).*", sample = "lexsn=1:03:00:: pn=group"
    match(pattern, sample)
end

RegexMatch("lexsn=1:03:00:: pn=group", 1="1:03:00::")

In [198]:

# Parse one sense-annotated SemCor token line, e.g.
#   <wf cmd=done pos=NN lemma=report wnsn=20 lexsn=1:10:00::>reports</wf>
# into a SenseAnnotatedWord(pos, lemma, wnsn, lexsn, word).
#
# Other attributes may appear before pos= and after lexsn=; they are skipped.
# Throws an ArgumentError when `line` is not a sense-annotated <wf cmd=done ...> line.
function parse_sense_annotated_word(line::AbstractString)
    m = match(r"<wf cmd=done.* pos=(.*) lemma=(.*) wnsn=(.*) lexsn=(\d.*:\d*).*>(.*).*</wf>", line)
    # `match` returns `nothing` when the pattern does not apply; report the
    # offending line instead of failing on `.captures` with an opaque error.
    m === nothing && throw(ArgumentError("Not a sense-annotated <wf> line: \"$line\""))
    # The five capture groups are in the same order as the struct's fields.
    return SenseAnnotatedWord(m.captures...)
end

# Parse one POS-tagged SemCor token line, e.g.
#   <wf cmd=ignore pos=IN>of</wf>
# into a PosTaggedWord(pos, word).
#
# Throws an ArgumentError when `line` is not a <wf ...> line with a pos= attribute.
function parse_tagged_word(line::AbstractString)
    # The POS capture stops at the first space or '>' so that attributes following
    # pos= (e.g. `pos=VB ot=notag`) are skipped rather than swallowed into the tag;
    # `[^>]*>` then consumes the rest of the opening tag.
    m = match(r"<wf cmd=.* pos=([^ >]*)[^>]*>(.*)</wf>", line)
    # `match` returns `nothing` when the pattern does not apply; report the
    # offending line instead of failing on `.captures` with an opaque error.
    m === nothing && throw(ArgumentError("Not a POS-tagged <wf> line: \"$line\""))
    return PosTaggedWord(m.captures...)
end


parse_tagged_word (generic function with 1 method)

In [199]:
Base.Test.@testset "Sense Annoations" begin
    # Extra attributes before pos= (rdf=) and after lexsn= (pn=) must be skipped.
    eg = """<wf cmd=done rdf=group pos=NNP lemma=group wnsn=1 lexsn=1:03:00:: pn=group>Fulton_County_Grand_Jury</wf>"""
    parsed = parse_sense_annotated_word(eg)
    Base.Test.@test parsed.pos == "NNP"
    Base.Test.@test parsed.lemma == "group"
    Base.Test.@test parsed.wnsn == "1"
    Base.Test.@test parsed.lexsn == "1:03:00::"
    Base.Test.@test parsed.word == "Fulton_County_Grand_Jury"

    # lexsn whose last two fields are filled in ("past:00") rather than empty ("::").
    # (Comparing the parsed value with `false` could never fail, so check a field.)
    eg = """<wf cmd=done pos=JJ lemma=recent wnsn=2 lexsn=5:00:00:past:00>recent</wf>"""
    Base.Test.@test parse_sense_annotated_word(eg).lexsn == "5:00:00:past:00"

    # Several ';'-separated senses in both wnsn and lexsn.
    eg = """<wf cmd=done pos=NN lemma=year wnsn=2;1 lexsn=1:28:02::;1:28:01::>year</wf>"""
    parsed = parse_sense_annotated_word(eg)
    Base.Test.@test parsed.wnsn == "2;1"
    Base.Test.@test parsed.lexsn == "1:28:02::;1:28:01::"

    # Plain single-sense token; also shown in the cell output as a sample value.
    eg = """<wf cmd=done pos=NN lemma=report wnsn=20 lexsn=1:10:00::>reports</wf>"""
    parsed = parse_sense_annotated_word(eg)
    display(parsed)
    Base.Test.@test parsed.pos == "NN"
    Base.Test.@test parsed.lemma == "report"
    Base.Test.@test parsed.wnsn == "20"
    Base.Test.@test parsed.lexsn == "1:10:00::"
    Base.Test.@test parsed.word == "reports"

end;

Base.Test.@testset "POS Tag Annoations" begin
    # A token that only has a POS tag: parse it once and check both fields.
    tagged = parse_tagged_word("""<wf cmd=ignore pos=IN>of</wf>""")
    Base.Test.@test tagged.pos == "IN"
    Base.Test.@test tagged.word == "of"
end;

SenseAnnotatedWord{SubString{String}}("NN","report","20","1:10:00::","reports")

Test Summary:    | Pass  Total
  Sense Annoations |   13     13
Test Summary:      | Pass  Total
  POS Tag Annoations |    2      2


In [200]:
# Parse the lines of one SemCor tag file into sentences.
#
# `lines` is any iterable of strings (e.g. `eachline(filename)`). Each line is
# dispatched on its prefix:
#   * "<s"   opens a new sentence,
#   * "<wf " lines append a token to the sentence opened last,
#   * punctuation, context, paragraph and closing-sentence lines are skipped.
#
# Returns a Vector{Vector{TaggedWord}}, one inner vector per sentence.
# Raises an error naming the offending line when a line has no known prefix or
# when its handler fails (including a token line seen before any "<s" line).
function parse_semcore(lines)
    #Goldplate: There is a nicer way to do this, lazily, with coroutines.
    sents = Vector{TaggedWord}[]

    ignore(line) = nothing
    start_sentence(line) = push!(sents, TaggedWord[])
    get_tagged(line) = push!(sents[end], parse_tagged_word(line))
    get_sense_annotated(line) = push!(sents[end], parse_sense_annotated_word(line))
    # cmd=done lines carry a sense annotation only when they have a lemma.
    get_done(line) = contains(line, "lemma=") ? get_sense_annotated(line) : get_tagged(line)

    # Ordered (not a Dict) so that matching is deterministic: the first matching
    # prefix wins, hence "<punc>" must be listed before the shorter "<p".
    subparsers = (
        "<wf cmd=tag" => get_tagged,
        "<wf cmd=ignore" => get_tagged,
        "<wf cmd=done" => get_done,
        "<punc>" => ignore,
        "<context" => ignore,
        "</context" => ignore,
        "<p" => ignore,          # paragraph start
        "</p" => ignore,         # paragraph end
        "<s" => start_sentence,  # sentence start
        "</s" => ignore,         # sentence end
    )

    for line in lines
        try
            found = false
            for (prefix, subparse) in subparsers
                if startswith(line, prefix)
                    found = true
                    subparse(line)
                    break
                end
            end
            # Explicit check rather than @assert: this validates input data.
            found || error("No parser for \"$line\"")
        catch ee
            # Re-raise with the offending line so bad corpus data is easy to locate.
            error("Error parsing \"$line\". $ee")
        end
    end

    return sents
end

parse_semcore (generic function with 1 method)

In [201]:
# Build a Task that walks every entry of `tagdir_path`, parses it with
# parse_semcore and produces the parsed sentences one at a time, in order.
# The files are only read once the returned Task is consumed (e.g. by `collect`).
function load_semcor(tagdir_path)
    Task() do
        for entry in readdir(tagdir_path)
            tagfile = joinpath(tagdir_path, entry)
            for sentence in parse_semcore(eachline(tagfile))
                produce(sentence)
            end
        end
    end
end

load_semcor (generic function with 1 method)

In [202]:
# Drain the Task returned by load_semcor into a vector holding every parsed
# sentence; the trailing `;` keeps the notebook from echoing the whole collection.
sents = collect(load_semcor("corpora/semcor2.1/brown1/tagfiles/"));

In [204]:
# Spot-check the result: show the second-to-last parsed sentence.
sents[end-1]

46-element Array{TaggedWord,1}:
 PosTaggedWord{SubString{String}}("PRP","She")                                                    
 SenseAnnotatedWord{SubString{String}}("VB","be","1","2:42:03::","was")                           
 PosTaggedWord{SubString{String}}("DT","a")                                                       
 SenseAnnotatedWord{SubString{String}}("JJ","living","4","5:00:00:absolute:00","living")          
 SenseAnnotatedWord{SubString{String}}("NN","doll","1","1:06:00::","doll")                        
 PosTaggedWord{SubString{String}}("CC","and")                                                     
 PosTaggedWord{SubString{String}}("DT","no")                                                      
 SenseAnnotatedWord{SubString{String}}("NN","mistake","2","1:09:00::","mistake")                  
 PosTaggedWord{SubString{String}}("DT","the")                                                     
 SenseAnnotatedWord{SubString{String}}("JJ","blue-black","1","5:00:00:achroma

In [None]:
;g