In [1]:
using TypedDelegation   
using StaticArrays
using Base.Test

In [2]:
using Tokenize
import Tokenize: Tokens
import Tokens: Token, startpos, endpos, untokenize, kind, exactkind



In [3]:
src = """
a=1
a+3
c=a+4; b=4

#A comment

function f(x)
    (1+2)/3
end

#= a 
multiline
comment
=#

println("abc ; efg")

\"""
a multiline
string
\"""

a+=2

map(1:5) do x
    a=2*x
    3*a
end

begin 
    c=1
end

if c
    d=3
end

"""
tokens = tokenize(src) |> collect;


In [4]:
immutable AnnotatedToken
    tok::Token
    notes::Dict{Symbol, Any}
end

AnnotatedToken(vv::Token) = AnnotatedToken(vv, Dict{Symbol, Any}())
@delegate_oneField(AnnotatedToken, tok, [exactkind, startpos, endpos, untokenize, kind])
token(atok::AnnotatedToken) = atok.tok

token (generic function with 1 method)

In [102]:

function Base.show(io::IO, atok::AnnotatedToken)
  start_r, start_c = startpos(atok)
  end_r, end_c = endpos(atok)
  str = kind(atok) == Tokens.ENDMARKER ? "" : untokenize(atok)
    pos_str = string(start_r, ",", start_c, "-", end_r,   ",", end_c, ": ")
    
    print(io, rpad(pos_str, 21, " "))
    show(io, str)
    print(io, "   ", exactkind(atok), " (",lowercase(string(kind(atok))),")","\t")
    
    for (tag,val) in atok.notes
        print(io, " | ", string(tag),": "))
        if isa(val, AnnotatedToken)
            show(io, token(val))
        else
            show(val)
        end
    end
    println(io)
end

LoadError: syntax: extra token ")" after end of expression

- **New line** always ends expressions
 - Unless Immediately within a block opened by `(`
 - Or where the last nonwhitespace token was binary operator (including a `,`?)

In [103]:
isnewline(tok) = kind(tok)==Tokens.WHITESPACE && '\n' ∈ untokenize(tok)




isnewline (generic function with 1 method)

In [104]:
@test isnewline(tokens[end-1])

[1m[32mTest Passed
[0m  Expression: isnewline(tokens[end - 1])

In [105]:
const identifier_phrase_end_kinds = SVector[Tokens.IDENTIFIER, Tokens.RPAREN, Tokens.RSQUARE]



3-element Array{StaticArrays.SVector,1}:
 Tokenize.Tokens.Kind[IDENTIFIER]
 Tokenize.Tokens.Kind[RPAREN]    
 Tokenize.Tokens.Kind[RSQUARE]   

In [106]:
function annotate(toks)
    subclause_markers = Dict(
            Tokens.LBRACE => Tokens.RBRACE,
            Tokens.LPAREN => Tokens.RPAREN,
            Tokens.LSQUARE => Tokens.RSQUARE,
            Tokens.FOR => Tokens.END,
            Tokens.WHILE => Tokens.END,
            Tokens.IF => Tokens.END,
            Tokens.TRY => Tokens.END,
            Tokens.BEGIN => Tokens.END,
            Tokens.QUOTE => Tokens.END
            
    )
    open_subclauses = AnnotatedToken[]
    has_open() = length(open_subclauses) > 0
    last_open_exactkind() = exactkind(open_subclauses[end])
    is_next_closer(atok) = has_open() && exactkind(atok) == subclause_markers[last_open_exactkind()]
    
    in_paren() = has_open() && last_open_exactkind() == Tokens.LPAREN
    in_square() = has_open() && last_open_exactkind() == Tokens.LSQUARE
    

    
    
    
    history = SVector(Tokens.ENDMARKER, Tokens.ENDMARKER)
    function add_history!(atok)
        keep = true
        keep &= kind(atok) != Tokens.COMMENT
        keep &= kind(atok) != Tokens.WHITESPACE 
        keep |= isnewline(atok) && history[end] != Tokens.WHITESPACE
        
        if keep
            history = @SVector([shift(history)..., kind(atok)])
        end
    end
    
    am_continuing_operation() = history[end]==Tokens.OP && history[end-1] ∈ identifier_phrase_end_kinds

    
    function handle_ifclose!(atok)
        if  is_next_closer(atok)
            opener = pop!(open_subclauses)
            opener.notes[:closedby] = atok
            atok.notes[:openedby] = opener
        end
    end
    
    function handle_ifopen!(atok)
        if haskey(subclause_markers, exactkind(atok))
            push!(open_subclauses, atok)
        end
    end
    
    
    
    "Handless expression enders, (this excludes closers)."
    function handle_expression_enders!(atok)
        is_ender = ((!in_paren() &&  !am_continuing_operation() && isnewline(atok)) 
                        || exactkind(atok) == Tokens.COMMA    
                        || exactkind(atok) == Tokens.SEMICOLON)    
        if is_ender
            atok.notes[:ender] = true
        end
    end
    
    
    outstream = AnnotatedToken[]
    for tok in toks
        atok = AnnotatedToken(tok)
        push!(outstream, atok)
                
        handle_ifopen!(atok)
        handle_ifclose!(atok)
        handle_expression_enders!(atok)
        
        add_history!(atok)         
    end
    outstream
end



annotate (generic function with 1 method)

In [107]:
show.(annotate(tokens));

1,1-1,1:             "a"   IDENTIFIER (identifier)	
1,2-1,2:             "="   EQ (op)	
1,3-1,3:             "1"   INTEGER (integer)	
1,4-2,0:             "\n"   WHITESPACE (whitespace)	 | ender:true
2,1-2,1:             "a"   IDENTIFIER (identifier)	
2,2-2,2:             "+"   PLUS (op)	
2,3-2,3:             "3"   INTEGER (integer)	
2,4-3,0:             "\n"   WHITESPACE (whitespace)	 | ender:true
3,1-3,1:             "c"   IDENTIFIER (identifier)	
3,2-3,2:             "="   EQ (op)	
3,3-3,3:             "a"   IDENTIFIER (identifier)	
3,4-3,4:             "+"   PLUS (op)	
3,5-3,5:             "4"   INTEGER (integer)	
3,6-3,6:             ";"   SEMICOLON (semicolon)	 | ender:true
3,7-3,7:             " "   WHITESPACE (whitespace)	
3,8-3,8:             "b"   IDENTIFIER (identifier)	
3,9-3,9:             "="   EQ (op)	
3,10-3,10:           "4"   INTEGER (integer)	
3,11-5,0:            "\n\n"   WHITESPACE (whitespace)	 | ender:true
5,1-5,10:            "#A comment"   COMMENT (comment)	
5,

In [109]:
show.((tokenize("""
(a +
b)
{a +
b}
[a +
b]


""") |> collect ) |> annotate);

1,1-1,1:             "("   LPAREN (lparen)	 | closedby:2,2-2,2:   RPAREN	")"
1,2-1,2:             "a"   IDENTIFIER (identifier)	
1,3-1,3:             " "   WHITESPACE (whitespace)	
1,4-1,4:             "+"   PLUS (op)	
1,5-2,0:             "\n"   WHITESPACE (whitespace)	
2,1-2,1:             "b"   IDENTIFIER (identifier)	
2,2-2,2:             ")"   RPAREN (rparen)	 | openedby:1,1-1,1:   LPAREN	"("
2,3-3,0:             "\n"   WHITESPACE (whitespace)	 | ender:true
3,1-3,1:             "{"   LBRACE (lbrace)	 | closedby:4,2-4,2:   RBRACE	"}"
3,2-3,2:             "a"   IDENTIFIER (identifier)	
3,3-3,3:             " "   WHITESPACE (whitespace)	
3,4-3,4:             "+"   PLUS (op)	
3,5-4,0:             "\n"   WHITESPACE (whitespace)	 | ender:true
4,1-4,1:             "b"   IDENTIFIER (identifier)	
4,2-4,2:             "}"   RBRACE (rbrace)	 | openedby:3,1-3,1:   LBRACE	"{"
4,3-5,0:             "\n"   WHITESPACE (whitespace)	 | ender:true
5,1-5,1:             "["   LSQUARE (lsquare)	 | close