In [1]:
using TypedDelegation   
using StaticArrays
using Base.Test
using AbstractTrees

In [2]:
using Tokenize
import Tokenize: Tokens
import Tokens: Token, startpos, endpos, untokenize, kind, exactkind

import Base: getindex, setindex!, pop!, push!, endof

In [3]:
const identifier_phrase_end_kinds = SVector[Tokens.IDENTIFIER, Tokens.RPAREN, Tokens.RSQUARE]

const subclause_markers = Dict(
    Tokens.LBRACE => Tokens.RBRACE,
    Tokens.LPAREN => Tokens.RPAREN,
    Tokens.LSQUARE => Tokens.RSQUARE,
    Tokens.FOR => Tokens.END,
    Tokens.WHILE => Tokens.END,
    Tokens.IF => Tokens.END,
    Tokens.TRY => Tokens.END,
    Tokens.BEGIN => Tokens.END,
    Tokens.QUOTE => Tokens.END,
    Tokens.MODULE => Tokens.END,
    Tokens.TYPE => Tokens.END,
    Tokens.BAREMODULE => Tokens.END,
    Tokens.IMMUTABLE => Tokens.END,
    Tokens.DO => Tokens.END,
    Tokens.FUNCTION => Tokens.END
    )

Dict{Tokenize.Tokens.Kind,Tokenize.Tokens.Kind} with 15 entries:
  LSQUARE    => RSQUARE
  WHILE      => END
  FUNCTION   => END
  LPAREN     => RPAREN
  TYPE       => END
  TRY        => END
  LBRACE     => RBRACE
  DO         => END
  IMMUTABLE  => END
  BAREMODULE => END
  BEGIN      => END
  QUOTE      => END
  IF         => END
  MODULE     => END
  FOR        => END

In [4]:
src = """
a=1
a+3
c=a+4; b=4

#A comment

function f(x)
    (1+2)/3
end

#= a 
multiline
comment
=#

println("abc ; efg")

\"""
a multiline
string
\"""

a+=2

map(1:5) do x
    a=2*x
    3*a
end

begin 
    c=1
end

if c
    d=3
end

"""
tokens = tokenize(src) |> collect;


In [5]:
type AnnotatedToken
    tok::Token
    buddy::Nullable{AnnotatedToken}
    ender::Bool
end

AnnotatedToken(vv::Token) = AnnotatedToken(vv, Nullable{AnnotatedToken}(), false)
@delegate_oneField(AnnotatedToken, tok, [exactkind, startpos, endpos, untokenize, kind])
token(atok::AnnotatedToken) = atok.tok


"""
Identifies if a token ends an expression
Requres atok to be properly initialised
"""
is_ender(atok::AnnotatedToken) = atok.ender


"""
Identifies if a token opens a subclause
Requres atok to be properly initialised
"""
function is_opener(atok::AnnotatedToken)
    haskey(subclause_markers, exactkind(atok)) && !isnull(atok.buddy)
end


"""
Identifies if a token closes a subclause
Requres atok to be properly initialised
"""
function is_closer(atok::AnnotatedToken)
    exactkind(atok) ∈ values(subclause_markers)  && !isnull(atok.buddy)
end

is_closer

In [6]:

function Base.show(io::IO, atok::AnnotatedToken)
    start_r, start_c = startpos(atok)
    end_r, end_c = endpos(atok)
    
    
    
    pos_str = string(start_r, ",", start_c, "-", end_r,   ",", end_c, ": ")
    print(io, rpad(pos_str, 21, " "))
    
    str = kind(atok) == Tokens.ENDMARKER ? "" : untokenize(atok)
    show(io, str)
    print(io, "   ", exactkind(atok), " (",lowercase(string(kind(atok))),")","\t")
    
    is_ender(atok) && print(io, "| ender ")
    
    if is_closer(atok)
        print(io, "| closer ↑(")
        show(io,token(get(atok.buddy)))
        print(io, ")↑ ")
    end
    
    if is_opener(atok)
        print(io, "| opener ↓(")
        show(io,token(get(atok.buddy)))
        print(io, ")↓ ")
    end
    
    println(io)
end

- **New line** always ends expressions
 - Unless Immediately within a block opened by `(`
 - Or where the last nonwhitespace token was binary operator (including a `,`?)

In [7]:
isnewline(tok) = kind(tok)==Tokens.WHITESPACE && '\n' ∈ untokenize(tok)


isnewline (generic function with 1 method)

In [8]:
@test isnewline(tokens[end-1])

[1m[32mTest Passed
[0m  Expression: isnewline(tokens[end - 1])

In [9]:
function annotate(toks)
    
    open_subclauses = AnnotatedToken[]
    has_open() = length(open_subclauses) > 0
    last_open_exactkind() = exactkind(open_subclauses[end])
    is_next_closer(atok) = has_open() && exactkind(atok) == subclause_markers[last_open_exactkind()]
    
    in_paren() = has_open() && last_open_exactkind() == Tokens.LPAREN
    in_square() = has_open() && last_open_exactkind() == Tokens.LSQUARE
    

    
    
    
    history = SVector(Tokens.ENDMARKER, Tokens.ENDMARKER)
    function add_history!(atok)
        keep = true
        keep &= kind(atok) != Tokens.COMMENT
        keep &= kind(atok) != Tokens.WHITESPACE 
        keep |= isnewline(atok) && history[end] != Tokens.WHITESPACE
        
        if keep
            history = @SVector([shift(history)..., kind(atok)])
        end
    end
    
    am_continuing_operation() = history[end]==Tokens.OP && history[end-1] ∈ identifier_phrase_end_kinds

    
    function handle_ifclose!(atok)
        if  is_next_closer(atok)
            opener = pop!(open_subclauses)
            opener.buddy = atok
            atok.buddy = opener
        end
    end
    
    function handle_ifopen!(atok)
        if haskey(subclause_markers, exactkind(atok))
            push!(open_subclauses, atok)
        end
    end
    
    
    
    "Handless expression enders, (this excludes closers)."
    function handle_expression_enders!(atok)
        is_ender = ((!in_paren() &&  !am_continuing_operation() && isnewline(atok)) 
                        || exactkind(atok) == Tokens.COMMA    
                        || exactkind(atok) == Tokens.SEMICOLON)    
        if is_ender
            atok.ender = true
        end
    end
    
    
    outstream = AnnotatedToken[]
    for tok in toks
        atok = AnnotatedToken(tok)
        push!(outstream, atok)
                
        handle_ifopen!(atok)
        handle_ifclose!(atok)
        handle_expression_enders!(atok)
        
        add_history!(atok)         
    end
    outstream
end

annotate (generic function with 1 method)

In [10]:
atokens = annotate(tokens)
show.(atokens);

1,1-1,1:             "a"   IDENTIFIER (identifier)	
1,2-1,2:             "="   EQ (op)	
1,3-1,3:             "1"   INTEGER (integer)	
1,4-2,0:             "\n"   WHITESPACE (whitespace)	| ender 
2,1-2,1:             "a"   IDENTIFIER (identifier)	
2,2-2,2:             "+"   PLUS (op)	
2,3-2,3:             "3"   INTEGER (integer)	
2,4-3,0:             "\n"   WHITESPACE (whitespace)	| ender 
3,1-3,1:             "c"   IDENTIFIER (identifier)	
3,2-3,2:             "="   EQ (op)	
3,3-3,3:             "a"   IDENTIFIER (identifier)	
3,4-3,4:             "+"   PLUS (op)	
3,5-3,5:             "4"   INTEGER (integer)	
3,6-3,6:             ";"   SEMICOLON (semicolon)	| ender 
3,7-3,7:             " "   WHITESPACE (whitespace)	
3,8-3,8:             "b"   IDENTIFIER (identifier)	
3,9-3,9:             "="   EQ (op)	
3,10-3,10:           "4"   INTEGER (integer)	
3,11-5,0:            "\n\n"   WHITESPACE (whitespace)	| ender 
5,1-5,10:            "#A comment"   COMMENT (comment)	
5,11-7,0:            "

In [11]:
""" \n    "   WHITESPACE (whitespace)"""|> length

29

In [12]:
show.((tokenize("""
(a +
b)
{a +
b}
[a +
b]


""") |> collect ) |> annotate);

1,1-1,1:             "("   LPAREN (lparen)	| opener ↓(2,2-2,2:   RPAREN	")")↓ 
1,2-1,2:             "a"   IDENTIFIER (identifier)	
1,3-1,3:             " "   WHITESPACE (whitespace)	
1,4-1,4:             "+"   PLUS (op)	
1,5-2,0:             "\n"   WHITESPACE (whitespace)	
2,1-2,1:             "b"   IDENTIFIER (identifier)	
2,2-2,2:             ")"   RPAREN (rparen)	| closer ↑(1,1-1,1:   LPAREN	"(")↑ 
2,3-3,0:             "\n"   WHITESPACE (whitespace)	| ender 
3,1-3,1:             "{"   LBRACE (lbrace)	| opener ↓(4,2-4,2:   RBRACE	"}")↓ 
3,2-3,2:             "a"   IDENTIFIER (identifier)	
3,3-3,3:             " "   WHITESPACE (whitespace)	
3,4-3,4:             "+"   PLUS (op)	
3,5-4,0:             "\n"   WHITESPACE (whitespace)	| ender 
4,1-4,1:             "b"   IDENTIFIER (identifier)	
4,2-4,2:             "}"   RBRACE (rbrace)	| closer ↑(3,1-3,1:   LBRACE	"{")↑ 
4,3-5,0:             "\n"   WHITESPACE (whitespace)	| ender 
5,1-5,1:             "["   LSQUARE (lsquare)	| opener ↓(6,2-

In [13]:
@enum ClauseKind ExternalScope GlobalScope HardLocalScope SoftLocalScope NonScopingBlock NonScopingClause  Expression

const clause_type = Dict(
    Tokens.MODULE => GlobalScope,
    Tokens.BAREMODULE => GlobalScope,

    Tokens.LBRACE => NonScopingClause,
    Tokens.LPAREN => NonScopingClause,
    Tokens.LSQUARE => NonScopingClause,
    Tokens.QUOTE => NonScopingClause,

    Tokens.BEGIN => NonScopingBlock,
    Tokens.IF => NonScopingBlock,

    Tokens.FOR => SoftLocalScope,
    Tokens.WHILE => SoftLocalScope,
    Tokens.TRY => SoftLocalScope,
    
    
    
    Tokens.TYPE => HardLocalScope,    
    Tokens.IMMUTABLE => HardLocalScope,
    Tokens.DO => HardLocalScope,
    Tokens.FUNCTION => HardLocalScope,
)

@assert(Set(keys(subclause_markers)) == Set(keys(clause_types)))

LoadError: UndefVarError: clause_types not defined

In [14]:
type CstNode{T}
    subs::Vector
    CstNode() = new(Any[])
end

@delegate_oneField(CstNode, subs, [getindex, setindex!, pop!, push!, endof])


endof (generic function with 15 methods)

In [30]:
function tree_up(annotated_tokens)
    tree = CstNode{ExternalScope}()
    scope_stack = CstNode[tree]
    
    function enter_child!(node)
        push!(scope_stack[end], node) #To keep in the tree
        push!(scope_stack, node) #To track scope
    end
    function exit_child!()
        pop!(scope_stack)
    end
    
    
    function add_current!(atok)
        push!(scope_stack[end], atok)
    end
    
    
    enter_child!(CstNode{GlobalScope}()) #True Globel;
    
    
    awaited_closers = AnnotatedToken[]
    is_awaited(atok) = length(awaited_closers) > 0 && awaited_closers[end]==atok
    for atok in annotated_tokens
        if is_opener(atok)
            push!(awaited_closers, get(atok.buddy))
            node_type = clause_type[exactkind(atok)]
            enter_child!(CstNode{node_type}())
        end
        
        add_current!(atok)
        
        if is_awaited(atok)
           pop!(awaited_closers)
           exit_child!() 
        end
                
        #print(untokenize(atok), " ")
        #is_ender(atok) && println("\n┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅┅")
    end
    tree
end



tree_up (generic function with 1 method)

In [31]:
AbstractTrees.printnode{T}(io::IO, f::CstNode{T}) = print(io, T)
AbstractTrees.children(node::CstNode) = node.subs



In [32]:


print_tree(STDOUT, tree_up(atokens))

ExternalScope
└─ GlobalScope
   ├─ 1,1-1,1:             "a"   IDENTIFIER (identifier)	
   │  
   ├─ 1,2-1,2:             "="   EQ (op)	
   │  
   ├─ 1,3-1,3:             "1"   INTEGER (integer)	
   │  
   ├─ 1,4-2,0:             "\n"   WHITESPACE (whitespace)	| ender 
   │  
   ├─ 2,1-2,1:             "a"   IDENTIFIER (identifier)	
   │  
   ├─ 2,2-2,2:             "+"   PLUS (op)	
   │  
   ├─ 2,3-2,3:             "3"   INTEGER (integer)	
   │  
   ├─ 2,4-3,0:             "\n"   WHITESPACE (whitespace)	| ender 
   │  
   ├─ 3,1-3,1:             "c"   IDENTIFIER (identifier)	
   │  
   ├─ 3,2-3,2:             "="   EQ (op)	
   │  
   ├─ 3,3-3,3:             "a"   IDENTIFIER (identifier)	
   │  
   ├─ 3,4-3,4:             "+"   PLUS (op)	
   │  
   ├─ 3,5-3,5:             "4"   INTEGER (integer)	
   │  
   ├─ 3,6-3,6:             ";"   SEMICOLON (semicolon)	| ender 
   │  
   ├─ 3,7-3,7:             " "   WHITESPACE (whitespace)	
   │  
   ├─ 3,8-3,8:             "b"   IDENTIFIER (ident

In [18]:
?seek

search: [1ms[22m[1me[22m[1me[22m[1mk[22m [1ms[22m[1me[22m[1me[22m[1mk[22mend [1ms[22m[1me[22m[1me[22m[1mk[22mstart Par[1ms[22m[1me[22m[1mE[22mrror [1ms[22m[1me[22mt[1me[22mnv [1ms[22m[1me[22ml[1me[22mct [1ms[22m[1me[22ml[1me[22mct! [1ms[22m[1me[22ml[1me[22mctperm



```
seek(s, pos)
```

Seek a stream to the given position.


In [19]:
?parse

search: [1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22m [1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22mip [1mP[22m[1ma[22m[1mr[22m[1ms[22m[1me[22mError s[1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22m s[1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22mvec S[1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22mVector S[1mp[22m[1ma[22m[1mr[22m[1ms[22m[1me[22mArrays



```
parse(str, start; greedy=true, raise=true)
```

Parse the expression string and return an expression (which could later be passed to eval for execution). `start` is the index of the first character to start parsing. If `greedy` is `true` (default), `parse` will try to consume as much input as it can; otherwise, it will stop as soon as it has parsed a valid expression. Incomplete but otherwise syntactically valid expressions will return `Expr(:incomplete, "(error message)")`. If `raise` is `true` (default), syntax errors other than incomplete expressions will raise an error. If `raise` is `false`, `parse` will return an expression that will raise an error upon evaluation.

```
parse(str; raise=true)
```

Parse the expression string greedily, returning a single expression. An error is thrown if there are additional characters after the first expression. If `raise` is `true` (default), syntax errors will raise an error; otherwise, `parse` will return an expression that will raise an error upon evaluation.

```
parse(type, str, [base])
```

Parse a string as a number. If the type is an integer type, then a base can be specified (the default is 10). If the type is a floating point type, the string is parsed as a decimal floating point number. If the string does not contain a valid number, an error is raised.


In [20]:


immutable Variable

immutable Scope <:
    vars_declared::Set{}
end

LoadError: syntax: incomplete: "immutable" at In[20]:3 requires end