Skip to content

Commit

Permalink
Add a robust representation of selectors
Browse files Browse the repository at this point in the history
This is a huge refactor in the `Floki.find/2` function that enables more
complex searches using a mix of selectors. You can mix selectors
like you would normally do in other tools like jQuery or to apply rules
using CSS selectors.

Examples of queries now supported:
- "a.foo"
- ".foo.bar"
- ".baz[data='something']"
- "[title][href$='.html']"
- "a b.foo c"

To achieve this, it was necessary to write a tokenizer and a parser for
the input selector. It's quite easy to understand after reading this
article by Andrea Leopardi (@whatyouhide) about tokenizing and parsing
in Elixir:
http://andrealeopardi.com/posts/tokenizing-and-parsing-in-elixir-using-leex-and-yecc/

The tokenizer partially covers the specs of CSS3 selectors, that you can
find at http://www.w3.org/TR/css3-selectors/

Known issues:
- There is no support for pseudo-selectors;
- The only combinator supported is descendant combinator;
- If there is a group of selectors in the same query, and two selectors
  match the same node, this node will appear twice in the resulting
  list.

Closes #18 and #20.
  • Loading branch information
philss committed Sep 17, 2015
1 parent 2466210 commit 62e24ff
Show file tree
Hide file tree
Showing 14 changed files with 682 additions and 266 deletions.
1 change: 1 addition & 0 deletions .gitignore
Expand Up @@ -6,3 +6,4 @@ erl_crash.dump
*.swp
/doc
*.beam
/src/*_lexer.erl
23 changes: 23 additions & 0 deletions CHANGELOG.md
Expand Up @@ -5,6 +5,29 @@ This project adheres to [Semantic Versioning](http://semver.org/).

## [Unreleased][unreleased]

### Added

- A robust representation of selectors in order to enable queries using a mix of selector types,
such as classes with attributes, attributes with types, classes with classes and so on.
Here is a list with examples of what is possible now:
- `Floki.find(html, "a.foo")`
- `Floki.find(html, "a.foo[data-action=post]")`
- `Floki.find(html, ".foo.bar")`
- `Floki.find(html, "a.foo[href$='.org']")`
Thanks to @licyeus for pointing out the [issue](https://github.com/philss/floki/issues/18)!

### Changed

- `Floki.find/2` will now return a list instead of a tuple when searching only by IDs.
From now on, Floki should always return the results inside a list, even if it's an ID match.

### Removed

- `Floki.find/2` does not accept tuples as selectors anymore.
This is because with the robust selectors representation, it won't be necessary to query directly using
tuples or other data structures rather than strings.


## [0.3.3] - 2015-08-23

### Fixed
Expand Down
96 changes: 58 additions & 38 deletions lib/floki.ex
Expand Up @@ -81,18 +81,13 @@ defmodule Floki do
Finds elements inside a HTML tree or string.
You can search by class, tag name or id.
It is possible to compose searches:
Floki.find(html_string, ".class")
|> Floki.find(".another-class-inside-small-scope")
## Examples
iex> Floki.find("<p><span class=hint>hello</span></p>", ".hint")
[{"span", [{"class", "hint"}], ["hello"]}]
iex> "<body><div id=important><div>Content</div></div></body>" |> Floki.find("#important")
{"div", [{"id", "important"}], [{"div", [], ["Content"]}]}
iex> Floki.find("<body><div id=important><div>Content</div></div></body>", "#important")
[{"div", [{"id", "important"}], [{"div", [], ["Content"]}]}]
iex> Floki.find("<p><a href='https://google.com'>Google</a></p>", "a")
[{"a", [{"href", "https://google.com"}], ["Google"]}]
Expand All @@ -101,10 +96,46 @@ defmodule Floki do

@spec find(binary | html_tree, binary) :: html_tree

def find(html, selector) do
  # A binary is treated as raw HTML and parsed into a tree first; any other
  # value is handed to the finder unchanged.
  tree = if is_binary(html), do: Floki.Parser.parse(html), else: html

  Finder.find(tree, selector)
end

@doc """
Returns the text nodes from a HTML tree.
By default, it will perform a deep search through the HTML tree.
You can disable deep search with the option `deep` assigned to false.
## Examples
iex> Floki.text("<div><span>hello</span> world</div>")
"hello world"
iex> Floki.text("<div><span>hello</span> world</div>", deep: false)
" world"
"""

@spec text(html_tree | binary) :: binary

def text(html, opts \\ [deep: true]) do
  # Accept either raw HTML (parsed here) or an already parsed tree.
  html_tree =
    case is_binary(html) do
      true -> parse(html)
      false -> html
    end

  # Read `:deep` with an explicit default: `opts[:deep]` is `nil` when the
  # caller passes an option list without that key (e.g. `[]`), which used to
  # raise a CaseClauseError instead of falling back to the deep search.
  search_strategy =
    case Keyword.get(opts, :deep, true) do
      true -> Floki.DeepText
      false -> Floki.FlatText
    end

  search_strategy.get(html_tree)
end

@doc """
Returns a list with attribute values for a given selector.
Expand All @@ -120,7 +151,7 @@ defmodule Floki do
def attribute(html, selector, attribute_name) do
html
|> find(selector)
|> Finder.attribute_values(attribute_name)
|> attribute_values(attribute_name)
end

@doc """
Expand All @@ -138,42 +169,31 @@ defmodule Floki do
def attribute(html_tree, attribute_name) when is_binary(html_tree) do
html_tree
|> parse
|> Finder.attribute_values(attribute_name)
|> attribute_values(attribute_name)
end
def attribute(elements, attribute_name) do
Finder.attribute_values(elements, attribute_name)
attribute_values(elements, attribute_name)
end

@doc """
Returns the text nodes from a HTML tree.
By default, it will perform a deep search through the HTML tree.
You can disable deep search with the option `deep` assigned to false.
## Examples
iex> Floki.text("<div><span>hello</span> world</div>")
"hello world"
iex> Floki.text("<div><span>hello</span> world</div>", deep: false)
" world"
"""

@spec text(html_tree | binary) :: binary

def text(html, opts \\ [deep: true]) do
html_tree =
case is_binary(html) do
true -> parse(html)
false -> html
defp attribute_values(element, attr_name) when is_tuple(element) do
attribute_values([element], attr_name)
end
defp attribute_values(elements, attr_name) do
values = Enum.reduce elements, [], fn({_, attributes, _}, acc) ->
case attribute_match?(attributes, attr_name) do
{_attr_name, value} ->
[value|acc]
_ ->
acc
end
end

search_strategy =
case opts[:deep] do
true -> Floki.DeepText
false -> Floki.FlatText
end
Enum.reverse(values)
end

search_strategy.get(html_tree)
defp attribute_match?(attributes, attribute_name) do
Enum.find attributes, fn({attr_name, _}) ->
attr_name == attribute_name
end
end
end
45 changes: 45 additions & 0 deletions lib/floki/attribute_selector.ex
@@ -0,0 +1,45 @@
defmodule Floki.AttributeSelector do
  @moduledoc """
  Represents a single attribute condition of a selector and checks it against
  the attribute list of a node.

  Fields:

    * `match_type` - `nil` for a presence test, or one of `:equal`,
      `:includes`, `:dash_match`, `:prefix_match`, `:sufix_match` and
      `:substring_match`;
    * `attribute` - the name of the attribute to look up;
    * `value` - the value to compare against (`nil` for a presence test).
  """

  alias Floki.AttributeSelector

  defstruct match_type: nil, attribute: nil, value: nil

  # Characters that separate the tokens of a whitespace-separated attribute
  # value: space, tab, line feed, carriage return and form feed.
  @whitespace [" ", "\t", "\n", "\r", "\f"]

  @doc """
  Returns `true` when `attributes`, a list of `{name, value}` tuples, satisfies
  the attribute selector `s`, and `false` otherwise.

  A selector that compares values never matches when the attribute is absent
  from `attributes`; only the presence test looks at the name alone.

  Raises `FunctionClauseError` when `s` has a `match_type` not listed in the
  module documentation.
  """
  def match?(attributes, s = %AttributeSelector{match_type: nil, value: nil}) do
    attribute_present?(s.attribute, attributes)
  end

  def match?(attributes, s = %AttributeSelector{match_type: :equal}) do
    match_value(s, attributes, fn value -> value == s.value end)
  end

  def match?(attributes, s = %AttributeSelector{match_type: :includes}) do
    match_value(s, attributes, fn value ->
      # Split on every whitespace separator, not only on a single space, and
      # drop empty tokens so that "foo\tbar" or "foo  bar" still yield the
      # tokens "foo" and "bar".
      value
      |> String.split(@whitespace, trim: true)
      |> Enum.member?(s.value)
    end)
  end

  def match?(attributes, s = %AttributeSelector{match_type: :dash_match}) do
    match_value(s, attributes, fn value ->
      value == s.value || String.starts_with?(value, "#{s.value}-")
    end)
  end

  def match?(attributes, s = %AttributeSelector{match_type: :prefix_match}) do
    match_value(s, attributes, &String.starts_with?(&1, s.value))
  end

  def match?(attributes, s = %AttributeSelector{match_type: :sufix_match}) do
    match_value(s, attributes, &String.ends_with?(&1, s.value))
  end

  def match?(attributes, s = %AttributeSelector{match_type: :substring_match}) do
    match_value(s, attributes, &String.contains?(&1, s.value))
  end

  # Applies `matcher` to the value of the selector's attribute. Returns `false`
  # when the attribute is absent, so that a missing attribute is never treated
  # as if it were present with an empty value.
  defp match_value(%AttributeSelector{attribute: attr_name}, attributes, matcher) do
    case Enum.find(attributes, fn {k, _v} -> k == attr_name end) do
      {_attr_name, value} -> matcher.(value)
      nil -> false
    end
  end

  # Tells whether an attribute called `name` exists, whatever its value.
  defp attribute_present?(name, attributes) do
    Enum.any?(attributes, fn {k, _v} -> k == name end)
  end
end
3 changes: 3 additions & 0 deletions lib/floki/combinator.ex
@@ -0,0 +1,3 @@
defmodule Floki.Combinator do
  @moduledoc """
  Plain data holder describing how one selector is chained to the next one.

  Both fields default to `nil`:

    * `match_type` - the kind of combination, e.g. `:descendant`;
    * `selector` - the selector to apply once the combinator is followed.
  """

  defstruct [:match_type, :selector]
end
134 changes: 46 additions & 88 deletions lib/floki/finder.ex
@@ -1,102 +1,60 @@
defmodule Floki.Finder do
@moduledoc false

import Floki.Matchers

def find(html, selector) when is_binary(html) do
Floki.Parser.parse(html) |> do_find(selector)
end

def find(html_tree, selector), do: do_find(html_tree, selector)

def attribute_values(element, attr_name) when is_tuple(element) do
attribute_values([element], attr_name)
end
def attribute_values(elements, attr_name) do
values = Enum.reduce elements, [], fn({_, attributes, _}, acc) ->
case attribute_match?(attributes, attr_name) do
{_attr_name, value} ->
[value|acc]
_ ->
acc
end
alias Floki.Selector
alias Floki.SelectorParser
alias Floki.SelectorTokenizer

def find(html_tree, selector) do
selectors = Enum.map String.split(selector, ","), fn(s) ->
SelectorTokenizer.tokenize(s)
|> SelectorParser.parse
end

Enum.reverse(values)
html_tree
|> transverse(selectors, [])
|> Enum.reverse
end

defp do_find(html_tree, selector) when is_tuple(selector) do
{:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []})
Enum.reverse(nodes)
defp transverse(_, [], acc), do: acc
defp transverse({}, _, acc), do: acc
defp transverse([], _, acc), do: acc
defp transverse(string, _, acc) when is_binary(string), do: acc
defp transverse({:comment, _comment}, _, acc), do: acc
defp transverse({:pi, _xml, _xml_attrs}, _, acc), do: acc
defp transverse([head_node|tail_nodes], selectors, acc) do
acc = transverse(head_node, selectors, acc)
transverse(tail_nodes, selectors, acc)
end

defp do_find(html_tree, selector) do
tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/
attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/

cond do
String.contains?(selector, ",") ->
selectors = String.split(selector, ",")

Enum.reduce selectors, [], fn(selector, acc) ->
selector = String.strip(selector)

nodes = do_find(html_tree, selector)

unless is_list(nodes), do: nodes = [nodes]

Enum.concat(acc, nodes)
end
String.contains?(selector, "\s") ->
descendent_selector = String.split(selector)

Enum.reduce descendent_selector, html_tree, fn(selector, tree) ->
do_find(tree, selector)
end
String.starts_with?(selector, ".") ->
"." <> class = selector
{:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []})

Enum.reverse(nodes)
String.starts_with?(selector, "#") ->
"#" <> id = selector
{_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []})

List.first(nodes)
Regex.match?(attr_val_regex, selector) ->
%{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector)
{:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []})

Enum.reverse(nodes)
Regex.match?(tag_attr_val_regex, selector) ->
%{"tag" => tag, "attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector)
{:ok, nodes} = find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []})

Enum.reverse(nodes)
true ->
{:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []})

Enum.reverse(nodes)
end
defp transverse(node, [head_selector|tail_selectors], acc) do
acc = transverse(node, head_selector, acc)
transverse(node, tail_selectors, acc)
end
defp transverse({_, _, children_nodes} = node, selector = %Selector{combinator: nil}, acc) do
acc =
case Selector.match?(node, selector) do
true ->
[node|acc]
false ->
acc
end

defp find_by_selector(_selector, {}, _, acc), do: acc
defp find_by_selector(_selector, [], _, acc), do: acc
defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes}
defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc
defp find_by_selector(selector, [h|t], matcher, acc) do
acc = find_by_selector(selector, h, matcher, acc)
find_by_selector(selector, t, matcher, acc)
transverse(children_nodes, selector, acc)
end
# Ignore comments
defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc
# Ignore XML document version
defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc
defp find_by_selector(selector, node, matcher, acc) do
{_, _, child_node} = node

acc = matcher.(selector, node, acc)
defp transverse({_, _, children_nodes} = node, selector = %Selector{combinator: combinator}, acc) do
acc =
case Selector.match?(node, selector) do
true ->
case combinator.match_type do
:descendant ->
transverse(children_nodes, combinator.selector, acc)
other ->
raise "Combinator of type \"#{other}\" not implemented yet"
end
false ->
acc
end

find_by_selector(selector, child_node, matcher, acc)
transverse(children_nodes, selector, acc)
end
end

2 comments on commit 62e24ff

@whatyouhide
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@philss hey, thanks for the mention :D

@philss
Copy link
Owner Author

@philss philss commented on 62e24ff Sep 17, 2015

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@whatyouhide oh, thank you for the great post! It would be harder for me to implement this without your blog post! :)

Please sign in to comment.