Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add a robust representation of selectors
This is a huge refactor of the `Floki.find/2` function that enables more complex searches using a mix of selectors. You can mix selectors like you would normally do in other tools like jQuery, or when applying rules using CSS selectors. Examples of queries now supported: - "a.foo" - ".foo.bar" - ".baz[data='something']" - "[title][href$='.html']" - "a b.foo c" To achieve this, it was necessary to write a tokenizer and a parser for the given selector. It's quite easy to understand after reading this article by Andrea Leopardi (@whatyouhide) about tokenizing and parsing in Elixir: http://andrealeopardi.com/posts/tokenizing-and-parsing-in-elixir-using-leex-and-yecc/ The tokenizer partially covers the CSS3 selectors specification, which you can find at http://www.w3.org/TR/css3-selectors/ Known issues: - There is no support for pseudo-selectors; - The only combinator supported is the descendant combinator; - If there is a group of selectors in the same query and two selectors match the same node, this node will appear twice in the resulting list. Closes #18 and #20.
- Loading branch information
Showing
14 changed files
with
682 additions
and
266 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ erl_crash.dump | |
*.swp | ||
/doc | ||
*.beam | ||
/src/*_lexer.erl |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
defmodule Floki.AttributeSelector do
  @moduledoc """
  Represents a single attribute condition of a selector (for example
  `[title]`, `[data='something']` or `[href$='.html']`) and checks it
  against the attribute list of a node.

  Fields:

    * `:match_type` - `nil` for a presence-only check, or one of `:equal`,
      `:includes`, `:dash_match`, `:prefix_match`, `:sufix_match`,
      `:substring_match`
    * `:attribute` - name of the attribute to look up
    * `:value` - value to compare against (`nil` for presence checks)

  Attributes are expected as a list of `{name, value}` tuples.
  """

  alias Floki.AttributeSelector

  defstruct match_type: nil, attribute: nil, value: nil

  # Characters that separate the tokens of a whitespace-separated attribute
  # value (space, tab, line feed, form feed, carriage return).
  @whitespace [" ", "\t", "\n", "\f", "\r"]

  @doc """
  Returns `true` when `attributes` satisfies the given attribute selector.

  A node that does not carry the attribute at all never satisfies a
  value-based selector, even when the selector value is the empty string.
  Following the CSS3 selectors rules, an empty selector value never matches
  for `:includes`, `:prefix_match`, `:sufix_match` and `:substring_match`.

  Raises `FunctionClauseError` for an unknown `:match_type`.
  """
  def match?(attributes, %AttributeSelector{match_type: nil, value: nil} = s) do
    attribute_present?(s.attribute, attributes)
  end

  def match?(attributes, %AttributeSelector{match_type: :equal} = s) do
    # Comparing against the tagged result keeps a missing attribute distinct
    # from an attribute whose value is "".
    fetch_value(s.attribute, attributes) == {:ok, s.value}
  end

  def match?(attributes, %AttributeSelector{match_type: :includes} = s) do
    case fetch_value(s.attribute, attributes) do
      {:ok, value} ->
        # `trim: true` drops empty tokens, so an empty selector value can
        # never be found among them.
        s.value in String.split(value, @whitespace, trim: true)

      :error ->
        false
    end
  end

  def match?(attributes, %AttributeSelector{match_type: :dash_match} = s) do
    case fetch_value(s.attribute, attributes) do
      {:ok, value} ->
        value == s.value || String.starts_with?(value, "#{s.value}-")

      :error ->
        false
    end
  end

  def match?(attributes, %AttributeSelector{match_type: :prefix_match} = s) do
    match_non_empty(attributes, s, &String.starts_with?/2)
  end

  # NOTE: the atom is spelled `:sufix_match` on purpose; it is part of the
  # struct's contract with whatever builds these selectors.
  def match?(attributes, %AttributeSelector{match_type: :sufix_match} = s) do
    match_non_empty(attributes, s, &String.ends_with?/2)
  end

  def match?(attributes, %AttributeSelector{match_type: :substring_match} = s) do
    match_non_empty(attributes, s, &String.contains?/2)
  end

  # Applies `matcher.(attribute_value, selector_value)` when the attribute is
  # present and the selector value is not empty; otherwise returns `false`.
  defp match_non_empty(_attributes, %AttributeSelector{value: ""}, _matcher), do: false

  defp match_non_empty(attributes, %AttributeSelector{} = s, matcher) do
    case fetch_value(s.attribute, attributes) do
      {:ok, value} -> matcher.(value, s.value)
      :error -> false
    end
  end

  # Looks up the first attribute named `attr_name`.
  # Returns `{:ok, value}` or `:error` when the attribute is absent.
  defp fetch_value(attr_name, attributes) do
    case List.keyfind(attributes, attr_name, 0) do
      {_attr_name, value} -> {:ok, value}
      nil -> :error
    end
  end

  # True when any attribute in the list is named `name`.
  defp attribute_present?(name, attributes) do
    Enum.any?(attributes, fn {k, _v} -> k == name end)
  end
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
defmodule Floki.Combinator do
  # Links a selector to the selector that follows it in a query.
  #
  #   * match_type - kind of relationship between the two selectors
  #   * selector   - the selector on the right-hand side of the combinator
  #
  # Both fields default to nil.
  defstruct [:match_type, :selector]
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,102 +1,60 @@ | ||
defmodule Floki.Finder do | ||
@moduledoc false | ||
|
||
import Floki.Matchers | ||
|
||
def find(html, selector) when is_binary(html) do | ||
Floki.Parser.parse(html) |> do_find(selector) | ||
end | ||
|
||
def find(html_tree, selector), do: do_find(html_tree, selector) | ||
|
||
def attribute_values(element, attr_name) when is_tuple(element) do | ||
attribute_values([element], attr_name) | ||
end | ||
def attribute_values(elements, attr_name) do | ||
values = Enum.reduce elements, [], fn({_, attributes, _}, acc) -> | ||
case attribute_match?(attributes, attr_name) do | ||
{_attr_name, value} -> | ||
[value|acc] | ||
_ -> | ||
acc | ||
end | ||
alias Floki.Selector | ||
alias Floki.SelectorParser | ||
alias Floki.SelectorTokenizer | ||
|
||
def find(html_tree, selector) do | ||
selectors = Enum.map String.split(selector, ","), fn(s) -> | ||
SelectorTokenizer.tokenize(s) | ||
|> SelectorParser.parse | ||
end | ||
|
||
Enum.reverse(values) | ||
html_tree | ||
|> transverse(selectors, []) | ||
|> Enum.reverse | ||
end | ||
|
||
defp do_find(html_tree, selector) when is_tuple(selector) do | ||
{:ok, nodes} = find_by_selector(selector, html_tree, &attr_matcher/3, {:ok, []}) | ||
Enum.reverse(nodes) | ||
defp transverse(_, [], acc), do: acc | ||
defp transverse({}, _, acc), do: acc | ||
defp transverse([], _, acc), do: acc | ||
defp transverse(string, _, acc) when is_binary(string), do: acc | ||
defp transverse({:comment, _comment}, _, acc), do: acc | ||
defp transverse({:pi, _xml, _xml_attrs}, _, acc), do: acc | ||
defp transverse([head_node|tail_nodes], selectors, acc) do | ||
acc = transverse(head_node, selectors, acc) | ||
transverse(tail_nodes, selectors, acc) | ||
end | ||
|
||
defp do_find(html_tree, selector) do | ||
tag_attr_val_regex = ~r/(?'tag'.+)\[(?'attr'.+)=(?'val'.+)\]/ | ||
attr_val_regex = ~r/\[(?'attr'.+)=(?'val'.+)\]/ | ||
|
||
cond do | ||
String.contains?(selector, ",") -> | ||
selectors = String.split(selector, ",") | ||
|
||
Enum.reduce selectors, [], fn(selector, acc) -> | ||
selector = String.strip(selector) | ||
|
||
nodes = do_find(html_tree, selector) | ||
|
||
unless is_list(nodes), do: nodes = [nodes] | ||
|
||
Enum.concat(acc, nodes) | ||
end | ||
String.contains?(selector, "\s") -> | ||
descendent_selector = String.split(selector) | ||
|
||
Enum.reduce descendent_selector, html_tree, fn(selector, tree) -> | ||
do_find(tree, selector) | ||
end | ||
String.starts_with?(selector, ".") -> | ||
"." <> class = selector | ||
{:ok, nodes} = find_by_selector(class, html_tree, &class_matcher/3, {:ok, []}) | ||
|
||
Enum.reverse(nodes) | ||
String.starts_with?(selector, "#") -> | ||
"#" <> id = selector | ||
{_status, nodes} = find_by_selector(id, html_tree, &id_matcher/3, {:ok, []}) | ||
|
||
List.first(nodes) | ||
Regex.match?(attr_val_regex, selector) -> | ||
%{"attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) | ||
{:ok, nodes} = find_by_selector({attr, val}, html_tree, &attr_matcher/3, {:ok, []}) | ||
|
||
Enum.reverse(nodes) | ||
Regex.match?(tag_attr_val_regex, selector) -> | ||
%{"tag" => tag, "attr" => attr, "val" => val} = Regex.named_captures(attr_val_regex, selector) | ||
{:ok, nodes} = find_by_selector({tag, attr, val}, html_tree, &attr_matcher/3, {:ok, []}) | ||
|
||
Enum.reverse(nodes) | ||
true -> | ||
{:ok, nodes} = find_by_selector(selector, html_tree, &tag_matcher/3, {:ok, []}) | ||
|
||
Enum.reverse(nodes) | ||
end | ||
defp transverse(node, [head_selector|tail_selectors], acc) do | ||
acc = transverse(node, head_selector, acc) | ||
transverse(node, tail_selectors, acc) | ||
end | ||
defp transverse({_, _, children_nodes} = node, selector = %Selector{combinator: nil}, acc) do | ||
acc = | ||
case Selector.match?(node, selector) do | ||
true -> | ||
[node|acc] | ||
false -> | ||
acc | ||
end | ||
|
||
defp find_by_selector(_selector, {}, _, acc), do: acc | ||
defp find_by_selector(_selector, [], _, acc), do: acc | ||
defp find_by_selector(_selector, _, _, {:done, nodes}), do: {:done, nodes} | ||
defp find_by_selector(_selector, tree, _, acc) when is_binary(tree), do: acc | ||
defp find_by_selector(selector, [h|t], matcher, acc) do | ||
acc = find_by_selector(selector, h, matcher, acc) | ||
find_by_selector(selector, t, matcher, acc) | ||
transverse(children_nodes, selector, acc) | ||
end | ||
# Ignore comments | ||
defp find_by_selector(_selector, {:comment, _comment}, _, acc), do: acc | ||
# Ignore XML document version | ||
defp find_by_selector(_selector, {:pi, _xml, _xml_attrs}, _, acc), do: acc | ||
defp find_by_selector(selector, node, matcher, acc) do | ||
{_, _, child_node} = node | ||
|
||
acc = matcher.(selector, node, acc) | ||
defp transverse({_, _, children_nodes} = node, selector = %Selector{combinator: combinator}, acc) do | ||
acc = | ||
case Selector.match?(node, selector) do | ||
true -> | ||
case combinator.match_type do | ||
:descendant -> | ||
transverse(children_nodes, combinator.selector, acc) | ||
other -> | ||
raise "Combinator of type \"#{other}\" not implemented yet" | ||
end | ||
false -> | ||
acc | ||
end | ||
|
||
find_by_selector(selector, child_node, matcher, acc) | ||
transverse(children_nodes, selector, acc) | ||
end | ||
end |
Oops, something went wrong.
62e24ff
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@philss hey, thanks for the mention :D
62e24ff
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@whatyouhide oh, thank you for the great post! It would be harder for me to implement this without your blog post! :)