Skip to content

Commit

Permalink
Remove html_entities dependency and use built-in entities module (#434)
Browse files Browse the repository at this point in the history

* WIP: using built in code for decode

* Fix encoding/decoding

* Remove html_entities as dependency

Also remove old unused deps.

* Change "generate_entities" mix task to use another file

This is because we now have some rules in the main file.

* Add tests to Entities module

* Better organize
  • Loading branch information
philss committed Nov 3, 2022
1 parent 8200b59 commit 48168b3
Show file tree
Hide file tree
Showing 9 changed files with 2,375 additions and 2,248 deletions.
2,301 changes: 65 additions & 2,236 deletions lib/floki/entities.ex

Large diffs are not rendered by default.

2,242 changes: 2,242 additions & 0 deletions lib/floki/entities/codepoints.ex

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions lib/floki/html/tokenizer.ex
Expand Up @@ -2576,7 +2576,7 @@ defmodule Floki.HTML.Tokenizer do
when c == ?; or is_letter(c) or
is_digit(c) do
buffer = IO.chardata_to_string([s.buffer | [c]])
candidate = Floki.Entities.get(buffer)
candidate = Floki.Entities.Codepoints.get(buffer)

charref_state =
if candidate != [] do
Expand Down Expand Up @@ -2675,7 +2675,7 @@ defmodule Floki.HTML.Tokenizer do

defp character_buffer(%State{charref_state: %CharrefState{candidate: candidate}, buffer: buffer}) do
if candidate do
Floki.Entities.get(candidate)
Floki.Entities.Codepoints.get(candidate)
else
buffer
end
Expand Down
2 changes: 1 addition & 1 deletion lib/floki/raw_html.ex
Expand Up @@ -31,7 +31,7 @@ defmodule Floki.RawHTML do
else: @default_self_closing_tags
end

@encoder &HtmlEntities.encode/1
@encoder &Floki.Entities.encode/1

def raw_html(html_tree, options) do
encoder =
Expand Down
6 changes: 3 additions & 3 deletions lib/mix/tasks/generate_entities.ex
Expand Up @@ -2,7 +2,7 @@ defmodule Mix.Tasks.GenerateEntities do
@shortdoc "Generate the entities module"

@json_entities_path "priv/entities.json"
@destination_module_path "lib/floki/entities.ex"
@destination_module_path "lib/floki/entities/codepoints.ex"

use Mix.Task

Expand All @@ -14,7 +14,7 @@ defmodule Mix.Tasks.GenerateEntities do
{:ok, json} = Jason.decode(content)

headers = """
defmodule Floki.Entities do
defmodule Floki.Entities.Codepoints do
# This file was generated by "Mix.Tasks.GenerateEntities"
@moduledoc false
Expand Down Expand Up @@ -52,6 +52,6 @@ defmodule Mix.Tasks.GenerateEntities do

File.write!(@destination_module_path, contents)

Mix.shell().info("Entities module is located in lib/floki/entities.ex")
Mix.shell().info("Entities module is located in lib/floki/entities/codepoints.ex")
end
end
1 change: 0 additions & 1 deletion mix.exs
Expand Up @@ -56,7 +56,6 @@ defmodule Floki.Mixfile do
|> Enum.map(fn name -> Keyword.fetch!(parsers_deps, name) end)

[
{:html_entities, "~> 0.5.0"},
{:jason, "~> 1.1", only: [:dev, :test, :docs]},
{:earmark, "~> 1.2", only: :dev},
{:ex_doc, "~> 0.29.0", only: :dev, runtime: false},
Expand Down
3 changes: 0 additions & 3 deletions mix.lock
Expand Up @@ -13,15 +13,12 @@
"fast_html": {:hex, :fast_html, "2.0.5", "c61760340606c1077ff1f196f17834056cb1dd3d5cb92a9f2cabf28bc6221c3c", [:make, :mix], [{:elixir_make, "~> 0.4", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:nimble_pool, "~> 0.2.0", [hex: :nimble_pool, repo: "hexpm", optional: false]}], "hexpm", "605f4f4829443c14127694ebabb681778712ceecb4470ec32aa31012330e6506"},
"file_system": {:hex, :file_system, "0.2.10", "fb082005a9cd1711c05b5248710f8826b02d7d1784e7c3451f9c1231d4fc162d", [:mix], [], "hexpm", "41195edbfb562a593726eda3b3e8b103a309b733ad25f3d642ba49696bf715dc"},
"html5ever": {:hex, :html5ever, "0.14.0", "a42762469dcba564a8986dce688f911913dd056221bcf11464bb14c6f262e29a", [:mix], [{:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.5.2", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "09b73075a8a1b0936f0484ec34309332e59d74549c94f75ed41624259950092d"},
"html_entities": {:hex, :html_entities, "0.5.2", "9e47e70598da7de2a9ff6af8758399251db6dbb7eebe2b013f2bbd2515895c3c", [:mix], [], "hexpm", "c53ba390403485615623b9531e97696f076ed415e8d8058b1dbaa28181f4fdcc"},
"jason": {:hex, :jason, "1.4.0", "e855647bc964a44e2f67df589ccf49105ae039d4179db7f6271dfd3843dc27e6", [:mix], [{:decimal, "~> 1.0 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: true]}], "hexpm", "79a3791085b2a0f743ca04cec0f7be26443738779d09302e01318f97bdb82121"},
"makeup": {:hex, :makeup, "1.1.0", "6b67c8bc2882a6b6a445859952a602afc1a41c2e08379ca057c0f525366fc3ca", [:mix], [{:nimble_parsec, "~> 1.2.2 or ~> 1.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "0a45ed501f4a8897f580eabf99a2e5234ea3e75a4373c8a52824f6e873be57a6"},
"makeup_elixir": {:hex, :makeup_elixir, "0.16.0", "f8c570a0d33f8039513fbccaf7108c5d750f47d8defd44088371191b76492b0b", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}, {:nimble_parsec, "~> 1.2.3", [hex: :nimble_parsec, repo: "hexpm", optional: false]}], "hexpm", "28b2cbdc13960a46ae9a8858c4bebdec3c9a6d7b4b9e7f4ed1502f8159f338e7"},
"makeup_erlang": {:hex, :makeup_erlang, "0.1.1", "3fcb7f09eb9d98dc4d208f49cc955a34218fc41ff6b84df7c75b3e6e533cc65f", [:mix], [{:makeup, "~> 1.0", [hex: :makeup, repo: "hexpm", optional: false]}], "hexpm", "174d0809e98a4ef0b3309256cbf97101c6ec01c4ab0b23e926a9e17df2077cbb"},
"nimble_parsec": {:hex, :nimble_parsec, "1.2.3", "244836e6e3f1200c7f30cb56733fd808744eca61fd182f731eac4af635cc6d0b", [:mix], [], "hexpm", "c8d789e39b9131acf7b99291e93dae60ab48ef14a7ee9d58c6964f59efb570b0"},
"nimble_pool": {:hex, :nimble_pool, "0.2.4", "1db8e9f8a53d967d595e0b32a17030cdb6c0dc4a451b8ac787bf601d3f7704c3", [:mix], [], "hexpm", "367e8071e137b787764e6a9992ccb57b276dc2282535f767a07d881951ebeac6"},
"rustler": {:hex, :rustler, "0.24.0", "b8362a2fee1c9d2c7373b0bfdc98f75bbc02864efcec50df173fe6c4f72d4cc4", [:mix], [{:jason, "~> 1.0", [hex: :jason, repo: "hexpm", optional: false]}, {:toml, "~> 0.6", [hex: :toml, repo: "hexpm", optional: false]}], "hexpm", "2773167fca68a6525822ad977b41368ea3c2af876c42ebaa7c9d6bb69b67f1ce"},
"rustler_precompiled": {:hex, :rustler_precompiled, "0.5.2", "7619fff0309a012eac7441993da4f6e257022bd456449a366756696a9a18fb19", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "4e3716fd7cf6fbb806a9ed2b1449c987cfe578b24e3deb3ca4b8645638cc644c"},
"statistex": {:hex, :statistex, "1.0.0", "f3dc93f3c0c6c92e5f291704cf62b99b553253d7969e9a5fa713e5481cd858a5", [:mix], [], "hexpm", "ff9d8bee7035028ab4742ff52fc80a2aa35cece833cf5319009b52f1b5a86c27"},
"toml": {:hex, :toml, "0.6.2", "38f445df384a17e5d382befe30e3489112a48d3ba4c459e543f748c2f25dd4d1", [:mix], [], "hexpm", "d013e45126d74c0c26a38d31f5e8e9b83ea19fc752470feb9a86071ca5a672fa"},
}
4 changes: 2 additions & 2 deletions src/floki_mochi_html.erl
Expand Up @@ -697,8 +697,8 @@ tokenize_charref_raw(Bin, S=#decoder{offset=O}, Start) ->
%% but the functionality below is equivalent;
<<_:Start/binary, Raw:Len/binary, _/binary>> = Bin,

case 'Elixir.HtmlEntities':decode(<<$&, Raw/binary, $;>>) of
<<CP/utf8>> ->
case 'Elixir.Floki.Entities':decode(<<$&, Raw/binary, $;>>) of
{ok, <<CP/utf8>>} ->
{CP, ?INC_COL(S)};
_ ->
throw(invalid_charref)
Expand Down
60 changes: 60 additions & 0 deletions test/floki/entities_test.exs
@@ -0,0 +1,60 @@
defmodule Floki.EntitiesTest do
  # Covers the two public entry points of Floki.Entities as used in this file:
  # encode/1 (escaping of the five HTML-significant characters) and decode/1
  # (named and numeric character references).
  use ExUnit.Case, async: true

  alias Floki.Entities

  describe "encode/1" do
    test "encode single-quote" do
      assert Entities.encode(~s(')) == "&#39;"
    end

    test "encode double-quote" do
      assert Entities.encode(~s(")) == "&quot;"
    end

    test "ampersand" do
      assert Entities.encode("&") == "&amp;"
    end

    test "encode less-than sign" do
      assert Entities.encode("<") == "&lt;"
    end

    test "encode greater-than sign" do
      assert Entities.encode(">") == "&gt;"
    end

    test "does not encode others" do
      # Text without any of the escaped characters must come back unchanged.
      Enum.each(["!", "?", "aaaa"], fn untouched ->
        assert Entities.encode(untouched) == untouched
      end)
    end
  end

  describe "decode/1" do
    test "decode all known entities" do
      # The JSON file's keys are the entity names fed to decode/1.
      raw_json = File.read!("priv/entities.json")
      {:ok, known} = Jason.decode(raw_json)

      known
      |> Map.keys()
      |> Enum.each(&assert_decodes_to_valid_string/1)
    end

    test "decode some numeric references" do
      numeric_refs =
        ~w(&#x202B; &#1585; &#1602; &#1605; &#1575; &#1604; &#1607; &#1575; &#1578; &#1601; &#x202E;)

      Enum.each(numeric_refs, &assert_decodes_to_valid_string/1)
    end

    test "returns not found for unknown entities" do
      Enum.each(["&pastel;", "&churrasco;"], fn unknown ->
        assert {:error, :not_found} = Entities.decode(unknown)
      end)
    end
  end

  # Asserts that `entity` decodes successfully and that the decoded binary is
  # valid UTF-8.
  defp assert_decodes_to_valid_string(entity) do
    assert {:ok, decoded} = Entities.decode(entity)
    assert String.valid?(decoded)
  end
end

0 comments on commit 48168b3

Please sign in to comment.