This repository has been archived by the owner on Apr 29, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
html.cljc
131 lines (123 loc) · 5.11 KB
/
html.cljc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
(ns oc.lib.html
"Functions related to processing HTML."
(:require [cuerdas.core :as str]
#?(:clj [jsoup.soup :as soup]))
#?(:clj (:import [org.owasp.html HtmlPolicyBuilder Sanitizers])))
(defn- thumbnail-elements
  "Finds every element of `body` that is a candidate thumbnail.

  Candidates are `img` elements that neither carry the `emojione` class nor
  have `data-media-type='image/gif'`, plus every `iframe`.

  Returns a map with:
    :elements - the matches as produced by the platform's selector call
                (`.select` on the parsed body on the JVM, `js/$` in
                ClojureScript)
    :count    - the number of matches"
  [body]
  (let [candidate-selector "img:not(.emojione):not([data-media-type='image/gif']), iframe"
        matches #?(:clj (.select (soup/parse body) candidate-selector)
                   ;; The body is wrapped in a <div> so the selector runs
                   ;; against a single root node.
                   ;; NOTE(review): js/$ is handed the raw markup here -
                   ;; confirm callers only pass already-sanitized bodies.
                   :cljs (js->clj (js/$ candidate-selector
                                        (js/$ (str "<div>" body "</div>")))))]
    {:elements matches
     :count #?(:clj (count matches)
               :cljs (.-length matches))}))
(defn- $el
  "Platform shim for an element handle: returns `el` untouched on the JVM and
  passes it to `js/$` in ClojureScript."
  [el]
  #?(:clj el, :cljs (js/$ el)))
(defn- tag-name
  "Returns the tag name of `el`: calls the `tagName` method on the JVM and reads
  the `tagName` property in ClojureScript."
  [el]
  #?(:clj (.tagName el), :cljs (.-tagName el)))
(defn- read-size
  "Reads a width/height attribute value as a number.

  JVM: parses the leading (optionally negative) integer of `size`, so \"120\"
  and \"120px\" both yield 120. When `size` is nil, empty or does not start
  with an integer (e.g. \"auto\") it returns `Double/NaN` instead of throwing.
  NaN makes every `<=` comparison false, which is how a missing attribute
  (`undefined`) behaves in the ClojureScript branch's numeric comparisons.

  ClojureScript: returns `size` unchanged and relies on JavaScript's numeric
  coercion at the call site."
  [size]
  #?(:clj
     ;; `some->>` guards against a nil attribute; `re-find` yields nil when
     ;; there is no leading integer, which previously reached Integer/parseInt.
     (if-let [digits (some->> size (re-find #"\A-?\d+"))]
       (Integer/parseInt digits)
       Double/NaN)
     :cljs
     size))
(defn first-body-thumbnail
  "Given an entry body, returns the first usable thumbnail, or nil when the body
  has none.

  Candidates come from `thumbnail-elements` and are examined in document order:
    - an `img` is accepted when its width/height (as read by `read-size`) pass
      the size check below; the result is
      {:type \"image\" :thumbnail <data-thumbnail, or src when that is blank>}
    - any other candidate (an `iframe`) is accepted as is; the result is
      {:type <data-media-type> :thumbnail <data-thumbnail>}

  This relies on the similarity between jQuery objects and jsoup elements:
  both expose an `attr` method."
  [html-body]
  (let [{els-count :count thumb-els :elements} (thumbnail-elements html-body)]
    ;; Walk the candidates by index and stop at the first one that yields a
    ;; thumbnail instead of visiting every element.
    (loop [el-num 0]
      (when (< el-num els-count)
        (let [el #?(:clj (nth thumb-els el-num) :cljs (aget thumb-els el-num))
              $node ($el el)
              thumbnail (if (= (str/lower (tag-name el)) "img")
                          (let [width (read-size (.attr $node "width"))
                                height (read-size (.attr $node "height"))]
                            ;; NOTE(review): for any non-negative sizes at least
                            ;; one side of this `or` holds, so it never rejects
                            ;; an extreme aspect ratio. `and` may have been
                            ;; intended - confirm before changing, since it
                            ;; would alter which images are picked.
                            (when (or (<= width (* height 2))
                                      (<= height (* width 2)))
                              {:type "image"
                               ;; jsoup's `attr` returns \"\" (truthy) for a
                               ;; missing attribute, so test for a non-empty
                               ;; value before falling back to `src`.
                               :thumbnail (or (not-empty (.attr $node "data-thumbnail"))
                                              (.attr $node "src"))}))
                          {:type (.attr $node "data-media-type")
                           :thumbnail (.attr $node "data-thumbnail")})]
          (if thumbnail
            thumbnail
            (recur (inc el-num))))))))
#?(:clj
   (def user-input-html-policy
     "OWASP HTML sanitizer policy applied to user-authored HTML (see `sanitize-html`).

     Built once at namespace load. On top of the builder's common block
     elements, common inline formatting elements, styling and standard URL
     protocols it allows:
       span   - with or without attributes; class and the data-* attributes
                listed below
       img    - src, alt, class, data-media-type, data-thumbnail
       a      - href, target (with requireRelNofollowOnLinks applied)
       iframe - src only when it matches `iframe-src-regex`, plus the
                presentation / data-* attributes listed below"
     (let [string-array (fn [sa] (into-array java.lang.String sa))
           ;; Only https embeds from YouTube, Vimeo's player and Loom are kept.
           ;; Every dot in the host names is escaped: an unescaped `.` matches
           ;; any character and would let look-alike hosts such as
           ;; `player-vimeo.com` through.
           iframe-src-regex #"^https://((www\.)?youtube\.com|player\.vimeo\.com|(www\.)?loom\.com)/.*"]
       (.. (HtmlPolicyBuilder.)
           ;; -- common --
           (allowCommonBlockElements)
           (allowCommonInlineFormattingElements)
           (allowStyling)
           (allowStandardUrlProtocols)
           (allowElements (string-array ["span" "img" "a" "iframe"]))
           ;; -- span --
           (allowWithoutAttributes (string-array ["span"]))
           (allowAttributes (string-array ["class"
                                           "data-first-name"
                                           "data-last-name"
                                           "data-slack-username"
                                           "data-user-id"
                                           "data-email"
                                           "data-avatar-url"
                                           "data-found"
                                           "data-auto-link"
                                           "data-href"]))
           (onElements (string-array ["span"]))
           ;; -- images --
           (allowAttributes (string-array ["src"
                                           "alt"
                                           "class"
                                           "data-media-type"
                                           "data-thumbnail"]))
           (onElements (string-array ["img"]))
           ;; -- anchors / links --
           (allowAttributes (string-array ["href"
                                           "target"]))
           (onElements (string-array ["a"]))
           (requireRelNofollowOnLinks)
           ;; -- iframes (embeds) --
           (allowAttributes (string-array ["src"]))
           (matching iframe-src-regex)
           (onElements (string-array ["iframe"]))
           (allowAttributes (string-array ["class"
                                           "width"
                                           "height"
                                           "data-media-type"
                                           "frameborder"
                                           "webkitallowfullscreen"
                                           "mozallowfullscreen"
                                           "allowfullscreen"
                                           "data-thumbnail"
                                           "data-video-type"
                                           "data-video-id"]))
           (onElements (string-array ["iframe"]))
           (toFactory)))))
#?(:clj
   (defn sanitize-html
     "Runs `html-str`, which is assumed to come from an untrusted user, through
     `user-input-html-policy` and returns the sanitized HTML."
     [html-str]
     (-> user-input-html-policy
         (.sanitize html-str))))
#?(:clj
   (def ^:private ^org.owasp.html.PolicyFactory strip-tags-policy
     "Sanitizer policy that allows no elements at all. Built once at namespace
     load instead of on every `strip-html-tags` call; the type hint keeps the
     `.sanitize` call non-reflective."
     (.toFactory (HtmlPolicyBuilder.))))

#?(:clj
   (defn strip-html-tags
     "Reduces an HTML string to only its textual content, removing all tags.

     `html-str` is sanitized with a policy that allows no elements.
     NOTE(review): the result comes from an HTML sanitizer, so characters such
     as `&` presumably stay entity-encoded in the output - confirm before
     treating it as plain text."
     [html-str]
     (.sanitize strip-tags-policy html-str)))