/
css_cleaner.rb
240 lines (204 loc) · 10.1 KB
/
css_cleaner.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
# Use css parser to break up style blocks
require 'css_parser'
include CssParser
module CssCleaner
# Constant regexps for css values.
# Every pattern is assembled from smaller ones; Regexp#to_s (also used by string interpolation)
# wraps each piece in its own (?flags:...) group, so the pieces keep their own case-sensitivity.
# Letters and dashes only (keywords, named colors, prefixed names)
ALPHA_REGEX = Regexp.new('[a-z\-]+')
# The measurement units we accept after a number
UNITS_REGEX = Regexp.new('deg|cm|em|ex|in|mm|pc|pt|px|s|%', Regexp::IGNORECASE)
# Optional sign, optional leading dot, 1-3 digits, optional dot, up to 3 more digits
NUMBER_REGEX = Regexp.new('-?\.?\d{1,3}\.?\d{0,3}')
# A number, an optional unit and an optional trailing comma, with optional whitespace between them.
# This is deliberately built from single-quoted pieces: inside a double-quoted Ruby string "\s" is the
# escape for one literal space, which made the pattern reject tabs and newlines.
NUMBER_WITH_UNIT_REGEX = Regexp.new(NUMBER_REGEX.to_s + '\s*' + UNITS_REGEX.to_s + '?\s*,?\s*')
# A parenthesised list of one or more numbers (with optional units)
PAREN_NUMBER_REGEX = Regexp.new('\(\s*' + NUMBER_WITH_UNIT_REGEX.to_s + '+\s*\)')
# Vendor prefixes (used without the surrounding dashes)
PREFIX_REGEX = Regexp.new('moz|ms|o|webkit')
FUNCTION_NAME_REGEX = Regexp.new('scalex?y?|translatex?y?|skewx?y?|rotatex?y?|matrix', Regexp::IGNORECASE)
# A transform function name immediately followed by its numeric argument list
TRANSFORM_FUNCTION_REGEX = Regexp.new("#{FUNCTION_NAME_REGEX}#{PAREN_NUMBER_REGEX}")
SHAPE_NAME_REGEX = Regexp.new('rect', Regexp::IGNORECASE)
SHAPE_FUNCTION_REGEX = Regexp.new("#{SHAPE_NAME_REGEX}#{PAREN_NUMBER_REGEX}")
RGBA_REGEX = Regexp.new('rgba?' + PAREN_NUMBER_REGEX.to_s, Regexp::IGNORECASE)
# A hex color, an alphabetic name, or an rgb()/rgba() call
COLOR_REGEX = Regexp.new('#[0-9a-f]{3,6}|' + ALPHA_REGEX.to_s + '|' + RGBA_REGEX.to_s)
COLOR_STOP_FUNCTION_REGEX = Regexp.new('color-stop\s*\(' + NUMBER_WITH_UNIT_REGEX.to_s + '\s*\,?\s*' + COLOR_REGEX.to_s + '\s*\)', Regexp::IGNORECASE)
# Top-level domains accepted in absolute urls, taken from the ICANN list at
# http://www.icann.org/en/registries/top-level-domains.htm
TOP_LEVEL_DOMAINS = %w[ac ad ae aero af ag ai al am an ao aq ar arpa as asia at au aw ax az ba bb bd be bf bg bh bi biz bj bm bn bo br bs bt bv bw by bz ca cat cc cd cf cg ch ci ck cl cm cn co com coop cr cu cv cx cy cz de dj dk dm do dz ec edu ee eg er es et eu fi fj fk fm fo fr ga gb gd ge gf gg gh gi gl gm gn gov gp gq gr gs gt gu gw gy hk hm hn hr ht hu id ie il im in info int io iq ir is it je jm jo jobs jp ke kg kh ki km kn kp kr kw ky kz la lb lc li lk lr ls lt lu lv ly ma mc md me mg mh mil mk ml mm mn mo mobi mp mq mr ms mt mu museum mv mw mx my mz na name nc ne net nf ng ni nl no np nr nu nz om org pa pe pf pg ph pk pl pm pn pr pro ps pt pw py qa re ro rs ru rw sa sb sc sd se sg sh si sj sk sl sm sn so sr st su sv sy sz tc td tel tf tg th tj tk tl tm tn to tp tr travel tt tv tw tz ua ug uk us uy uz va vc ve vg vi vn vu wf ws xn xxx ye yt za zm zw]
# http(s) scheme plus a host name ending in one of the domains above
DOMAIN_REGEX = Regexp.new(['https?://\w[\w\-\.]+\.(', TOP_LEVEL_DOMAINS.join('|'), ')'].join)
# Either a site-relative /images prefix or an absolute domain
DOMAIN_OR_IMAGES_REGEX = Regexp.new(['\/images|', DOMAIN_REGEX].join)
# The prefix above followed by a path whose final extension is listed in
# ArchiveConfig.SUPPORTED_EXTERNAL_URLS (presumably file extensions -- confirm against the config)
URI_REGEX = Regexp.new([DOMAIN_OR_IMAGES_REGEX, '/[\w\-\.\/]*[\w\-]\.(', ArchiveConfig.SUPPORTED_EXTERNAL_URLS.join('|'), ')'].join)
# A uri that is bare, double-quoted or single-quoted
URL_REGEX = Regexp.new([URI_REGEX, "\"#{URI_REGEX}\"", "'#{URI_REGEX}'"].join('|'))
# The css url(...) function wrapped around an accepted url
URL_FUNCTION_REGEX = Regexp.new(['url\(\s*', URL_REGEX, '\s*\)'].join)
# Any single value token we accept
VALUE_REGEX = Regexp.new(
  [
    TRANSFORM_FUNCTION_REGEX,
    URL_FUNCTION_REGEX,
    COLOR_STOP_FUNCTION_REGEX,
    COLOR_REGEX,
    NUMBER_WITH_UNIT_REGEX,
    ALPHA_REGEX,
    SHAPE_FUNCTION_REGEX
  ].join('|')
)
# For use in ActiveRecord models
# We parse and clean the CSS one rule set and one declaration at a time in order to provide
# more helpful error messages. Problems are reported with errors.add(:base, ...) on the
# including object (errors and ts are expected to be provided by it), and the cleaned CSS
# text is returned.
#
# css_code: the raw CSS; nil, or text without a single word character, returns ""
# options[:prefix]: used if you want to make sure a particular prefix appears on all the selectors in
#   this block of css, eg ".userstuff p" instead of just "p"
# options[:caller_check]: optional callable, invoked as caller_check.call(rule_set, property, value)
#   for every declaration that passed sanitizing; the declaration is kept only if it returns true
def clean_css_code(css_code, options = {})
  # nothing at all, or only spaces of various kinds
  return "" if css_code.nil? || !css_code.match(/\w/)
  clean_css = ""
  parser = CssParser::Parser.new
  parser.add_block!(css_code)
  prefix = options[:prefix] || ''
  caller_check = options[:caller_check]
  if parser.to_s.blank?
    errors.add(:base, ts("We couldn't find any valid CSS rules in that code."))
    return clean_css
  end
  parser.each_rule_set do |rs|
    # collect only the selectors we accept, so a rejected one can't leave an empty entry behind
    selectors = []
    rs.selectors.each do |selector|
      if selector.match(/@font-face/i)
        errors.add(:base, ts("We don't allow the @font-face feature."))
        next
      end
      # remove whitespace and convert &gt; entities back to the > direct child selector
      sel = selector.gsub(/\n/, '').gsub('&gt;', '>').strip
      selectors << ((prefix.blank? || sel.start_with?(prefix)) ? sel : "#{prefix} #{sel}")
    end
    clean_declarations = ""
    rs.each_declaration do |property, value, is_important|
      if property.blank? || value.blank?
        errors.add(:base, ts("The code for #{rs.selectors.join(',')} doesn't seem to be a valid CSS rule."))
      elsif sanitize_css_property(property).blank?
        errors.add(:base, ts("We don't currently allow the CSS property #{property} -- please notify support if you think this is an error."))
      elsif (cleanval = sanitize_css_declaration_value(property, value)).blank?
        errors.add(:base, ts("The #{property} property in #{rs.selectors.join(', ')} cannot have the value #{value}, sorry!"))
      elsif (!caller_check || caller_check.call(rs, property, value))
        clean_declarations += " #{property}: #{cleanval}#{is_important ? ' !important' : ''};\n"
      end
    end
    if clean_declarations.blank?
      errors.add(:base, ts("There don't seem to be any rules for #{rs.selectors.join(',')}"))
    elsif !selectors.empty?
      # everything looks ok, add it to the css
      # (a rule set whose selectors were all rejected has already been reported above)
      clean_css += "#{selectors.join(",\n")} {\n"
      clean_css += clean_declarations
      clean_css += "}\n\n"
    end
  end
  clean_css
end
# Is this property one we support, either by its exact name or as a
# vendor-prefixed form (-moz-, -ms-, -o-, -webkit-) of a supported name?
# Returns true for an exact name, otherwise the MatchData of the prefix check (nil when it fails).
def is_legal_property(property)
  supported = ArchiveConfig.SUPPORTED_CSS_PROPERTIES
  return true if supported.include?(property)
  property.match(/-(#{PREFIX_REGEX})-(#{supported.join('|')})/)
end
# Does the property name contain one of the names in ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES?
# The match is not anchored, so a listed name anywhere inside the property counts.
# Returns the MatchData, or nil when nothing matches.
def is_legal_shorthand_property(property)
  shorthand_names = ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES.join('|')
  property.match(Regexp.new(shorthand_names))
end
# Hands the property back unchanged when it is a legal property or a legal
# shorthand property, and an empty string when it is neither.
def sanitize_css_property(property)
  if is_legal_property(property) || is_legal_shorthand_property(property)
    property
  else
    ""
  end
end
# A declaration must match the format: property: value;
# All properties must appear in ArchiveConfig.SUPPORTED_CSS_PROPERTIES or ArchiveConfig.SUPPORTED_CSS_SHORTHAND_PROPERTIES,
# or that property and its value will be omitted.
# All values are sanitized. If any values in a declaration are invalid, the value will be blanked out and an
# empty string returned.
#
# property: the property name, compared case-insensitively; the caller's string is left untouched
# value: the raw value for that property
# Returns the cleaned value with surrounding whitespace stripped, or "" if the value is not allowed.
def sanitize_css_declaration_value(property, value)
  clean = ""
  # compare against a lower-cased copy: downcasing in place would change the caller's string
  # and raises on a frozen one
  property = property.downcase
  if property == "font-family"
    if !sanitize_css_font(value).blank?
      # preserve the original capitalization
      clean = value
    end
  elsif property == "content"
    clean = sanitize_css_content(value)
  elsif value.match(/\burl\b/) && (!ArchiveConfig.SUPPORTED_CSS_KEYWORDS.include?("url") || !%w(background background-image border border-image list-style list-style-image).include?(property))
    # urls are switched off entirely, or are not allowed in this particular property
    clean = ""
  elsif is_legal_shorthand_property(property)
    clean = tokenize_and_sanitize_css_value(value)
  elsif is_legal_property(property)
    clean = sanitize_css_value(value)
  end
  clean.strip
end
# divide a css value into tokens and clean them individually
#
# A token ends at whitespace, a comma, or the parenthesis that closes a function call
# (nested calls stay inside the same token). Every token goes through sanitize_css_token.
# Returns the value rebuilt from the cleaned tokens and their separators, or "" as soon as
# one token is rejected or the parentheses don't balance.
def tokenize_and_sanitize_css_value(value)
  cleanval = ""
  scanner = StringScanner.new(value)
  # we scan until we find either a space, a comma, or an open parenthesis;
  # scan_until returns nil once none of them is left
  while (token = scanner.scan_until(/\s+|,|\(/))
    if token.blank? || token == ","
      # a bare separator: keep it as it is
      cleanval += token
      next
    end
    # Track nesting by looking at the last character only. A line-based anchor ($) would
    # also fire on a parenthesis that merely ends a line inside a multi-line value.
    in_paren = token.end_with?("(") ? 1 : 0
    while in_paren > 0
      # scan until closing paren or another opening paren
      nextpart = scanner.scan_until(/\(|\)/)
      # mismatched parens
      return "" unless nextpart
      token += nextpart
      in_paren += nextpart.end_with?("(") ? 1 : -1
    end
    # we now have a single token; remember the separator that ended it, if any
    separator = token[/(\s|,)\z/].to_s
    cleantoken = sanitize_css_token(token.strip.chomp(','))
    return "" if cleantoken.blank?
    cleanval += cleantoken + separator
  end
  # whatever follows the last separator is the final token
  remainder = scanner.rest
  unless remainder.blank?
    cleantoken = sanitize_css_token(remainder)
    return "" if cleantoken.blank?
    cleanval += cleantoken
  end
  cleanval
end
# Clean one token of a css value: anything mentioning "gradient" goes to the
# gradient sanitizer, everything else is treated as an ordinary value.
# Returns whatever the chosen sanitizer returns.
def sanitize_css_token(token)
  return sanitize_css_gradient(token) if token.match(/gradient/)
  sanitize_css_value(token)
end
# sanitize a CSS gradient, for instance
# background:-webkit-gradient( linear, left bottom, left top, color-stop(0, rgb(82,82,82)), color-stop(1, rgb(125,124,125)));
# -moz-linear-gradient(bottom, rgba(120,120,120,1) 5%, rgba(94,94,94,1) 50%, rgba(108,108,108,1) 55%, rgba(137,137,137,1) 100%);
#
# The value has to look like name(arguments) with "gradient" in the name; the arguments are
# tokenized and cleaned, and the call is put back together from the cleaned parts.
# Returns "" when the shape is wrong or the arguments don't survive cleaning.
def sanitize_css_gradient(value)
  call = value.match(/^([a-z\-]+)\((.*)\)/)
  return "" unless call
  function = call[1]
  cleaned_interior = tokenize_and_sanitize_css_value(call[2])
  return "" unless function.match(/gradient/) && !cleaned_interior.blank?
  "#{function}(#{cleaned_interior})"
end
# all values must either appear in ArchiveConfig.SUPPORTED_CSS_KEYWORDS, be urls of the format url(http://url/) or be
# rgba(), hex (#), or numeric values, or a comma-separated list of same
#
# The check runs on a lower-cased copy with any "!important" removed; the value handed back
# is the original, untouched string. Returns "" when the value is not allowed.
def sanitize_css_value(value)
  value_stripped = value.downcase.gsub(/(!important)/, '').strip
  # if it's a comma-separated set of valid values it's fine
  # (\A and \z tie the match to the whole string; with ^ and $ one acceptable line
  # would be enough to let a multi-line value through)
  return value if value_stripped =~ /\A(#{VALUE_REGEX}\,?\s*)+\z/i
  # If it's explicitly in our keywords it's fine
  return value if value_stripped.split(',').all? {|subval| ArchiveConfig.SUPPORTED_CSS_KEYWORDS.include?(subval.strip)}
  return ""
end
# Sanitize the value of the "content" property.
# Returns the value unchanged when it is allowed and "" otherwise.
# Every check covers the whole value (\A ... \z), so an acceptable first line or an
# acceptable url() somewhere inside the value can't carry other text through.
def sanitize_css_content(value)
  # For now we only allow a single completely quoted string
  return value if value =~ /\A\'([^\']*)\'\z/
  return value if value =~ /\A\"([^\"]*)\"\z/
  # or a valid img url (and nothing but that url)
  return value if value =~ /\A\s*#{URL_FUNCTION_REGEX}\s*\z/
  # or "none"
  return value if value == "none"
  return ""
end
# Font family names may be alphanumeric values with dashes
# (spaces are allowed too, and each name may be wrapped in single or double quotes)
#
# The check runs on a lower-cased copy with any "!important" removed. Each comma-separated
# name must match from start to end (\A ... \z), so a name can't pass because just one of
# its lines looks fine. Returns the original value when every name passes, "" otherwise.
def sanitize_css_font(value)
  value_stripped = value.downcase.gsub(/(!important)/, '').strip
  if value_stripped.split(',').all? {|fontname| fontname.strip =~ /\A(\'?[a-z0-9\- ]+\'?|\"?[a-z0-9\- ]+\"?)\z/}
    return value
  else
    return ""
  end
end
end