Skip to content

Commit

Permalink
Merge pull request #1586 from noirbizarre/factorize-uris-validation
Browse files Browse the repository at this point in the history
Factorize/unify uris validation
  • Loading branch information
noirbizarre committed Apr 16, 2018
2 parents 38f44d4 + bc093ed commit af4eaeb
Show file tree
Hide file tree
Showing 11 changed files with 518 additions and 101 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Fix organizations sort by last_modified [#1576](https://github.com/opendatateam/udata/pull/1576)
- Fix dataset creation form (and any other form) [#1584](https://github.com/opendatateam/udata/pull/1584)
- Fix an XSS on client-side markdown parsing [#1585](https://github.com/opendatateam/udata/pull/1585)
- Ensure URLs validation is the same everywhere [#1586](https://github.com/opendatateam/udata/pull/1586)

## 1.3.5 (2018-04-03)

Expand Down
39 changes: 39 additions & 0 deletions docs/adapting-settings.md
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,45 @@ The duration used for templates' cache, in minutes.

This is the allowed resources extensions list that user can upload.

## URLs validation

### URLS_ALLOW_PRIVATE

**default**: `False`

Whether or not to allow private URLs (private IPs...) submission.

### URLS_ALLOW_LOCAL

**default**: `False`

Whether or not to allow local URLs (localhost...) submission.
When developing, you might need to set this to `True`.

### URLS_ALLOW_CREDENTIALS

**default**: `True`

Whether or not to allow credentials in URLs submission.

### URLS_ALLOWED_SCHEMES

**default**: `('http', 'https', 'ftp', 'ftps')`

List of allowed URL schemes.

### URLS_ALLOWED_TLDS

**default**: All IANA registered TLDs

List of allowed TLDs.
When using udata on an intranet, you might want to add your own custom TLDs:

```python
from udata.settings import Defaults

URLS_ALLOWED_TLDS = Defaults.URLS_ALLOWED_TLDS | set(['custom', 'company'])
```

## Spatial configuration

Expand Down
2 changes: 2 additions & 0 deletions requirements/install.pip
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ html2text==2018.1.9
lxml==4.2.1
mongoengine==0.15.0
msgpack-python==0.4.8
netaddr==0.7.19
pillow==5.1.0
bcrypt==3.1.4
pydenticon==0.3.1
Expand All @@ -40,6 +41,7 @@ rdflib-jsonld==0.4.0
redis==2.10.6
requests==2.18.4
StringDist==1.0.9
tlds==2018041200
unicodecsv==0.14.1
voluptuous==0.10.5
wtforms-json==0.3.3
Expand Down
7 changes: 3 additions & 4 deletions udata/forms/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@
from flask_mongoengine.wtf import fields as mefields
from flask_fs.mongo import ImageReference
from wtforms import Form as WTForm, Field as WTField, validators, fields
from wtforms.fields import html5
from wtforms.utils import unset_value
from wtforms_json import flatten_json

Expand All @@ -20,7 +19,7 @@
from udata.core.storages import tmp
from udata.core.organization.permissions import OrganizationPrivatePermission
from udata.i18n import lazy_gettext as _
from udata import tags
from udata import tags, uris
from udata.utils import to_iso_date, get_by


Expand Down Expand Up @@ -130,8 +129,8 @@ class URLField(EmptyNone, Field):
def pre_validate(self, form):
if self.data:
try:
db.URLField().validate(self.data)
except db.ValidationError:
uris.validate(self.data)
except uris.ValidationError:
raise validators.ValidationError(_('Invalid URL'))
return True

Expand Down
35 changes: 8 additions & 27 deletions udata/harvest/filters.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,11 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals, absolute_import

import urlparse

import dateutil.parser

from voluptuous import Invalid

from udata import tags
from udata import tags, uris


def boolean(value):
Expand Down Expand Up @@ -88,34 +86,17 @@ def normalize_string(value):
return strip(line_endings(value))


def is_url(add_prefix='http://', full=False, remove_fragment=False,
schemes=('http', 'https')):
def is_url(default_scheme='http', **kwargs):
"""Return a converter that converts a clean string to an URL."""
def converter(value):
if value is None:
return value
split_url = list(urlparse.urlsplit(value))
if full and add_prefix \
and not all((split_url[0], split_url[1], split_url[2])) \
and not split_url[2].startswith('/'):
split_url = list(urlparse.urlsplit(add_prefix + value))
scheme = split_url[0]
if scheme != scheme.lower():
split_url[0] = scheme = scheme.lower()
if full and not scheme:
raise Invalid('URL must be complete')
if scheme and schemes is not None and scheme not in schemes:
raise Invalid('Scheme must belong to {0}'.format(sorted(schemes)))
network_location = split_url[1]
if network_location != network_location.lower():
split_url[1] = network_location = network_location.lower()
if scheme in ('http', 'https') and not split_url[2]:
# By convention a full HTTP URL must always have
# at least a "/" in its path.
split_url[2] = '/'
if remove_fragment and split_url[4]:
split_url[4] = ''
return unicode(urlparse.urlunsplit(split_url))
if '://' not in value and default_scheme:
value = '://'.join((default_scheme, value.strip()))
try:
return uris.validate(value)
except uris.ValidationError as e:
raise Invalid(e.message)
return converter


Expand Down
5 changes: 5 additions & 0 deletions udata/harvest/tests/test_filters.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,8 @@ def test_allowed_scheme_not_allowed(self):
f = filters.is_url()
with self.assertRaises(Invalid):
f('not-allowed://somewhere.com')

def test_valid_url_with_default_scheme(self):
f = filters.is_url()
self.assertEqual(f('somewhere.com/path'),
'http://somewhere.com/path')
93 changes: 28 additions & 65 deletions udata/models/url_field.py
Original file line number Diff line number Diff line change
@@ -1,76 +1,34 @@
# -*- coding: utf-8 -*-
from __future__ import unicode_literals

import re
from mongoengine.fields import StringField

from mongoengine.fields import URLField as MEURLField
from udata import uris


IP_MIDDLE_OCTET = r'(?:\.(?:1?\d{1,2}|2[0-4]\d|25[0-5]))'
IP_LAST_OCTET = r'(?:\.(?:[1-9]\d?|1\d\d|2[0-4]\d|25[0-4]))'

URL_REGEX = re.compile(
r'^'
# scheme is validated separately
r'^(?:[a-z0-9\.\-]*)://'
# user:pass authentication
r'(?:\S+(?::\S*)?@)?'
r'(?:'
r'(?P<private_ip>'
# IP address exclusion
# private & local networks
r'(?:localhost)|'
r'(?:(?:10|127)' + IP_MIDDLE_OCTET + r'{2}' + IP_LAST_OCTET + r')|'
r'(?:(?:169\.254|192\.168)' + IP_MIDDLE_OCTET + IP_LAST_OCTET + r')|'
r'(?:172\.(?:1[6-9]|2\d|3[0-1])' + IP_MIDDLE_OCTET + IP_LAST_OCTET + r'))'
r'|'
# IP address dotted notation octets
# excludes loopback network 0.0.0.0
# excludes reserved space >= 224.0.0.0
# excludes network & broadcast addresses
# (first & last IP address of each class)
r'(?P<public_ip>'
r'(?:[1-9]\d?|1\d\d|2[01]\d|22[0-3])'
r'' + IP_MIDDLE_OCTET + r'{2}'
r'' + IP_LAST_OCTET + r')'
r'|'
# host name
r'(?:(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)'
# domain name
r'(?:\.(?:[a-z\u00a1-\uffff0-9]-?)*[a-z\u00a1-\uffff0-9]+)*'
# TLD identifier
r'(?:\.(?:[a-z\u00a1-\uffff]{2,}))'
r')'
# port number
r'(?::\d{2,5})?'
# resource path
r'(?:/\S*)?'
# query string
r'(?:\?\S*)?'
r'$',
re.UNICODE | re.IGNORECASE
)


class URLField(MEURLField):
class URLField(StringField):
'''
An URL field that automatically strips extra spaces
and support uncode domain and paths.
An URL field using the udata URL normalization and validation rules.
Public URL can be enforced with `public=True`
The URL spaces are automatically stripped.
URL_REGEX has been extracted and adapted from:
https://github.com/kvesteri/validators/blob/master/validators/url.py
Non-specified parameters fallback app level settings,
ie. ``URLS_ALLOW_PRIVATE``, ``URLS_ALLOW_LOCAL``
``URLS_ALLOWED_SCHEMES`` and ``URLS_ALLOWED_TLDS``
Main changes are:
- scheme validation is handled separately instead of being hard coded
- handle `localhost` as a valid private url
'''
_URL_REGEX = URL_REGEX
:params bool private: Allow private URLs
:params bool local: Allow local URLs
:params list schemes: List of allowed schemes
:params list tlds: List of allowed TLDs
def __init__(self, public=False, **kwargs):
'''
def __init__(self, private=None, local=None, schemes=None, tlds=None,
**kwargs):
super(URLField, self).__init__(**kwargs)
self.public = public
self.private = private
self.local = local
self.schemes = schemes
self.tlds = tlds

def to_python(self, value):
value = super(URLField, self).to_python(value)
Expand All @@ -79,7 +37,12 @@ def to_python(self, value):

def validate(self, value):
super(URLField, self).validate(value)
if self.public:
match = self.url_regex.match(value)
if match and match.group('private_ip'):
self.error('Invalid URL: "{0}" is not public URL')
kwargs = {
a: getattr(self, a)
for a in ('private', 'local', 'schemes', 'tlds')
if getattr(self, a) is not None
}
try:
uris.validate(value, **kwargs)
except uris.ValidationError as e:
self.error(e.message)
16 changes: 16 additions & 0 deletions udata/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,12 @@
import pkg_resources

from kombu import Exchange, Queue
from tlds import tld_set

from udata.i18n import lazy_gettext as _



HOUR = 60 * 60


Expand Down Expand Up @@ -251,6 +253,19 @@ class Defaults(object):
# Max number of resources to display uncollapsed in dataset view
DATASET_MAX_RESOURCES_UNCOLLAPSED = 6

# URLs validation settings
###########################################################################
# Whether or not to allow private URLs (private IPs...) submission
URLS_ALLOW_PRIVATE = False
# Whether or not to allow local URLs (localhost...) submission.
URLS_ALLOW_LOCAL = False
# Whether or not to allow credentials in URLs submission.
URLS_ALLOW_CREDENTIALS = True
# List of allowed URL schemes.
URLS_ALLOWED_SCHEMES = ('http', 'https', 'ftp', 'ftps')
# List of allowed TLDs.
URLS_ALLOWED_TLDS = tld_set


class Testing(object):
'''Sane values for testing. Should be applied as override'''
Expand All @@ -272,6 +287,7 @@ class Testing(object):
LOGGER_HANDLER_POLICY = 'never'
CELERYD_HIJACK_ROOT_LOGGER = False
USE_METRICS = False
URLS_ALLOW_LOCAL = True # Test server URL is localhost


class Debug(Defaults):
Expand Down
10 changes: 5 additions & 5 deletions udata/tests/test_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,8 +57,8 @@ class URLTester(db.Document):
url = db.URLField()


class PublicURLTester(db.Document):
url = db.URLField(public=True)
class PrivateURLTester(db.Document):
url = db.URLField(private=True)


class AutoUUIDFieldTest(DBTestMixin, TestCase):
Expand Down Expand Up @@ -261,11 +261,11 @@ def test_handle_unicode(self):
obj.save().reload()
self.assertEqual(obj.url, url)

def test_public(self):
def test_public_private(self):
url = 'http://10.10.0.2/path/'
URLTester(url=url).save()
PrivateURLTester(url=url).save()
with self.assertRaises(ValidationError):
PublicURLTester(url=url).save()
URLTester(url=url).save()


class DatetimedTest(DBTestMixin, TestCase):
Expand Down

0 comments on commit af4eaeb

Please sign in to comment.