Skip to content

Commit

Permalink
Force UTF-8 as filesystem encoding in some cases
Browse files Browse the repository at this point in the history
Fix #635
Fix #428
  • Loading branch information
untitaker committed May 22, 2015
1 parent 00492cd commit bba0cdc
Show file tree
Hide file tree
Showing 11 changed files with 121 additions and 12 deletions.
2 changes: 2 additions & 0 deletions CHANGES
Expand Up @@ -23,6 +23,8 @@ Version 0.11
converted to lowercase.
- Changed cache so that cache never expires if timeout is 0. This also fixes
an issue with redis setex (issue ``#550``)
- Werkzeug now assumes ``UTF-8`` as filesystem encoding on Unix if Python
detected it as ASCII.

Version 0.10.5
--------------
Expand Down
1 change: 1 addition & 0 deletions docs/contents.rst.inc
Expand Up @@ -36,6 +36,7 @@ Reference
wrappers
routing
wsgi
filesystem
http
datastructures
utils
Expand Down
11 changes: 11 additions & 0 deletions docs/filesystem.rst
@@ -0,0 +1,11 @@
====================
Filesystem Utilities
====================

Various utilities for the local filesystem.

.. module:: werkzeug.filesystem

.. autoclass:: BrokenFilesystemWarning

.. autofunction:: get_filesystem_encoding
22 changes: 22 additions & 0 deletions docs/unicode.rst
Expand Up @@ -134,3 +134,25 @@ but not encoding. If Werkzeug encounters an encoding error it will raise a
:exc:`UnicodeEncodeError`. It's your responsibility to not create data that is
not present in the target charset (a non issue with all unicode encodings
such as utf-8).

.. _filesystem-encoding:

The Filesystem
==============

.. versionchanged:: 0.11

Up until version 0.11, Werkzeug used Python's stdlib functionality to detect
the filesystem encoding. However, several bug reports against Werkzeug have
shown that the value of :py:func:`sys.getfilesystemencoding` cannot be
trusted under traditional UNIX systems. The usual problems come from
misconfigured systems, where ``LANG`` and similar environment variables are not
set. In such cases, Python would default to ASCII as filesystem encoding, a
very conservative default that is usually wrong and causes more problems than
it avoids.

Therefore Werkzeug will force the filesystem encoding to ``UTF-8`` and issue a
warning whenever it detects that it is running under BSD or Linux, and
:py:func:`sys.getfilesystemencoding` is returning an ASCII encoding.

See also :py:mod:`werkzeug.filesystem`.
3 changes: 3 additions & 0 deletions werkzeug/_compat.py
@@ -1,8 +1,11 @@
# flake8: noqa
# This whole file is full of lint errors
import codecs
import sys
import operator
import functools
import warnings

try:
import builtins
except ImportError:
Expand Down
6 changes: 3 additions & 3 deletions werkzeug/contrib/sessions.py
Expand Up @@ -53,7 +53,6 @@ def application(environ, start_response):
"""
import re
import os
import sys
import tempfile
from os import path
from time import time
Expand All @@ -66,6 +65,7 @@ def application(environ, start_response):
from werkzeug.wsgi import ClosingIterator
from werkzeug.posixemulation import rename
from werkzeug._compat import PY2, text_type
from werkzeug.filesystem import get_filesystem_encoding


_sha1_re = re.compile(r'^[a-f0-9]{40}$')
Expand Down Expand Up @@ -223,7 +223,7 @@ def __init__(self, path=None, filename_template='werkzeug_%s.sess',
self.path = path
if isinstance(filename_template, text_type) and PY2:
filename_template = filename_template.encode(
sys.getfilesystemencoding() or 'utf-8')
get_filesystem_encoding())
assert not filename_template.endswith(_fs_transaction_suffix), \
'filename templates may not end with %s' % _fs_transaction_suffix
self.filename_template = filename_template
Expand All @@ -235,7 +235,7 @@ def get_session_filename(self, sid):
# you might reconfigure the session object to have a more
# arbitrary string.
if isinstance(sid, text_type) and PY2:
sid = sid.encode(sys.getfilesystemencoding() or 'utf-8')
sid = sid.encode(get_filesystem_encoding())
return path.join(self.path, self.filename_template % sid)

def save(self, session):
Expand Down
4 changes: 2 additions & 2 deletions werkzeug/datastructures.py
Expand Up @@ -9,7 +9,6 @@
:license: BSD, see LICENSE for more details.
"""
import re
import sys
import codecs
import mimetypes
from copy import deepcopy
Expand All @@ -19,6 +18,7 @@
from werkzeug._compat import iterkeys, itervalues, iteritems, iterlists, \
PY2, text_type, integer_types, string_types, make_literal_wrapper, \
to_native
from werkzeug.filesystem import get_filesystem_encoding


_locale_delim_re = re.compile(r'[_-]')
Expand Down Expand Up @@ -2565,7 +2565,7 @@ def __init__(self, stream=None, filename=None, name=None,
# This might not be if the name attribute is bytes due to the
# file being opened from the bytes API.
if not PY2 and isinstance(filename, bytes):
filename = filename.decode(sys.getfilesystemencoding(),
filename = filename.decode(get_filesystem_encoding(),
'replace')

self.filename = filename
Expand Down
6 changes: 4 additions & 2 deletions werkzeug/debug/tbtools.py
Expand Up @@ -22,6 +22,7 @@
from werkzeug.debug.console import Console
from werkzeug._compat import range_type, PY2, text_type, string_types, \
to_native, to_unicode
from werkzeug.filesystem import get_filesystem_encoding


_coding_re = re.compile(br'coding[:=]\s*([-\w.]+)')
Expand Down Expand Up @@ -383,7 +384,7 @@ def __init__(self, exc_type, exc_value, tb):
# if it's a file on the file system resolve the real filename.
if os.path.isfile(fn):
fn = os.path.realpath(fn)
self.filename = to_unicode(fn, sys.getfilesystemencoding())
self.filename = to_unicode(fn, get_filesystem_encoding())
self.module = self.globals.get('__name__')
self.loader = self.globals.get('__loader__')
self.code = tb.tb_frame.f_code
Expand Down Expand Up @@ -466,7 +467,8 @@ def sourcelines(self):

if source is None:
try:
f = open(self.filename, mode='rb')
f = open(to_native(self.filename, get_filesystem_encoding()),
mode='rb')
except IOError:
return []
try:
Expand Down
66 changes: 66 additions & 0 deletions werkzeug/filesystem.py
@@ -0,0 +1,66 @@
# -*- coding: utf-8 -*-
"""
werkzeug.filesystem
~~~~~~~~~~~~~~~~~~~
Various utilities for the local filesystem.
:copyright: (c) 2015 by the Werkzeug Team, see AUTHORS for more details.
:license: BSD, see LICENSE for more details.
"""

import codecs
import sys
import warnings

# Flags the platforms whose reported filesystem encoding we do not trust:
# Linux, and anything with "bsd" in its platform name.
has_likely_buggy_unicode_filesystem = (
    'bsd' in sys.platform or sys.platform.startswith('linux')
)


def _is_ascii_encoding(encoding):
    """
    Tell whether *encoding* is just another name for ASCII.

    ASCII is registered under many aliases (for example ANSI_X3.4-1968), so
    the name is resolved through the codec registry and the canonical codec
    name is compared instead of the raw string.

    :param encoding: an encoding name, or ``None``.
    :return: ``True`` if the name resolves to the ``ascii`` codec; ``False``
             for ``None``, for unknown encodings and for any other codec.
    """
    if encoding is not None:
        try:
            codec_info = codecs.lookup(encoding)
        except LookupError:
            # Unknown encoding names are simply "not ASCII".
            pass
        else:
            return codec_info.name == 'ascii'
    return False


class BrokenFilesystemWarning(RuntimeWarning, UnicodeWarning):
    """Warning category Werkzeug uses to report a broken filesystem.

    It is only used once per runtime.
    """


# Flipped to True after the first BrokenFilesystemWarning so the warning is
# issued at most once per process.
_warned_about_filesystem_encoding = False


def get_filesystem_encoding():
    """
    Returns the filesystem encoding that should be used. Note that this is
    different from the Python understanding of the filesystem encoding which
    might be deeply flawed. Do not use this value against Python's unicode APIs
    because it might be different. See :ref:`filesystem-encoding` for the exact
    behavior.

    The concept of a filesystem encoding in general is not something you
    should rely on. As such if you ever need to use this function except for
    writing wrapper code reconsider.

    :return: ``'utf-8'`` when running on a platform flagged by
             ``has_likely_buggy_unicode_filesystem`` and Python reports either
             no filesystem encoding or an alias of ASCII; otherwise whatever
             :func:`sys.getfilesystemencoding` returned, unchanged.
    """
    global _warned_about_filesystem_encoding
    rv = sys.getfilesystemencoding()
    # The platform check must guard *both* conditions.  Without the
    # parentheses ``and`` binds tighter than ``or`` and the ASCII override
    # (plus its "misconfigured UNIX" warning) would fire on every platform.
    if has_likely_buggy_unicode_filesystem and (
            not rv or _is_ascii_encoding(rv)):
        if not _warned_about_filesystem_encoding:
            # Explicit field index: auto-numbered ``{}`` fields are not
            # supported by ``str.format`` on Python 2.6.
            warnings.warn(
                'Detected a misconfigured UNIX filesystem: Will use UTF-8 as '
                'filesystem encoding instead of {0!r}'.format(rv),
                BrokenFilesystemWarning)
            _warned_about_filesystem_encoding = True
        return 'utf-8'
    return rv
6 changes: 4 additions & 2 deletions werkzeug/posixemulation.py
Expand Up @@ -22,7 +22,9 @@
import errno
import time
import random

from ._compat import to_unicode
from .filesystem import get_filesystem_encoding


can_rename_open_file = False
Expand All @@ -38,8 +40,8 @@
_MoveFileEx = ctypes.windll.kernel32.MoveFileExW

def _rename(src, dst):
src = to_unicode(src, sys.getfilesystemencoding())
dst = to_unicode(dst, sys.getfilesystemencoding())
src = to_unicode(src, get_filesystem_encoding())
dst = to_unicode(dst, get_filesystem_encoding())
if _rename_atomic(src, dst):
return True
retry = 0
Expand Down
6 changes: 3 additions & 3 deletions werkzeug/wsgi.py
Expand Up @@ -10,7 +10,6 @@
"""
import re
import os
import sys
import posixpath
import mimetypes
from itertools import chain
Expand All @@ -25,6 +24,7 @@
from werkzeug._internal import _empty_stream, _encode_idna
from werkzeug.http import is_resource_modified, http_date
from werkzeug.urls import uri_to_iri, url_quote, url_parse, url_join
from werkzeug.filesystem import get_filesystem_encoding


def responder(f):
Expand Down Expand Up @@ -559,7 +559,7 @@ def loader(path):

def generate_etag(self, mtime, file_size, real_filename):
if not isinstance(real_filename, bytes):
real_filename = real_filename.encode(sys.getfilesystemencoding())
real_filename = real_filename.encode(get_filesystem_encoding())
return 'wzsdm-%d-%s-%s' % (
mktime(mtime.timetuple()),
file_size,
Expand All @@ -569,7 +569,7 @@ def generate_etag(self, mtime, file_size, real_filename):
def __call__(self, environ, start_response):
cleaned_path = get_path_info(environ)
if PY2:
cleaned_path = cleaned_path.encode(sys.getfilesystemencoding())
cleaned_path = cleaned_path.encode(get_filesystem_encoding())
# sanitize the path for non unix systems
cleaned_path = cleaned_path.strip('/')
for sep in os.sep, os.altsep:
Expand Down

0 comments on commit bba0cdc

Please sign in to comment.