Skip to content

Commit

Permalink
Add unicode letters generator
Browse files Browse the repository at this point in the history
This generator is a helper for the gen_utf8 function; it provides the
system-supported list of unicode letters. This avoids generating unicode
strings that contain control characters and other non-letter characters.

Also adds tests for the generator in order to ensure it is not
generating unwanted characters.

Closes #69
  • Loading branch information
elyezer committed Feb 13, 2015
1 parent 3ca13df commit 4178f1b
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 31 deletions.
56 changes: 28 additions & 28 deletions fauxfactory/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import re
import string
import sys
import unicodedata
import uuid
import warnings

Expand All @@ -44,6 +45,7 @@
)
from functools import wraps


# Private Functions -----------------------------------------------------------


Expand All @@ -59,7 +61,6 @@ def _make_unicode(data):
return unicode(data) # flake8:noqa pylint:disable=undefined-variable
return data


def _is_positive_int(length):
"""Check that ``length`` argument is an integer greater than zero.
Expand All @@ -72,6 +73,29 @@ def _is_positive_int(length):
if not isinstance(length, int) or length <= 0:
raise ValueError("{0} is an invalid 'length'.".format(length))

def _unicode_letters_generator():
    """Generate the unicode characters which are in the letters category.

    A character is considered a letter when its ``unicodedata`` general
    category starts with ``L``.

    :return: a generator which yields, in ascending code point order, all
        unicode letters available on the running Python build
    """
    if sys.version_info.major == 2:
        # ``unichr`` and ``xrange`` only exist on Python 2; this branch is
        # never evaluated on Python 3.
        chr_function = unichr
        range_function = xrange
    else:
        chr_function = chr
        range_function = range

    # Use sys.maxunicode instead of 0x10FFFF to avoid the exception below, in a
    # narrow Python build (before Python 3.3)
    # ValueError: unichr() arg not in range(0x10000) (narrow Python build)
    # For more information, read PEP 261.
    #
    # The range end is exclusive, so add one in order to also examine the
    # sys.maxunicode code point itself.
    for codepoint in range_function(sys.maxunicode + 1):
        char = chr_function(codepoint)
        if unicodedata.category(char).startswith('L'):
            yield char

# Built once at import time so that random letters can be picked from a list.
UNICODE_LETTERS = list(_unicode_letters_generator())

# Public Functions ------------------------------------------------------------

Expand Down Expand Up @@ -673,33 +697,9 @@ def gen_utf8(length=10):
# Validate length argument
_is_positive_int(length)

# Generate codepoints. The valid range of UTF-8 codepoints is
# 0x0-0x10FFFF, minus the following: 0xC0-0xC1, 0xF5-0xFF and
# 0xD800-0xDFFF. These 2061 invalid codepoints (2 + 11 + 2048) comprise
# 0.2% of 0x0-0x10FFFF. Thus, it should be OK to just check for invalid
# codepoints and generate new ones if need be.
codepoints = []
while len(codepoints) < length:
# Use sys.maxunicode instead of 0x10FFFF to avoid the exception
# below, in a narrow Python build (before Python 3.3)
# ValueError: unichr() arg not in range(0x10000) (narrow Python
# build)
# For more information, read PEP 261.
codepoint = random.randint(0x0, sys.maxunicode)
if (
codepoint not in range(0xC0, 0xC1 + 1)
and codepoint not in range(0xF5, 0xFF + 1)
and codepoint not in range(0xD800, 0xDFFF + 1)):
codepoints.append(codepoint)

# Convert codepoints to characters. Python 2 and 3 support the `unichr`
# and `chr` functions, respectively.
if sys.version_info.major == 2:
# pylint:disable=E0602
output = u''.join(unichr(codepoint) for codepoint in codepoints)
else:
output = u''.join(chr(codepoint) for codepoint in codepoints)
return _make_unicode(output)
return _make_unicode(u''.join(
[random.choice(UNICODE_LETTERS) for _ in range(length)]
))


def gen_uuid():
Expand Down
24 changes: 21 additions & 3 deletions tests/test_strings.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

"""Tests for all string generators."""

import random
import unicodedata
import unittest

from fauxfactory import (
gen_alpha,
gen_alphanumeric,
Expand All @@ -12,12 +16,10 @@
gen_numeric_string,
gen_string,
gen_utf8,
_unicode_letters_generator,
)
from sys import version_info

import unittest
import random


class TestStrings(unittest.TestCase):
"""Test string generators."""
Expand Down Expand Up @@ -710,3 +712,19 @@ def test_gen_string6(self):
alphanumeric_string = gen_string('alphanumeric')
control_string = gen_alphanumeric()
self.assertEqual(len(control_string), len(alphanumeric_string),)


class UnicodeLettersGenerator(unittest.TestCase):
    """Checks for the unicode letters generator."""

    def test_chars_in_letters_category(self):
        """@Test: Unicode letters generator generates only unicode letters
        @Feature: String Generator
        @Assert: All generated characters are unicode letters
        """
        # Letter categories, as listed in section 5.5.1 of
        # http://www.unicode.org/reports/tr44/tr44-4.html
        letter_categories = ('Lu', 'Ll', 'Lt', 'Lm', 'Lo')
        for letter in _unicode_letters_generator():
            category = unicodedata.category(letter)
            self.assertIn(category, letter_categories)

0 comments on commit 4178f1b

Please sign in to comment.