Skip to content

Commit

Permalink
[js] Rough implementation of the remaining props that need to be done
Browse files Browse the repository at this point in the history
Needs cleanup/refactoring and getting generated data out of the repo
  • Loading branch information
pmurias committed Dec 10, 2017
1 parent 4aa561b commit cfda09a
Show file tree
Hide file tree
Showing 9 changed files with 271 additions and 33 deletions.
10 changes: 0 additions & 10 deletions src/vm/js/Compiler.nqp
@@ -1,16 +1,6 @@
class QAST::CompilerJS does DWIMYNameMangling does SerializeOnce {
has $!nyi;

my sub literal_subst(str $source, str $pattern, str $replacement) {
my int $where := 0;
my str $result := $source;
while (my int $found := nqp::index($result, $pattern, $where)) != -1 {
$where := $found + nqp::chars($replacement);
$result := nqp::replace($result, $found, nqp::chars($pattern), $replacement);
};
$result;
}

#= If the env var NQPJS_LOG is set log to nqpjs.log
method log(*@msgs) {
my %env := nqp::getenvhash();
Expand Down
26 changes: 22 additions & 4 deletions src/vm/js/RegexCompiler.nqp
Expand Up @@ -261,16 +261,34 @@ class RegexCompiler {
}

method uniprop($node) {
if +@($node) == 1 {
my str $try_prop := "nqp.uniprop_{$node.negate ?? 'not_' !! ''}{~$node[0]}($!target, $!pos)";
if +@($node) == 1 || +@($node) == 2 {
my $arg;
if +@($node) == 2 {
$arg := $!compiler.as_js($node[1], :want($T_OBJ));
}

my str $mangled := nqp::lc(~$node[0]);
$mangled := literal_subst($mangled, '_', '');

# TODO remove whitespace and all medial hyphens
# except the hyphen in U+1180 HANGUL JUNGSEONG O-E.

my str $prop := "nqp.uniprop_{$node.negate ?? 'not_' !! ''}{$mangled}";
my str $try_prop := +@($node) == 1
?? "$prop($!target, $!pos)"
!! "$prop($*CTX, $!cursor, $!target, $!pos, {$arg.expr})";

my str $check;
if $node.subtype eq 'zerowidth' {
"if ($try_prop === -1) \{{self.fail}\}\n";
$check := "if ($try_prop === -1) \{{self.fail}\}\n";
}
else {
my str $offset := $*BLOCK.add_tmp;
"$offset = $try_prop;\n"
$check := "$offset = $try_prop;\n"
~ "if ($offset === -1) \{{self.fail}\} else \{$!pos += $offset\}\n";
}

+@($node) == 1 ?? $check !! Chunk.void($arg, $check);
} else {
$!compiler.NYI("NYI uniprop with more arguments");
}
Expand Down
10 changes: 10 additions & 0 deletions src/vm/js/Utils.nqp
Expand Up @@ -40,3 +40,13 @@ sub known_named(@known_named) {
}
'{' ~ nqp::join(',', @pairs) ~ '}'
}

# Replaces every occurrence of $pattern in $source with $replacement, treating
# both as literal strings, and returns the resulting string.
# The search resumes just past each inserted replacement, so text introduced by
# $replacement is never matched again.
my sub literal_subst(str $source, str $pattern, str $replacement) {
    my int $pattern_len     := nqp::chars($pattern);
    my int $replacement_len := nqp::chars($replacement);
    my str $output := $source;
    my int $cursor := 0;
    my int $hit    := nqp::index($output, $pattern, $cursor);
    while $hit != -1 {
        $output := nqp::replace($output, $hit, $pattern_len, $replacement);
        # Continue after the text that was just inserted.
        $cursor := $hit + $replacement_len;
        $hit    := nqp::index($output, $pattern, $cursor);
    }
    $output;
}
1 change: 1 addition & 0 deletions src/vm/js/nqp-runtime/package.json
Expand Up @@ -28,6 +28,7 @@
"source-map": "0.5.7",
"stack-trace": "0.0.10",
"unicharadata": "*",
"unicode-trie": "^0.3.1",
"xorshift": "^1.1.0",
"xregexp": "^3.2.0"
}
Expand Down
44 changes: 44 additions & 0 deletions src/vm/js/nqp-runtime/propVals.js
Expand Up @@ -423,6 +423,50 @@ module.exports = {
"CJK": "CJK_Unified_Ideographs",
"CJK_Ext_B": "CJK_Unified_Ideographs_Extension_B",
"Manichaean": "Manichaean"
},
"gc": {
"Z": "Separator",
"Po": "Other_Punctuation",
"Pc": "Connector_Punctuation",
"punct": "Punctuation",
"LC": "Cased_Letter",
"Nd": "Decimal_Number",
"Lu": "Uppercase_Letter",
"Lo": "Other_Letter",
"cntrl": "Control",
"Zs": "Space_Separator",
"Ps": "Open_Punctuation",
"Nl": "Letter_Number",
"Mn": "Nonspacing_Mark",
"Lt": "Titlecase_Letter",
"Cn": "Unassigned",
"Zl": "Line_Separator",
"So": "Other_Symbol",
"Sm": "Math_Symbol",
"Sk": "Modifier_Symbol",
"Sc": "Currency_Symbol",
"Cc": "Control",
"S": "Symbol",
"Pe": "Close_Punctuation",
"No": "Other_Number",
"N": "Number",
"L": "Letter",
"Cf": "Format",
"Pi": "Initial_Punctuation",
"Pf": "Final_Punctuation",
"digit": "Decimal_Number",
"Mc": "Spacing_Mark",
"Ll": "Lowercase_Letter",
"P": "Punctuation",
"Me": "Enclosing_Mark",
"M": "Mark",
"Combining_Mark": "Mark",
"Lm": "Modifier_Letter",
"Cs": "Surrogate",
"C": "Other",
"Zp": "Paragraph_Separator",
"Pd": "Dash_Punctuation",
"Co": "Private_Use"
}
};

Binary file not shown.
Binary file not shown.
1 change: 1 addition & 0 deletions src/vm/js/nqp-runtime/unicode-data/names.js

Large diffs are not rendered by default.

212 changes: 193 additions & 19 deletions src/vm/js/nqp-runtime/unicode-props.js
@@ -1,8 +1,15 @@

const propVals = require('./propVals.js');
const xregexp = require('xregexp');

function matchClass(category, negated) {
let regexp = xregexp('\\' + (negated ? 'P' : 'p') + '{' + category + '}', 'Ay');
const names = require('./unicode-data/names.js');

/**
 * Normalizes a property name or alias to the key form used in the exported
 * `uniprop_*` / `uniprop_not_*` function names: lowercased, with every
 * underscore removed.
 *
 * @param {string} name - property name or alias, e.g. 'White_Space'
 * @returns {string} the normalized key, e.g. 'whitespace'
 */
function mangled(name) {
  // toLowerCase takes no parameters; the argument previously passed was ignored.
  return name.toLowerCase().replace(/_/g, '');
}

function matchClass(shouldMatch, category, negated) {
let regexp = xregexp('\\' + (shouldMatch ? 'p' : 'P') + '{' + category + '}', 'Ay');
return function(target, pos) {
regexp.lastIndex = pos;
if (regexp.test(target)) {
Expand All @@ -14,46 +21,213 @@ function matchClass(category, negated) {
}

let props = {
No: 'No',
Nl: 'Nl',
lower: 'Lowercase',
Letter: 'Letter',
InCyrillicSupplementary: 'InCyrillicSupplement'
upper: 'Uppercase',
Lowercase: 'Lowercase',
Uppercase: 'Uppercase',
'White_Space': 'White_Space',
'space': 'White_Space',
'ASCII': 'ASCII',
'Alpha': 'Alphabetic',
'Alphabetic': 'Alphabetic',
Any: 'Any'
};

for (let prop in props) {
exports['uniprop_' + prop] = matchClass(props[prop], false);
exports['uniprop_not_' + prop] = matchClass(props[prop], true);
exports['uniprop_' + mangled(prop)] = matchClass(true, props[prop]);
exports['uniprop_not_' + mangled(prop)] = matchClass(false, props[prop]);
}


for (let key in propVals.blk) {
if (key === 'NB') {
continue;
}
let alias = 'In' + key;
let long = 'In' + propVals.blk[key];
let alias = mangled('In' + key);

let forXregexp = 'In' + propVals.blk[key];
let long = mangled('In' + propVals.blk[key]);

exports['uniprop_' + alias] = matchClass(long, false);
exports['uniprop_not_' + alias] = matchClass(long, true);
exports['uniprop_' + mangled(alias)] = matchClass(true, forXregexp );
exports['uniprop_not_' + mangled(alias)] = matchClass(false, forXregexp);

if (!(('uniprop_' + long) in exports)) {
exports['uniprop_' + long] = matchClass(long, false);
exports['uniprop_not_' + long] = matchClass(long, true);
exports['uniprop_' + long] = matchClass(true, forXregexp);
exports['uniprop_not_' + long] = matchClass(false, forXregexp);
}
}

for (let alias in propVals.sc) {
if (alias === 'Hrkt' || alias === 'Zzzz') {
continue;
}
let long = propVals.sc[alias];

exports['uniprop_' + alias] = matchClass(long, false);
exports['uniprop_not_' + alias] = matchClass(long, true);
let forXregexp = propVals.sc[alias];
let long = mangled(propVals.sc[alias]);


exports['uniprop_' + mangled(alias)] = matchClass(true, forXregexp );
exports['uniprop_not_' + mangled(alias)] = matchClass(false, forXregexp);

if (!(('uniprop_' + long) in exports)) {
exports['uniprop_' + long] = matchClass(long, false);
exports['uniprop_not_' + long] = matchClass(long, true);
exports['uniprop_' + long] = matchClass(true, forXregexp);
exports['uniprop_not_' + long] = matchClass(false, forXregexp);
}
}

// Export a matcher / negated-matcher pair for every general category listed
// in propVals.gc, keyed by both the short alias and the long name.
for (const shortName in propVals.gc) {
  // LC is not registered here; it is built from derivedRecipes instead.
  if (shortName !== 'LC') {
    const fullName = propVals.gc[shortName];
    // The class name handed to matchClass is the long name without underscores.
    const xregexpName = fullName.replace(/_/g, '');
    const aliasKey = mangled(shortName);
    const longKey = mangled(fullName);

    exports['uniprop_' + aliasKey] = matchClass(true, xregexpName);
    exports['uniprop_not_' + aliasKey] = matchClass(false, xregexpName);

    // Never overwrite a long-name export that an earlier step already set.
    const longAlreadyExported = ('uniprop_' + longKey) in exports;
    if (!longAlreadyExported) {
      exports['uniprop_' + longKey] = matchClass(true, xregexpName);
      exports['uniprop_not_' + longKey] = matchClass(false, xregexpName);
    }
  }
}

/**
 * Turns a list of property/category names into one regex alternation.
 * A name that has an entry in names.regexes (looked up through names.props)
 * contributes that regex source; any other name contributes `\p{name}`.
 *
 * @param {string[]} [categories] - names to combine; a falsy value yields ''
 * @returns {string} the alternatives joined with '|'
 */
function categoriesToRegex(categories) {
  if (!categories) {
    return '';
  }

  const toSource = function(category) {
    const generated = names.regexes[names.props[category]];
    return generated || '\\p{' + category + '}';
  };

  return categories.map(toSource).join('|');
}

/**
 * Builds a matcher for a property defined as a union of categories minus an
 * optional set of excluded categories.
 *
 * @param {boolean} shouldMatch - true for the property, false for its negation
 * @param {string[]} match - categories whose union makes up the property
 * @param {string[]} [avoid] - categories removed from that union
 * @returns {function(string, number): number} matcher returning the number of
 *   string units consumed at `pos`, or -1 when there is no match
 */
function matchDerived(shouldMatch, match, avoid) {
  const wanted = categoriesToRegex(match);
  const unwanted = categoriesToRegex(avoid);
  const matcher = maybeNegated(shouldMatch, wanted, unwanted);

  return function(target, pos) {
    // Start the match attempt at pos.
    matcher.lastIndex = pos;
    return matcher.test(target) ? matcher.lastIndex - pos : -1;
  };
}


// Properties assembled from a union of general categories via matchDerived.
const derivedRecipes = {
  LC: ['Ll', 'Lu', 'Lt'],
};

// Short alias -> long name of each derived property.
const derived = {
  LC: 'Cased_Letter',
};

// Export each derived property under both its alias and its long name; the
// two keys share one matcher function.
for (const short in derived) {
  const recipe = derivedRecipes[short];
  const aliasKey = mangled(short);
  const longKey = mangled(derived[short]);

  const positive = matchDerived(true, recipe);
  exports['uniprop_' + longKey] = positive;
  exports['uniprop_' + aliasKey] = positive;

  const negative = matchDerived(false, recipe);
  exports['uniprop_not_' + longKey] = negative;
  exports['uniprop_not_' + aliasKey] = negative;
}


const UnicodeTrie = require('unicode-trie')
const fs = require('fs')

const nativeArgs = require('./native-args.js');
const NativeStrArg = nativeArgs.NativeStrArg;

const numericTypeData = new UnicodeTrie(fs.readFileSync(__dirname + '/unicode-data/NumericType.trie'));
const bidiClassData = new UnicodeTrie(fs.readFileSync(__dirname + '/unicode-data/BidiClass.trie'));


/**
 * Creates a matcher for a property that is checked against a caller-supplied
 * value (used below for `nt` / `numerictype` and `bc`).
 *
 * The matcher looks up the property value of the code point at `offset` in
 * `trie`, maps it to its name, and passes that name together with `obj` to
 * `cursor['!DELEGATE_ACCEPTS']` to decide whether the character is accepted.
 *
 * @param {boolean} negated - true to build the `uniprop_not_*` variant
 * @param {object} trie - lookup structure whose `get(codePoint)` returns a value id
 * @param {string} propName - key into names.props identifying the property
 * @param {boolean} longNames - use names.propLongValues instead of names.propShortValues
 * @returns {function(ctx, cursor, target, offset, obj): number} matcher returning
 *   the number of UTF-16 code units consumed, or -1 when the match fails
 */
function propWithArgs(negated, trie, propName, longNames) {
  const propId = names.props[propName];
  return function(ctx, cursor, target, offset, obj) {
    const code = target.codePointAt(offset);
    // codePointAt yields undefined when offset is outside the string.
    if (code === undefined) return -1;
    const propValueId = trie.get(code);

    // The name tables are indexed by (value id - 1).
    const valueNames = longNames
      ? names.propLongValues[propId]
      : names.propShortValues[propId];
    const valueName = valueNames[propValueId - 1];

    const accepted = cursor['!DELEGATE_ACCEPTS'](ctx, null, cursor, obj, new NativeStrArg(valueName)).$$getInt();

    if (accepted === (negated ? 1 : 0)) {
      return -1;
    }

    // Offsets count UTF-16 code units, so a code point above U+FFFF (stored
    // as a surrogate pair) consumes two of them.
    return code > 0xFFFF ? 2 : 1;
  };
}

{
  // Properties matched against an argument value:
  // [export suffix, trie, property name, use long value names].
  const valueProps = [
    ['numerictype', numericTypeData, 'nt', true],
    ['nt', numericTypeData, 'nt', false],
    ['bc', bidiClassData, 'bc', false],
  ];

  // Register all plain matchers first, then all negated ones.
  for (const negated of [false, true]) {
    const prefix = negated ? 'uniprop_not_' : 'uniprop_';
    for (const [suffix, trie, propName, longNames] of valueProps) {
      exports[prefix + suffix] = propWithArgs(negated, trie, propName, longNames);
    }
  }
}

/**
 * Compiles the regex used by the property matchers, optionally negated and
 * optionally with a set of code points subtracted from the property.
 *
 * @param {boolean} shouldMatch - true to match the property, false to match
 *   everything that does not have it
 * @param {string} main - regex source for the code points that have the property
 * @param {string} [exclude] - regex source for code points removed from `main`;
 *   undefined or '' means nothing is removed
 * @returns {RegExp} the regex compiled by xregexp with the 'y' and 'A' flags
 */
function maybeNegated(shouldMatch, main, exclude) {
  const hasExclude = exclude !== undefined && exclude !== '';

  let regex;
  if (!hasExclude) {
    regex = shouldMatch ? main : '(?!' + main + ')\\p{Any}';
  } else if (shouldMatch) {
    // `main` is usually a '|'-joined alternation. It must be grouped, or the
    // lookahead would guard only its first alternative and the remaining
    // alternatives would match excluded code points.
    regex = '(?!' + exclude + ')(?:' + main + ')';
  } else {
    // Negation: anything outside `main`, plus everything that was excluded.
    regex = '(?:(?!' + main + ')\\p{Any})|' + exclude;
  }

  return xregexp(regex, 'yA');
}

/**
 * Builds a matcher from a raw regex source string.
 *
 * @param {boolean} shouldMatch - true to match `regexString`, false to match
 *   its negation
 * @param {string} regexString - regex source describing the property
 * @returns {function(string, number): number} matcher returning the number of
 *   string units consumed at `pos`, or -1 when there is no match
 */
function matchRegex(shouldMatch, regexString) {
  const matcher = maybeNegated(shouldMatch, regexString);

  return function(target, pos) {
    // Start the match attempt at pos.
    matcher.lastIndex = pos;
    const matched = matcher.test(target);
    return matched ? matcher.lastIndex - pos : -1;
  };
}

// Reverse index of names.props: property id -> every name mapped to that id.
const propIdToNames = {};
for (const propName in names.props) {
  const propId = names.props[propName];
  if (!propIdToNames[propId]) {
    propIdToNames[propId] = [];
  }
  propIdToNames[propId].push(propName);
}

// For each property that has an entry in names.regexes, export one shared
// matcher pair under every name that maps to that property id.
for (const propId in names.regexes) {
  const source = names.regexes[propId];
  const positive = matchRegex(true, source);
  const negative = matchRegex(false, source);

  propIdToNames[propId].forEach(function(propName) {
    const key = mangled(propName);
    exports['uniprop_' + key] = positive;
    exports['uniprop_not_' + key] = negative;
  });
}


// Math: the Other_Math regex from names.regexes plus the Sm category.
const mathRegex = names.regexes[names.props.Other_Math] + '|\\p{Sm}';

exports.uniprop_math = matchRegex(true, mathRegex);
exports.uniprop_not_math = matchRegex(false, mathRegex);

// Assigned / unassigned are both expressed through the Cn category, so each
// negated form is simply the other property's positive regex.
const unassignedRegex = '\\p{Cn}';
const assignedRegex = '\\P{Cn}';

exports.uniprop_assigned = matchRegex(true, assignedRegex);
exports.uniprop_not_assigned = matchRegex(true, unassignedRegex);

exports.uniprop_unassigned = matchRegex(true, unassignedRegex);
exports.uniprop_not_unassigned = matchRegex(true, assignedRegex);

// ID_Start / ID_Continue: a union of categories with the Pattern_Syntax and
// Pattern_White_Space code points removed.
const idStartCategories = ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Other_ID_Start'];
const idContinueCategories = ['Lu', 'Ll', 'Lt', 'Lm', 'Lo', 'Nl', 'Mn', 'Mc', 'Nd', 'Pc', 'Other_ID_Continue', 'Other_ID_Start'];
const idExcludedCategories = ['Pattern_Syntax', 'Pattern_White_Space'];

exports.uniprop_idstart = matchDerived(true, idStartCategories, idExcludedCategories);
exports.uniprop_not_idstart = matchDerived(false, idStartCategories, idExcludedCategories);

exports.uniprop_idcontinue = matchDerived(true, idContinueCategories, idExcludedCategories);
exports.uniprop_not_idcontinue = matchDerived(false, idContinueCategories, idExcludedCategories);

0 comments on commit cfda09a

Please sign in to comment.