-
Notifications
You must be signed in to change notification settings - Fork 22
/
__init__.py
1834 lines (1489 loc) · 61.3 KB
/
__init__.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
#
# Copyright (c) nexB Inc. and others. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
# See http://www.apache.org/licenses/LICENSE-2.0 for the license text.
# See https://github.com/nexB/license-expression for support or download.
# See https://aboutcode.org for more information about nexB OSS projects.
#
"""
This module defines a mini language to parse, validate, deduplicate, simplify,
normalize and compare license expressions using a boolean logic engine.
This supports SPDX and ScanCode license expressions and also accepts other
license naming conventions and license identifiers aliases to recognize and
normalize licenses.
Using boolean logic, license expressions can be tested for equality,
containment, equivalence and can be normalized, deduplicated or simplified.
The main entry point is the Licensing object.
"""
import itertools
import json
import re
import string
from collections import defaultdict
from collections import deque
from collections import namedtuple
from copy import copy
from copy import deepcopy
from functools import total_ordering
from os.path import abspath
from os.path import dirname
from os.path import join
import boolean
from boolean import Expression as LicenseExpression
# note these may not all be used here but are imported here to avoid leaking
# boolean.py constants to callers
from boolean.boolean import PARSE_ERRORS
from boolean.boolean import PARSE_INVALID_EXPRESSION
from boolean.boolean import PARSE_INVALID_NESTING
from boolean.boolean import PARSE_INVALID_OPERATOR_SEQUENCE
from boolean.boolean import PARSE_INVALID_SYMBOL_SEQUENCE
from boolean.boolean import PARSE_UNBALANCED_CLOSING_PARENS
from boolean.boolean import PARSE_UNKNOWN_TOKEN
from boolean.boolean import ParseError
from boolean.boolean import TOKEN_SYMBOL
from boolean.boolean import TOKEN_AND
from boolean.boolean import TOKEN_OR
from boolean.boolean import TOKEN_LPAR
from boolean.boolean import TOKEN_RPAR
from license_expression._pyahocorasick import Trie as AdvancedTokenizer
from license_expression._pyahocorasick import Token
# Location of this module and of its vendored data directory.
curr_dir = dirname(abspath(__file__))
data_dir = join(curr_dir, 'data')

# Vendored copy of the ScanCode LicenseDB JSON index used as the default
# source of known license keys.
# See https://scancode-licensedb.aboutcode.org/index.json
vendored_scancode_licensedb_index_location = join(
    data_dir,
    'scancode-licensedb-index.json',
)
# append new error codes to PARSE_ERRORS by monkey patching.
# Each assignment is guarded with a membership test so that re-importing this
# module (or another Licensing-like library doing the same) stays idempotent.
PARSE_EXPRESSION_NOT_UNICODE = 100
if PARSE_EXPRESSION_NOT_UNICODE not in PARSE_ERRORS:
    PARSE_ERRORS[PARSE_EXPRESSION_NOT_UNICODE] = (
        'Expression string must be a string.'
    )

PARSE_INVALID_EXCEPTION = 101
if PARSE_INVALID_EXCEPTION not in PARSE_ERRORS:
    PARSE_ERRORS[PARSE_INVALID_EXCEPTION] = (
        'A license exception symbol can only be used as an exception '
        'in a "WITH exception" statement.'
    )

PARSE_INVALID_SYMBOL_AS_EXCEPTION = 102
if PARSE_INVALID_SYMBOL_AS_EXCEPTION not in PARSE_ERRORS:
    PARSE_ERRORS[PARSE_INVALID_SYMBOL_AS_EXCEPTION] = (
        'A plain license symbol cannot be used as an exception '
        'in a "WITH symbol" statement.'
    )

PARSE_INVALID_SYMBOL = 103
if PARSE_INVALID_SYMBOL not in PARSE_ERRORS:
    PARSE_ERRORS[PARSE_INVALID_SYMBOL] = (
        'A proper license symbol is needed.'
    )
class ExpressionError(Exception):
    """
    Base exception raised for errors in a license expression, such as an
    invalid type, unknown license keys or misused exception symbols.
    """
    pass
class ExpressionParseError(ParseError, ExpressionError):
    """
    Raised when a license expression cannot be parsed. Combines the
    boolean.py ParseError details (token type, string, position, error code)
    with the ExpressionError hierarchy of this module.
    """
    pass
# Used for tokenizing
Keyword = namedtuple('Keyword', 'value type')
Keyword.__len__ = lambda self: len(self.value)
# id for the "WITH" token which is not a proper boolean symbol but an expression
# symbol
TOKEN_WITH = 10
# keyword types that include operators and parens
KW_LPAR = Keyword('(', TOKEN_LPAR)
KW_RPAR = Keyword(')', TOKEN_RPAR)
KW_AND = Keyword('and', TOKEN_AND)
KW_OR = Keyword('or', TOKEN_OR)
KW_WITH = Keyword('with', TOKEN_WITH)
KEYWORDS = (KW_AND, KW_OR, KW_LPAR, KW_RPAR, KW_WITH,)
KEYWORDS_STRINGS = set(kw.value for kw in KEYWORDS)
# mapping of lowercase operator strings to an operator object
OPERATORS = {'and': KW_AND, 'or': KW_OR, 'with': KW_WITH}
_simple_tokenizer = re.compile(r'''
(?P<symop>[^\s\(\)]+)
|
(?P<space>\s+)
|
(?P<lpar>\()
|
(?P<rpar>\))
''',
re.VERBOSE | re.MULTILINE | re.UNICODE
).finditer
class ExpressionInfo:
    """
    Carry the result of validating a single license expression with
    Licensing.validate().

    Fields:

    - original_expression: str. The license expression exactly as it was
      passed into Licensing.validate().
    - normalized_expression: str. Set only when the expression passed into
      `validate()` turned out to be valid.
    - errors: list. Error messages collected while validating the license
      expression, if any.
    - invalid_symbols: list. License keys or symbols from the expression that
      are invalid — unknown, used in the wrong context, or part of a syntax
      error.
    """

    def __init__(
        self,
        original_expression,
        normalized_expression=None,
        errors=None,
        invalid_symbols=None,
    ):
        self.original_expression = original_expression
        self.normalized_expression = normalized_expression
        self.errors = errors or []
        self.invalid_symbols = invalid_symbols or []

    def __repr__(self):
        # Assemble a multi-line repr listing each field on its own line.
        pieces = (
            'ExpressionInfo(\n',
            f' original_expression={self.original_expression!r},\n',
            f' normalized_expression={self.normalized_expression!r},\n',
            f' errors={self.errors!r},\n',
            f' invalid_symbols={self.invalid_symbols!r}\n',
            ')',
        )
        return ''.join(pieces)
class Licensing(boolean.BooleanAlgebra):
    """
    Licensing defines a mini language to parse, validate and compare license
    expressions. This is the main entry point in this library.

    Some of the features are:

    - licenses can be validated against user-provided lists of known licenses
      "symbols" (such as ScanCode licenses or the SPDX list).

    - flexible expression parsing and recognition of licenses (including
      licenses with spaces and keywords (such as AND, OR WITH) or parens in
      their names).

    - in an expression licenses can be more than just identifiers such as short
      or long names with spaces, symbols and even parenthesis.

    - A license can have multiple aliases (such as GPL-2.0, GPLv2 or GPL2) and
      each will be properly recognized when parsing. The expression is rendered
      normalized using the canonical license keys.

    - expressions can be deduplicated, simplified, normalized, sorted and
      compared for containment and/or logical equivalence thanks to a built-in
      boolean logic engine.

    - Once parsed, expressions can be rendered using simple templates (for
      instance to render as HTML links in a web UI).

    For example::

        >>> l = Licensing()
        >>> expr = l.parse(" GPL-2.0 or LGPL-2.1 and mit ")
        >>> expected = 'GPL-2.0 OR (LGPL-2.1 AND mit)'
        >>> assert expected == expr.render('{symbol.key}')

        >>> expected = [
        ...   LicenseSymbol('GPL-2.0'),
        ...   LicenseSymbol('LGPL-2.1'),
        ...   LicenseSymbol('mit')
        ... ]
        >>> assert expected == l.license_symbols(expr)

        >>> symbols = ['GPL-2.0+', 'Classpath', 'BSD']
        >>> l = Licensing(symbols)
        >>> expression = 'GPL-2.0+ with Classpath or (bsd)'
        >>> parsed = l.parse(expression)
        >>> expected = 'GPL-2.0+ WITH Classpath OR BSD'
        >>> assert expected == parsed.render('{symbol.key}')

        >>> expected = [
        ...   LicenseSymbol('GPL-2.0+'),
        ...   LicenseSymbol('Classpath'),
        ...   LicenseSymbol('BSD')
        ... ]
        >>> assert expected == l.license_symbols(parsed)
        >>> assert expected == l.license_symbols(expression)
    """

    def __init__(self, symbols=tuple(), quiet=True):
        """
        Initialize a Licensing with an optional ``symbols`` sequence of
        LicenseSymbol or LicenseSymbol-like objects or license key strings. If
        provided and this list data is invalid, raise a ValueError. Print
        warnings and errors found in the symbols unless ``quiet`` is True.
        """
        super(Licensing, self).__init__(
            Symbol_class=LicenseSymbol,
            AND_class=AND,
            OR_class=OR,
        )

        # FIXME: this should be instead a super class of all symbols
        self.LicenseSymbol = self.Symbol

        symbols = symbols or tuple()
        if symbols:
            symbols = tuple(as_symbols(symbols))
            warns, errors = validate_symbols(symbols)
            if warns and not quiet:
                for w in warns:
                    print(w)
            if errors and not quiet:
                for e in errors:
                    print(e)
            if errors:
                raise ValueError('\n'.join(warns + errors))

        # mapping of known symbol key to symbol for reference
        self.known_symbols = {
            symbol.key: symbol
            for symbol in symbols
        }

        # mapping of known symbol lowercase key to symbol for reference
        self.known_symbols_lowercase = {
            symbol.key.lower(): symbol
            for symbol in symbols
        }

        # Aho-Corasick automaton-based Advanced Tokenizer, built lazily by
        # get_advanced_tokenizer().
        # NOTE(review): this instance attribute has the same name as the
        # advanced_tokenizer() method below and shadows it once set, making
        # that method unreachable through normal attribute lookup — consider
        # renaming one of the two.
        self.advanced_tokenizer = None

    def is_equivalent(self, expression1, expression2, **kwargs):
        """
        Return True if both ``expression1`` and ``expression2``
        LicenseExpression objects are equivalent. If a string is provided, it
        will be parsed and simplified. Extra ``kwargs`` are passed down to the
        parse() function.

        Raise ExpressionError on parse errors.
        """
        ex1 = self._parse_and_simplify(expression1, **kwargs)
        ex2 = self._parse_and_simplify(expression2, **kwargs)
        return ex1 == ex2

    def contains(self, expression1, expression2, **kwargs):
        """
        Return True if ``expression1`` contains ``expression2``, where each
        expression is either a string or a LicenseExpression object. If a
        string is provided, it will be parsed and simplified.

        Extra ``kwargs`` are passed down to the parse() function.
        """
        ex1 = self._parse_and_simplify(expression1, **kwargs)
        ex2 = self._parse_and_simplify(expression2, **kwargs)
        return ex2 in ex1

    def _parse_and_simplify(self, expression, **kwargs):
        # Parse then simplify an expression; return None for empty input.
        expression = self.parse(expression, **kwargs)
        if expression is None:
            return None

        if not isinstance(expression, LicenseExpression):
            raise TypeError(
                f'expression must be LicenseExpression object: {expression!r}'
            )

        return expression.simplify()

    def license_symbols(self, expression, unique=True, decompose=True, **kwargs):
        """
        Return a list of LicenseSymbol objects used in an expression in the
        same order as they first appear in the expression tree.

        ``expression`` is either a string or a LicenseExpression object.
        If a string is provided, it will be parsed.

        If ``unique`` is True only return unique symbols.

        If ``decompose`` is True then composite LicenseWithExceptionSymbol
        instances are not returned directly; instead their underlying license
        and exception symbols are returned.

        Extra ``kwargs`` are passed down to the parse() function.

        For example:

        >>> l = Licensing()
        >>> expected = [
        ...   LicenseSymbol('GPL-2.0'),
        ...   LicenseSymbol('LGPL-2.1+')
        ... ]
        >>> result = l.license_symbols(l.parse('GPL-2.0 or LGPL-2.1+'))
        >>> assert expected == result
        """
        expression = self.parse(expression, **kwargs)
        if expression is None:
            return []

        # only keep literals that are license symbols, not bare booleans
        symbols = (s for s in expression.get_literals() if isinstance(s, BaseSymbol))
        if decompose:
            symbols = itertools.chain.from_iterable(s.decompose() for s in symbols)

        if unique:
            symbols = ordered_unique(symbols)

        return list(symbols)

    def primary_license_symbol(self, expression, decompose=True, **kwargs):
        """
        Return the left-most license symbol of an ``expression`` or None.
        ``expression`` is either a string or a LicenseExpression object.

        If ``decompose`` is True, only the left-hand license symbol of a
        decomposed LicenseWithExceptionSymbol symbol will be returned if this
        is the left most member. Otherwise a composite
        LicenseWithExceptionSymbol is returned in this case.

        Extra ``kwargs`` are passed down to the parse() function.
        """
        symbols = self.license_symbols(expression, decompose=decompose, **kwargs)
        if symbols:
            return symbols[0]

    def primary_license_key(self, expression, **kwargs):
        """
        Return the left-most license key of an ``expression`` or None. The
        underlying symbols are decomposed.

        ``expression`` is either a string or a LicenseExpression object.

        Extra ``kwargs`` are passed down to the parse() function.
        """
        prim = self.primary_license_symbol(
            expression=expression,
            decompose=True,
            **kwargs,
        )
        if prim:
            return prim.key

    def license_keys(self, expression, unique=True, **kwargs):
        """
        Return a list of licenses keys used in an ``expression`` in the same
        order as they first appear in the expression. ``expression`` is either
        a string or a LicenseExpression object.

        If ``unique`` is True only return unique symbols.

        Extra ``kwargs`` are passed down to the parse() function.

        For example:

        >>> l = Licensing()
        >>> expr = ' GPL-2.0 and mit+ with blabla and mit or LGPL-2.1 and mit and mit+ with GPL-2.0'
        >>> expected = ['GPL-2.0', 'mit+', 'blabla', 'mit', 'LGPL-2.1']
        >>> assert expected == l.license_keys(l.parse(expr))
        """
        symbols = self.license_symbols(
            expression=expression,
            unique=False,
            decompose=True,
            **kwargs,
        )
        return self._keys(symbols, unique)

    def _keys(self, symbols, unique=True):
        # Extract the key string of each symbol, optionally deduplicated.
        keys = [ls.key for ls in symbols]
        # note: we only apply this on bare keys strings as we can have the same
        # symbol used as symbol or exception if we are not in strict mode
        if unique:
            keys = ordered_unique(keys)
        return keys

    def unknown_license_symbols(self, expression, unique=True, **kwargs):
        """
        Return a list of unknown license symbols used in an ``expression`` in
        the same order as they first appear in the ``expression``.
        ``expression`` is either a string or a LicenseExpression object.

        If ``unique`` is True only return unique symbols.

        Extra ``kwargs`` are passed down to the parse() function.
        """
        symbols = self.license_symbols(
            expression=expression,
            unique=unique,
            decompose=True,
            **kwargs,
        )
        return [ls for ls in symbols if not ls.key in self.known_symbols]

    def unknown_license_keys(self, expression, unique=True, **kwargs):
        """
        Return a list of unknown licenses keys used in an ``expression`` in
        the same order as they first appear in the ``expression``.

        ``expression`` is either a string or a LicenseExpression object.
        If a string is provided, it will be parsed.

        If ``unique`` is True only return unique keys.

        Extra ``kwargs`` are passed down to the parse() function.
        """
        symbols = self.unknown_license_symbols(
            expression=expression,
            unique=False,
            **kwargs,
        )
        return self._keys(symbols, unique)

    def validate_license_keys(self, expression):
        """
        Raise an ExpressionError listing the unknown license keys found in
        ``expression``, if any.
        """
        unknown_keys = self.unknown_license_keys(expression, unique=True)
        if unknown_keys:
            msg = 'Unknown license key(s): {}'.format(', '.join(unknown_keys))
            raise ExpressionError(msg)

    def parse(
        self,
        expression,
        validate=False,
        strict=False,
        simple=False,
        **kwargs
    ):
        """
        Return a new license LicenseExpression object by parsing a license
        ``expression``. Check that the ``expression`` syntax is valid and
        raise an ExpressionError or an ExpressionParseError on errors.
        Return None for empty expressions.

        ``expression`` is either a string or a LicenseExpression object. If
        ``expression`` is a LicenseExpression it is returned as-is.

        Symbols are always recognized from known Licensing symbols if
        `symbols` were provided at Licensing creation time: each license and
        exception is recognized from known license keys (and from aliases for
        a symbol if available).

        If ``validate`` is True and a license is unknown, an ExpressionError
        error is raised with a message listing the unknown license keys.

        If ``validate`` is False, no error is raised if the ``expression``
        syntax is correct. You can further call the `unknown_license_keys()`
        or `unknown_license_symbols()` methods to get unknown license keys or
        symbols found in the parsed LicenseExpression.

        If ``strict`` is True, an ExpressionError will be raised if in a
        "WITH" expression such as "XXX with ZZZ" the XXX symbol has
        `is_exception` set to True or the ZZZ symbol has `is_exception` set to
        False. This checks that symbols are used strictly as intended in a
        "WITH" subexpression using a license on the left and an exception on
        the right.

        If ``simple`` is True, parsing will use a simple tokenizer that
        assumes that license symbols are all license keys and do not contain
        spaces.

        For example:

        >>> expression = 'EPL-1.0 and Apache-1.1 OR GPL-2.0 with Classpath-exception'
        >>> parsed = Licensing().parse(expression)
        >>> expected = '(EPL-1.0 AND Apache-1.1) OR GPL-2.0 WITH Classpath-exception'
        >>> assert expected == parsed.render(template='{symbol.key}')
        """
        if expression is None:
            return

        if isinstance(expression, LicenseExpression):
            return expression

        if isinstance(expression, bytes):
            try:
                # NOTE(review): str() on bytes does not decode; it yields the
                # "b'...'" repr — presumably .decode() was intended; confirm.
                expression = str(expression)
            except:
                ext = type(expression)
                raise ExpressionError(
                    f'expression must be a string and not: {ext!r}'
                )

        if not isinstance(expression, str):
            ext = type(expression)
            raise ExpressionError(
                f'expression must be a string and not: {ext!r}'
            )

        if not expression or not expression.strip():
            return
        try:
            # this will raise a ParseError on errors
            tokens = list(self.tokenize(
                expression=expression,
                strict=strict,
                simple=simple,
            ))
            expression = super(Licensing, self).parse(tokens)
        except ParseError as e:
            # re-raise as our own subclass, chaining the original error
            raise ExpressionParseError(
                token_type=e.token_type,
                token_string=e.token_string,
                position=e.position,
                error_code=e.error_code,
            ) from e

        if not isinstance(expression, LicenseExpression):
            raise ExpressionError(
                'expression must be a LicenseExpression once parsed.')

        if validate:
            self.validate_license_keys(expression)

        return expression

    def tokenize(self, expression, strict=False, simple=False):
        """
        Return an iterable of 3-tuple describing each token given an
        ``expression`` string. See boolean.BooleanAlgebra.tokenize() for API
        details.

        This 3-tuple contains these items: (token, token string, position):
        - token: either a Symbol instance or one of TOKEN_* token types.
        - token string: the original token string.
        - position: the starting index of the token string in the `expr`
          string.

        If ``strict`` is True, additional exceptions will be raised in a
        expression such as "XXX with ZZZ" if the XXX symbol has
        `is_exception` set to True or the ZZZ symbol has `is_exception` set
        to False.

        If ``simple`` is True, use a simple tokenizer that assumes that
        license symbols are all license keys that do not contain spaces.
        """
        if not expression:
            return

        if not isinstance(expression, str):
            raise ParseError(error_code=PARSE_EXPRESSION_NOT_UNICODE)

        if simple:
            tokens = self.simple_tokenizer(expression)
        else:
            advanced_tokenizer = self.get_advanced_tokenizer()
            tokens = advanced_tokenizer.tokenize(expression)

        # Assign symbol for unknown tokens
        tokens = build_symbols_from_unknown_tokens(tokens)

        # skip whitespace-only tokens
        tokens = (t for t in tokens if t.string and t.string.strip())

        # create atomic LicenseWithExceptionSymbol from WITH subexpressions
        tokens = replace_with_subexpression_by_license_symbol(tokens, strict)

        # finally yield the actual args expected by the boolean parser
        for token in tokens:
            pos = token.start
            token_string = token.string
            token_value = token.value

            if isinstance(token_value, BaseSymbol):
                token_obj = token_value
            elif isinstance(token_value, Keyword):
                token_obj = token_value.type
            else:
                raise ParseError(error_code=PARSE_INVALID_EXPRESSION)

            yield token_obj, token_string, pos

    def get_advanced_tokenizer(self):
        """
        Return an AdvancedTokenizer instance for this Licensing either cached
        or created as needed.

        If symbols were provided when this Licensing object was created, the
        tokenizer will recognize known symbol keys and aliases (ignoring case)
        when tokenizing expressions.

        A license symbol is any string separated by keywords and parens (and
        it can include spaces).
        """
        if self.advanced_tokenizer is not None:
            return self.advanced_tokenizer

        # build and cache the Aho-Corasick trie of keywords, keys and aliases
        self.advanced_tokenizer = tokenizer = AdvancedTokenizer()

        add_item = tokenizer.add
        for keyword in KEYWORDS:
            add_item(keyword.value, keyword)

        # self.known_symbols has been created at Licensing initialization time
        # and is already validated and trusted here
        for key, symbol in self.known_symbols.items():
            # always use the key even if there are no aliases.
            add_item(key, symbol)
            aliases = getattr(symbol, 'aliases', [])
            for alias in aliases:
                # normalize spaces for each alias. The AdvancedTokenizer will
                # lowercase them
                if alias:
                    alias = ' '.join(alias.split())
                    add_item(alias, symbol)

        tokenizer.make_automaton()
        return tokenizer

    def advanced_tokenizer(self, expression):
        """
        Return an iterable of Token from an ``expression`` string.

        NOTE(review): this method is shadowed by the instance attribute of the
        same name assigned in __init__ and get_advanced_tokenizer(), so it
        cannot be reached through normal attribute lookup — consider renaming.
        """
        tokenizer = self.get_advanced_tokenizer()
        return tokenizer.tokenize(expression)

    def simple_tokenizer(self, expression):
        """
        Return an iterable of Token from an ``expression`` string.

        The split is done on spaces, keywords and parens. Anything else is a
        symbol token, e.g. a typically license key or license id (that
        contains no spaces or parens).

        If symbols were provided when this Licensing object was created, the
        tokenizer will recognize known symbol keys (ignoring case) when
        tokenizing expressions.
        """
        symbols = self.known_symbols_lowercase or {}

        for match in _simple_tokenizer(expression):
            if not match:
                continue
            # set start and end as string indexes
            start, end = match.span()
            end = end - 1

            match_getter = match.groupdict().get

            space = match_getter('space')
            if space:
                yield Token(start, end, space, None)

            lpar = match_getter('lpar')
            if lpar:
                yield Token(start, end, lpar, KW_LPAR)

            rpar = match_getter('rpar')
            if rpar:
                yield Token(start, end, rpar, KW_RPAR)

            sym_or_op = match_getter('symop')
            if sym_or_op:
                sym_or_op_lower = sym_or_op.lower()
                operator = OPERATORS.get(sym_or_op_lower)
                if operator:
                    yield Token(start, end, sym_or_op, operator)
                else:
                    # unknown strings become plain license symbols
                    sym = symbols.get(sym_or_op_lower)
                    if not sym:
                        sym = LicenseSymbol(key=sym_or_op)
                    yield Token(start, end, sym_or_op, sym)

    def dedup(self, expression):
        """
        Return a deduplicated LicenseExpression given a license ``expression``
        string or LicenseExpression object.

        The deduplication process is similar to simplification but is
        specialized for working with license expressions. Simplification is
        otherwise a generic boolean operation that is not aware of the
        specifics of license expressions.

        The deduplication:

        - Does not sort the licenses of sub-expression in an expression. They
          stay in the same order as in the original expression.

        - Choices (as in "MIT or GPL") are kept as-is and not treated as
          simplifiable. This avoids dropping important choice options in
          complex expressions which is never desirable.
        """
        exp = self.parse(expression)
        expressions = []
        for arg in exp.args:
            if isinstance(arg, (self.AND, self.OR,)):
                # Run this recursive function if there is another AND/OR
                # expression and add the expression to the expressions list.
                expressions.append(self.dedup(arg))
            else:
                expressions.append(arg)

        if isinstance(exp, BaseSymbol):
            deduped = exp
        elif isinstance(exp, (self.AND, self.OR,)):
            relation = exp.__class__.__name__
            deduped = combine_expressions(
                expressions,
                relation=relation,
                unique=True,
                licensing=self,
            )
        else:
            raise ExpressionError(f'Unknown expression type: {expression!r}')

        return deduped

    def validate(self, expression, strict=True, **kwargs):
        """
        Return a ExpressionInfo object that contains information about
        the validation of an ``expression`` license expression string.

        If the syntax and license keys of ``expression`` is valid, then
        `ExpressionInfo.normalized_license_expression` is set.

        If an error was encountered when validating ``expression``,
        `ExpressionInfo.errors` will be populated with strings containing the
        error message that has occurred. If an error has occurred due to
        unknown license keys or an invalid license symbol, the offending keys
        or symbols will be present in `ExpressionInfo.invalid_symbols`.

        If ``strict`` is True, validation error messages will be included if
        in a "WITH" expression such as "XXX with ZZZ" the XXX symbol has
        `is_exception` set to True or the ZZZ symbol has `is_exception` set
        to False. This checks that exception symbols are used strictly as
        intended on the right side of a "WITH" statement.
        """
        expression_info = ExpressionInfo(original_expression=str(expression))

        # Check `expression` type and syntax
        try:
            parsed_expression = self.parse(expression, strict=strict)
        except ExpressionError as e:
            expression_info.errors.append(str(e))
            # NOTE(review): a plain ExpressionError carries no token_string
            # attribute (only ExpressionParseError does) — verify this line
            # cannot raise AttributeError for non-parse errors.
            expression_info.invalid_symbols.append(e.token_string)
            return expression_info

        # Check `expression` keys (validate)
        try:
            self.validate_license_keys(expression)
        except ExpressionError as e:
            expression_info.errors.append(str(e))
            unknown_keys = self.unknown_license_keys(expression)
            expression_info.invalid_symbols.extend(unknown_keys)
            return expression_info

        # If we have not hit an exception, set `normalized_expression` in
        # `expression_info` only if we did not encounter any errors
        # along the way
        if not expression_info.errors and not expression_info.invalid_symbols:
            expression_info.normalized_expression = str(parsed_expression)

        return expression_info
def get_scancode_licensing(
    license_index_location=vendored_scancode_licensedb_index_location
):
    """
    Return a Licensing built from ScanCode license keys loaded from the
    license db JSON index file at ``license_index_location``.
    See https://scancode-licensedb.aboutcode.org/index.json
    """
    license_index = get_license_index(license_index_location)
    return build_licensing(license_index)
def get_spdx_licensing(
    license_index_location=vendored_scancode_licensedb_index_location
):
    """
    Return a Licensing built from SPDX license keys loaded from the license
    db JSON index file at ``license_index_location``.
    See https://scancode-licensedb.aboutcode.org/index.json
    """
    license_index = get_license_index(license_index_location)
    return build_spdx_licensing(license_index)
def get_license_index(
    license_index_location=vendored_scancode_licensedb_index_location
):
    """
    Return a list of mappings with license key information loaded from the
    JSON file at ``license_index_location``.

    The default location points to a vendored copy of the license index from
    https://scancode-licensedb.aboutcode.org/
    """
    with open(license_index_location) as license_index_file:
        return json.load(license_index_file)
def load_licensing_from_license_index(license_index):
    """
    Return a Licensing loaded with the license symbols built from a
    ``license_index`` list of license mappings.
    """
    symbols = [LicenseSymbol(**license_info) for license_info in license_index]
    return Licensing(symbols)
def build_licensing(license_index):
    """
    Return a Licensing loaded with license keys and attributes from a
    ``license_index`` list of simple ScanCode license mappings, skipping
    deprecated licenses.
    """
    license_infos = []
    for entry in license_index:
        if entry.get('is_deprecated', False):
            continue
        license_infos.append({
            'key': entry.get('license_key', ''),
            'is_exception': entry.get('is_exception', ''),
        })
    return load_licensing_from_license_index(license_infos)
def build_spdx_licensing(license_index):
    """
    Return a Licensing loaded with license keys and attributes from a
    ``license_index`` list of simple SPDX license mappings, skipping entries
    without an SPDX license key and deprecated entries.
    """
    # Massage data such that the SPDX license key is the primary license key
    license_infos = []
    for entry in license_index:
        if not entry.get('spdx_license_key'):
            continue
        if entry.get('is_deprecated', False):
            continue
        license_infos.append({
            'key': entry.get('spdx_license_key', ''),
            'aliases': entry.get('other_spdx_license_keys', []),
            'is_exception': entry.get('is_exception', ''),
        })
    return load_licensing_from_license_index(license_infos)
def build_symbols_from_unknown_tokens(tokens):
    """
    Yield Token given a ``tokens`` sequence of Token replacing unmatched
    contiguous tokens by a single token with a LicenseSymbol.
    """
    tokens = list(tokens)

    # accumulator of contiguous unmatched tokens awaiting consolidation
    unmatched = deque()

    def build_token_with_symbol():
        """
        Yield a single new Token with a LicenseSymbol built from the
        accumulated unmatched tokens (if any), followed by the trailing
        whitespace tokens that were stripped off. Clears the accumulator.
        """
        if not unmatched:
            return
        # strip trailing spaces
        trailing_spaces = []
        while unmatched and not unmatched[-1].string.strip():
            trailing_spaces.append(unmatched.pop())

        if unmatched:
            # join the unmatched token strings into one symbol string
            string = ' '.join(t.string for t in unmatched if t.string.strip())
            start = unmatched[0].start
            end = unmatched[-1].end
            toksym = LicenseSymbol(string)
            unmatched.clear()
            yield Token(start, end, string, toksym)

        for ts in trailing_spaces:
            yield ts

    for tok in tokens:
        if tok.value:
            # tok was matched (keyword or known symbol): flush any pending
            # unmatched run first, then pass the matched token through
            for symtok in build_token_with_symbol():
                yield symtok
            yield tok
        else:
            if not unmatched and not tok.string.strip():
                # skip leading spaces
                yield tok
            else:
                unmatched.append(tok)

    # end remainders
    for symtok in build_token_with_symbol():
        yield symtok
def build_token_groups_for_with_subexpression(tokens):
    """
    Yield tuples of Token given a ``tokens`` sequence of Token such that:
    - all "XXX WITH YYY" sequences of 3 tokens are grouped in a three-tuple
    - single tokens are just wrapped in a tuple for consistency.
    """
    # if n-1 is sym, n is with and n+1 is sym: yield this as a group for a
    # with exp. otherwise: yield each single token as a group
    tokens = list(tokens)

    # check three contiguous tokens that may form a "lic WITH exception"
    # sequence
    triple_len = 3

    # shortcut if no grouping is possible
    if len(tokens) < triple_len:
        for tok in tokens:
            yield (tok,)
        return

    # accumulate three contiguous tokens in a sliding window
    triple = deque()
    triple_popleft = triple.popleft
    triple_clear = triple.clear
    tripple_append = triple.append

    for tok in tokens:
        if len(triple) == triple_len:
            if is_with_subexpression(triple):
                # emit the whole window as one WITH group and restart
                yield tuple(triple)
                triple_clear()
            else:
                # slide the window: emit only the oldest token
                prev_tok = triple_popleft()
                yield (prev_tok,)
        tripple_append(tok)

    # end remainders: at most three buffered tokens are left
    if triple:
        if len(triple) == triple_len and is_with_subexpression(triple):
            yield tuple(triple)
        else:
            for tok in triple:
                yield (tok,)
def is_with_subexpression(tokens_tripple):
    """
    Return True if the ``tokens_tripple`` triple of Token objects forms a
    "license WITH exception" sub-expression.
    """
    license_token, with_token, exception_token = tokens_tripple
    if not isinstance(license_token.value, LicenseSymbol):
        return False
    if with_token.value != KW_WITH:
        return False
    return isinstance(exception_token.value, LicenseSymbol)
def replace_with_subexpression_by_license_symbol(tokens, strict=False):
"""
Given a ``tokens`` iterable of Token, yield updated Token(s) replacing any
"XXX WITH ZZZ" subexpression by a LicenseWithExceptionSymbol symbol.
Check validity of WITH subexpessions and raise ParseError on errors.
If ``strict`` is True also raise ParseError if the left hand side
LicenseSymbol has `is_exception` True or if the right hand side
LicenseSymbol has `is_exception` False.
"""
token_groups = build_token_groups_for_with_subexpression(tokens)
for token_group in token_groups:
len_group = len(token_group)
if not len_group: