Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Merge pull request #509 from nexB/358-copyright-years
#358 Rework proper pattern for YR year ranges POS
  • Loading branch information
pombredanne committed Feb 21, 2017
2 parents dbd1b33 + 0094182 commit f1c609a
Show file tree
Hide file tree
Showing 11 changed files with 171 additions and 126 deletions.
129 changes: 117 additions & 12 deletions src/cluecode/copyrights.py
@@ -1,5 +1,5 @@
#
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
Expand Down Expand Up @@ -77,7 +77,7 @@ def detect(location):
"""
Return lists of detected copyrights, authors, years and holders
in file at location.
Deprecated legacy entry point.
WARNING: Deprecated legacy entry point.
"""
copyrights = []
copyrights_extend = copyrights.extend
Expand All @@ -96,8 +96,69 @@ def detect(location):
return copyrights, authors, years, holders


# Regex for one four-digit year, as a single capturing group.
# Only years from 1960 through 2019 are recognized.
_YEAR = '(' + '|'.join([
    '19[6-9][0-9]',  # 1960 to 1999
    '20[0-1][0-9]',  # 2000 to 2019
]) + ')'

# Regex for a two-digit abbreviated year, as a single capturing group.
# Covers 60-99 (for 1960-1999) and 00-19 (for 2000-2019).
_YEAR_SHORT = '(' + '|'.join([
    '[6-9][0-9]',  # 19-60 to 19-99
    '[0-1][0-9]',  # 20-00 to 20-19
]) + ')'

# Regex for a year range whose second year is abbreviated: a four-digit
# year, one or more separators (dot, comma or dash), then a one- or
# two-digit year. The whole alternation is a single capturing group.
# Every fragment is a raw string so the regex escapes (\. and \-) are not
# treated as (invalid) Python string escapes.
_YEAR_YEAR = (
    r'('
    # Every branch uses the same separator run "[\.,\-]+"; this first
    # branch must too, otherwise "1960-99" cannot match.
    r'19[6-9][0-9][\.,\-]+[6-9][0-9]'  # 1960-99
    r'|'
    r'19[6-9][0-9][\.,\-]+[0-9]'  # 1998-9
    r'|'
    r'20[0-1][0-9][\.,\-]+[0-1][0-9]'  # 2001-16 or 2012-04
    r'|'
    r'200[0-9][\.,\-]+[0-9]'  # 2001-4 not 2012
    r')'
)

# Regex for an optional run of "punctuation" around years: zero or more
# repetitions of either one character from the class below or the literal
# text "&nbsp".
# Every fragment is a raw string so that regex escapes such as \W or \D are
# not treated as (invalid) Python string escapes; the resulting pattern text
# is unchanged.
# NOTE(review): \D alone already matches every non-digit character, so it
# subsumes \W, "_", "i" and "?" -- the class effectively means "any
# non-digit". Confirm this breadth is intended.
_PUNCT = (
    r'('
    r'['
    r'\W'  # not a word (word includes underscore)
    r'\D'  # not a digit
    r'\_'  # underscore
    r'i'  # oddity
    r'\?'
    r']'
    r'|'
    r'\&nbsp'  # HTML entities are sometimes double-escaped
    r')*'  # repeated 0 or more times
)


# Each base year pattern followed by optional trailing punctuation.
_YEAR_PUNCT = ''.join([_YEAR, _PUNCT])
_YEAR_YEAR_PUNCT = ''.join([_YEAR_YEAR, _PUNCT])
_YEAR_SHORT_PUNCT = ''.join([_YEAR_SHORT, _PUNCT])

# Either a single year or an abbreviated year range, each with its
# optional trailing punctuation.
_YEAR_OR_YEAR_YEAR_WITH_PUNCT = '(%s|%s)' % (_YEAR_PUNCT, _YEAR_YEAR_PUNCT)

# A year (or year range) followed by any number of two-digit short years,
# each with its own optional trailing punctuation.
_YEAR_THEN_YEAR_SHORT = '(%s(%s)*)' % (
    _YEAR_OR_YEAR_YEAR_WITH_PUNCT,
    _YEAR_SHORT_PUNCT,
)

# Collection of the year-related building-block patterns.
# NOTE(review): no use of this list is visible in this excerpt -- confirm
# whether it is still needed.
pats = [_YEAR, _YEAR_SHORT, _YEAR_YEAR, _PUNCT, _YEAR_OR_YEAR_YEAR_WITH_PUNCT]

# FIXME: multi-token patterns are likely not behaving as expected
# FIXME: patterns could be greatly simplified

patterns = [
# TODO: this needs to be simplified:
# TODO: in NLTK 3.0 this will fail because of this bug:
Expand All @@ -111,9 +172,12 @@ def detect(location):
# found in crypto certificates and LDAP
(r'^(O=|OU=|OU|XML)$', 'JUNK'),
(r'^(Parser|Dual|Crypto|NO|PART|[Oo]riginall?y?|[Rr]epresentations?\.?)$', 'JUNK'),
(r'^(Refer|Apt|Agreement|Usage|Please|Based|Upstream|Files?|Filename:?|Description:?|Holder?s|HOLDER?S|[Pp]rocedures?|You|Everyone)$', 'JUNK'),

(r'^(Refer|Apt|Agreement|Usage|Please|Based|Upstream|Files?|Filename:?|'
r'Description:?|Holder?s|HOLDER?S|[Pp]rocedures?|You|Everyone)$', 'JUNK'),
(r'^(Rights?|Unless|rant|Subject|Acknowledgements?|Special)$', 'JUNK'),
(r'^(Derivative|Work|[Ll]icensable|[Ss]ince|[Ll]icen[cs]e[\.d]?|[Ll]icen[cs]ors?|under|COPYING)$', 'JUNK'),
(r'^(Derivative|Work|[Ll]icensable|[Ss]ince|[Ll]icen[cs]e[\.d]?|'
r'[Ll]icen[cs]ors?|under|COPYING)$', 'JUNK'),
(r'^(TCK|Use|[Rr]estrictions?|[Ii]ntroduction)$', 'JUNK'),
(r'^([Ii]ncludes?|[Vv]oluntary|[Cc]ontributions?|[Mm]odifications?)$', 'JUNK'),
(r'^(CONTRIBUTORS?|OTHERS?|Contributors?\:)$', 'JUNK'),
Expand All @@ -130,8 +194,8 @@ def detect(location):
(r'^[Ff]unctions?$', 'JUNK'),

# various trailing words that are junk
(r'^(?:Copyleft|LegalCopyright|AssemblyCopyright|Distributed|Report|Available|true|false|node|jshint|node\':true|node:true)$', 'JUNK'),

(r'^(?:Copyleft|LegalCopyright|AssemblyCopyright|Distributed|Report|'
r'Available|true|false|node|jshint|node\':true|node:true)$', 'JUNK'),

# Bare C char is COPYRIGHT SIGN
# (r'^C$', 'COPY'),
Expand All @@ -153,9 +217,13 @@ def detect(location):
# company suffix
(r'^([Ii]nc[.]?|[I]ncorporated|[Cc]ompany|Limited|LIMITED).?$', 'COMP'),
# company suffix
(r'^(INC(ORPORATED|[.])?|CORP(ORATION|[.])?|FOUNDATION|GROUP|COMPANY|[(]tm[)]).?$|[Ff]orum.?', 'COMP'),
(r'^(INC(ORPORATED|[.])?|CORP(ORATION|[.])?|FOUNDATION|GROUP|COMPANY|'
r'[(]tm[)]).?$|[Ff]orum.?', 'COMP'),
# company suffix
(r'^([cC]orp(oration|[.])?|[fF]oundation|[Aa]lliance|Working|[Gg]roup|[Tt]echnolog(y|ies)|[Cc]ommunit(y|ies)|[Mm]icrosystems.?|[Pp]roject|[Tt]eams?|[Tt]ech).?$', 'COMP'),
(r'^([cC]orp(oration|[.])?|[fF]oundation|[Aa]lliance|Working|[Gg]roup|'
r'[Tt]echnolog(y|ies)|[Cc]ommunit(y|ies)|[Mm]icrosystems.?|[Pp]roject|'
r'[Tt]eams?|[Tt]ech).?$', 'COMP'),

# company suffix : LLC, LTD, LLP followed by one extra char
(r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.,$', 'COMP'),
(r'^([Ll][Ll][CcPp]|[Ll][Tt][Dd])\.?,?$', 'COMP'),
Expand Down Expand Up @@ -218,8 +286,29 @@ def detect(location):
# and Spanish/French Da Siva and De Gaulle
(r'^(([Vv][ao]n)|[Dd][aeu])$', 'VAN'),

# year
(r'^[(]?(19|20)[0-9]{2}((\s)*([,-]|to)(\s)*(19|20)?[0-9]{2})*[)]?', 'YR'),
# year or year ranges
# plain year with various leading and trailing punct
# dual or multi years 1994/1995. or 1994-1995
# 1987,88,89,90,91,92,93,94,95,96,98,99,2000,2001,2002,2003,2004,2006
# multi years
# dual years with second part abbreviated
# 1994/95. or 2002-04 or 1991-9
(r'^' + _PUNCT + _YEAR_OR_YEAR_YEAR_WITH_PUNCT + '+'+
'('+
_YEAR_OR_YEAR_YEAR_WITH_PUNCT +
'|' +
_YEAR_THEN_YEAR_SHORT +
')*' + '$', 'YR'),

(r'^' + _PUNCT + _YEAR_OR_YEAR_YEAR_WITH_PUNCT + '+'+
'('+
_YEAR_OR_YEAR_YEAR_WITH_PUNCT +
'|' +
_YEAR_THEN_YEAR_SHORT +
'|' +
_YEAR_SHORT_PUNCT +
')*' + '$', 'YR'),

# cardinal numbers
(r'^-?[0-9]+(.[0-9]+)?.?$', 'CD'),

Expand All @@ -228,7 +317,7 @@ def detect(location):

# composed proper nouns, ie. Jean-Claude or ST-Microelectronics
# FIXME: what about a variant with spaces around the dash?
(r'^[A-Z][a-zA-Z]*[-][A-Z]?[a-zA-Z]+.?$', 'NNP'),
(r'^[A-Z][a-zA-Z]*\s?[\-]\s?[A-Z]?[a-zA-Z]+.?$', 'NNP'),

# proper nouns with digits
(r'^[A-Z][a-z0-9]+.?$', 'NNP'),
Expand Down Expand Up @@ -275,6 +364,9 @@ def detect(location):
# .\" is not a noun
(r'^\.\\\?"?$', 'JUNK'),

# Mixed cap nouns (rare) LeGrande
(r'^[A-Z][a-z]+[A-Z][a-z]+[\.\,]?$', 'MIXEDCAP'),

# nouns (default)
(r'.+', 'NN'),
]
Expand Down Expand Up @@ -319,9 +411,12 @@ def detect(location):
COMPANY: {<COMPANY> <DASH> <NNP|NN> <EMAIL>?}
# Typical names
#John Robert LoVerso
NAME: {<NNP> <NNP> <MIXEDCAP>}
NAME: {<NNP|PN>+ <NNP>+}
NAME: {<NNP> <PN>? <NNP>+}
NAME: {<NNP> <NNP>}
NAME: {<NNP> <NN> <EMAIL>}
NAME: {<NNP> <PN|VAN>? <PN|VAN>? <NNP>}
NAME: {<NNP> <NN> <NNP>}
Expand Down Expand Up @@ -389,6 +484,7 @@ def detect(location):
# John Doe and Myriam Doe
NAME: {<NAME|NNP> <CC> <NNP|NAME>}
# Various forms of copyright statements
COPYRIGHT: {<COPY> <NAME> <COPY> <YR-RANGE>}
Expand Down Expand Up @@ -421,6 +517,9 @@ def detect(location):
COPYRIGHT: {<COPY> <COPY> <NNP>+}
# Copyright (c) 2016 Project Admins foobar
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE>+ <COMP> <NNP> <NN>}
# Copyright (c) 1995, 1996 The President and Fellows of Harvard University
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE> <NN> <NNP> <ANDCO>}
Expand Down Expand Up @@ -449,6 +548,9 @@ def detect(location):
# Copyright (c) 2012-2016, Project contributors
COPYRIGHT2: {<COPY> <COPY>? <YR-RANGE> <COMP> <AUTH>}
COPYRIGHT2: {<COPY>+ <YR-RANGE> <COMP>}
COPYRIGHT2: {<COPY> <COPY> <YR-RANGE>+ <CAPS>? <MIXEDCAP>}
COPYRIGHT2: {<NAME> <COPY> <YR-RANGE>}
COPYRIGHT2: {<COPY> <COPY>? <NN|CAPS>? <YR-RANGE>+ <NN|CAPS>*}
Expand Down Expand Up @@ -498,6 +600,9 @@ def detect(location):
COPYRIGHT: {<AUTHOR> <COPYRIGHT2>}
COPYRIGHT: {<AUTHOR> <YR-RANGE>}
COPYRIGHT: {<COPYRIGHT> <NAME3>}
"""


Expand Down Expand Up @@ -825,7 +930,7 @@ def get_tokens(self, numbered_lines):
tok = tok.lstrip('@').strip()
if tok and tok not in (':',):
tokens_append(tok)
logger.debug('CopyrightDetector:tokens: ' + repr(list(tokens)))
logger.debug('CopyrightDetector:tokens: ' + repr(tokens))
return tokens


Expand Down
1 change: 0 additions & 1 deletion src/cluecode/copyrights_hint.py
Expand Up @@ -130,7 +130,6 @@
2017
2018
2019
2020
'''.split()


Expand Down
1 change: 1 addition & 0 deletions tests/cluecode/data/copyrights/access_strings.txt
@@ -0,0 +1 @@
2005charchar? 7 DDLSQL Server 2005smalldatetimedatetimeLDDDDDD7
Binary file added tests/cluecode/data/copyrights/win-archive.lib
Binary file not shown.
Binary file added tests/cluecode/data/copyrights/windows.dll
Binary file not shown.
37 changes: 29 additions & 8 deletions tests/cluecode/test_copyrights.py
@@ -1,6 +1,6 @@
# -*- coding: utf-8 -*-
#
# Copyright (c) 2016 nexB Inc. and others. All rights reserved.
# Copyright (c) 2017 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/scancode-toolkit/
# The ScanCode software is licensed under the Apache License version 2.0.
# Data generated with ScanCode require an acknowledgment.
Expand Down Expand Up @@ -204,7 +204,7 @@ def test_copyright_no_copyright_in_class_file_1(self):
def test_copyright_sample_py(self):
test_file = self.get_test_loc('copyrights/copyright_sample_py-py.py')
expected = [
u'COPYRIGHT 2006',
u'COPYRIGHT 2006 ABC',
]
check_detection(expected, test_file)

Expand Down Expand Up @@ -751,13 +751,11 @@ def test_copyright_dionysos_c(self):
u'COPYRIGHT (c) ADIONYSOS2 2006',
u'COPYRIGHT (c) MyCompany 2006 - 2009',
u'COPYRIGHT (c) 2006 MyCompany2',
u'COPYRIGHT (c) 2024 DIONYSOS2',
u'copyright (c) 2006 - 2009 DIONYSOS',
u'copyright (c) ADIONYSOS 2006 - 2009',
u'copyright (c) ADIONYSOS2 2006',
u'copyright (c) MyCompany 2006 - 2009',
u'copyright (c) 2006 MyCompany2',
u'copyright (c) 2024 DIONYSOS2',
]
check_detection(expected, test_file)

Expand Down Expand Up @@ -1150,12 +1148,30 @@ def test_copyright_in_binary_file_with_metadata(self):
]
check_detection(expected, test_file)

def test_copyright_php_lib(self):
test_file = self.get_test_loc('copyrights/copyright_php_lib-php_embed_lib.lib')
@expectedFailure
def test_copyright_in_windows_binary_lib(self):
    # Marked as an expected failure: this is the detection we would like
    # to get from the .lib test file.
    lib_location = self.get_test_loc('copyrights/copyright_in_binary_lib-php_embed_lib.lib')
    wanted = [u'Copyright nexB and others (c) 2012']
    check_detection(wanted, lib_location)

@expectedFailure
def test_copyright_in_windows_binary_dll(self):
    # Marked as an expected failure: this is the detection we would like
    # to get from the DLL test file.
    dll_location = self.get_test_loc('copyrights/windows.dll')
    wanted = [u'Copyright nexB and others (c) 2012']
    check_detection(wanted, dll_location)

def test_copyright_in_windows_binary_dll_leading_junk(self):
    # The expected statement carries leading non-copyright words in front
    # of the actual copyright text.
    dll_location = self.get_test_loc('copyrights/windows.dll')
    wanted = [u'ROW_SERVER_R_RES HKCR NoRemove Interface Copyright nexB and others (c) 2012']
    check_detection(wanted, dll_location)


def test_copyright_in_c(self):
test_file = self.get_test_loc('copyrights/copyright_in_c-c.c')
expected = [
Expand Down Expand Up @@ -3022,7 +3038,7 @@ def test_copyright_oberhummer_text(self):
def test_copyright_objectivec(self):
test_file = self.get_test_loc('copyrights/copyright_objectivec-objectiveC_m.m')
expected = [
u'Copyright (c) 2009',
u'Copyright (c) 2009 ABC',
]
check_detection(expected, test_file)

Expand All @@ -3048,7 +3064,7 @@ def test_copyright_openoffice_org_report_builder_bin_copyright(self):
u'Copyright 2001-2007 The Apache Software Foundation',
u'Copyright 1999-2007 The Apache Software Foundation',
u'Copyright (c) 2000 Pat Niemeyer',
u'Copyright (c) 2000',
u'Copyright (c) 2000 INRIA',
u'Copyright (c) 2002 France Telecom',
u'Copyright (c) 1990-2003 Sleepycat Software',
u'Copyright (c) 1990, 1993, 1994, 1995 The Regents of the University of California',
Expand Down Expand Up @@ -4006,3 +4022,8 @@ def test_copyright_byten_c_exactly(self):
test_lines = [u'... don’t fit into your fixed-size buffer.\nByten ( c )\nExactly n bytes. If the']
expected = []
check_detection(expected, test_lines)

def test_copyright_should_not_be_detected_in_junk_strings_with_year_prefix(self):
    # No copyright statement is expected from this test file.
    junk_location = self.get_test_loc('copyrights/access_strings.txt')
    check_detection([], junk_location)

0 comments on commit f1c609a

Please sign in to comment.