-
Notifications
You must be signed in to change notification settings - Fork 27
/
utils.py
329 lines (263 loc) · 10.6 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
#
# Copyright (c) 2017-2018 nexB Inc. and others. All rights reserved.
# http://nexb.com and https://github.com/nexB/deltacode/
# The DeltaCode software is licensed under the Apache License version 2.0.
# Data generated with DeltaCode require an acknowledgment.
# DeltaCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# When you publish or redistribute any data created with DeltaCode or any DeltaCode
# derivative work, you must accompany this data with the following acknowledgment:
#
# Generated with DeltaCode and provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# DeltaCode should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
# DeltaCode is a free and open source software analysis tool from nexB Inc. and others.
# Visit https://github.com/nexB/deltacode/ for support and download.
#
from __future__ import absolute_import, division
from bitarray import bitarray
from collections import defaultdict
from bitarray.util import count_xor
import binascii
import os
from commoncode import paths
from collections import OrderedDict
def update_from_license_info(delta, unique_categories):
    """
    Route a Delta object to the license scoring helper matching its status.

    An 'added' Delta is passed to update_added_from_license_info() and a
    'modified' Delta to update_modified_from_license_info(), together with
    `unique_categories`. A Delta that is neither added nor modified is left
    untouched.
    """
    added = delta.is_added()
    if added:
        update_added_from_license_info(delta, unique_categories)

    modified = delta.is_modified()
    if modified:
        update_modified_from_license_info(delta, unique_categories)
def update_added_from_license_info(delta, unique_categories):
    """
    Increase an 'added' Delta object's 'score' attribute and add one or more
    categories to its 'factors' attribute based on the new file's licenses.

    Nothing happens when delta.new_file has no 'licenses' attribute.
    Otherwise the Delta is updated with 20 points and 'license info added',
    followed by one '<category> added' update per distinct license category:
    20 points when the category is in `unique_categories`, 0 points otherwise.

    A license entry without a 'category' key contributes the empty category,
    as in update_modified_from_license_info(), instead of raising KeyError.
    """
    if not hasattr(delta.new_file, "licenses"):
        return

    new_categories = set(
        license_info.get("category", "") for license_info in delta.new_file.licenses
    )

    delta.update(20, "license info added")
    for category in new_categories:
        # no license ==> 'Copyleft Limited' or higher scores 20 points;
        # no license ==> 'Permissive' or 'Public Domain' only records the factor.
        points = 20 if category in unique_categories else 0
        delta.update(points, category.lower() + " added")
def update_modified_from_license_info(delta, unique_categories):
    """
    Increase a 'modified' Delta object's 'score' attribute and add
    one or more categories to its 'factors' attribute if there has
    been a license change.

    The cases are checked in this order:

    - licenses in the old file only: 15 points, 'license info removed'.
    - licenses in the new file only: 20 points, 'license info added', then
      one '<category> added' update per new category (20 points when the
      category is in `unique_categories`, 0 points otherwise).
    - licenses in both files with different sets of license keys: 10 points,
      'license change', then one '<category> added' update per category
      present only in the new file (see the scoring comments below).

    `unique_categories` may be any iterable of category names supporting
    the `in` operator; it does not have to be a set.
    """
    new_licenses = getattr(delta.new_file, "licenses", [])
    old_licenses = getattr(delta.old_file, "licenses", [])

    if not new_licenses and old_licenses:
        delta.update(15, "license info removed")
        return

    new_categories = set(entry.get("category", "") for entry in new_licenses)
    old_categories = set(entry.get("category", "") for entry in old_licenses)

    if new_licenses and not old_licenses:
        delta.update(20, "license info added")
        for category in new_categories:
            # no license ==> 'Copyleft Limited' or higher scores 20 points;
            # no license ==> 'Permissive' or 'Public Domain' only records the factor.
            points = 20 if category in unique_categories else 0
            delta.update(points, category.lower() + " added")
        return

    new_keys = set(entry.get("key", "") for entry in new_licenses)
    old_keys = set(entry.get("key", "") for entry in old_licenses)
    if new_keys == old_keys:
        return

    delta.update(10, "license change")

    # This does not depend on the category being examined, so compute it once
    # rather than on every loop iteration.
    old_file_had_unique_category = bool(
        old_categories.intersection(unique_categories)
    )

    for category in new_categories - old_categories:
        if category not in unique_categories:
            # 'Permissive' or 'Public Domain' newly added: factor only.
            points = 0
        elif old_file_had_unique_category:
            # The old file already had a 'Copyleft Limited' or higher category.
            points = 10
        else:
            # 'Permissive' or 'Public Domain' ==> 'Copyleft Limited' or higher.
            points = 20
        delta.update(points, category.lower() + " added")
def update_from_copyright_info(delta):
    """
    Route a Delta object to the copyright scoring helper matching its status.

    An 'added' Delta is passed to update_added_from_copyright_info() and a
    'modified' Delta to update_modified_from_copyright_info(). A Delta that
    is neither added nor modified is left untouched.
    """
    added = delta.is_added()
    if added:
        update_added_from_copyright_info(delta)

    modified = delta.is_modified()
    if modified:
        update_modified_from_copyright_info(delta)
def update_added_from_copyright_info(delta):
    """
    Score an 'added' Delta object for copyright information.

    When delta.new_file exposes a 'copyrights' attribute, the Delta is
    updated with 10 points and the 'copyright info added' factor; otherwise
    the Delta is left unchanged.
    """
    new_file = delta.new_file
    if not hasattr(new_file, "copyrights"):
        return
    delta.update(10, "copyright info added")
def update_modified_from_copyright_info(delta):
    """
    Score a 'modified' Delta object for copyright changes.

    - copyrights in the new file only: 10 points, 'copyright info added'.
    - copyrights in the old file only: 10 points, 'copyright info removed'.
    - otherwise, when the sets of 'holders' collected from the copyright
      entries of the two files differ: 5 points, 'copyright change'.
    """
    new_copyrights = getattr(delta.new_file, "copyrights", [])
    old_copyrights = getattr(delta.old_file, "copyrights", [])

    has_new = bool(new_copyrights)
    has_old = bool(old_copyrights)
    if has_new != has_old:
        # Exactly one side carries copyright information.
        factor = "copyright info added" if has_new else "copyright info removed"
        delta.update(10, factor)
        return

    def holders_of(entries):
        # Flatten the 'holders' lists of every copyright entry into one set.
        return {holder for entry in entries for holder in entry.get("holders", [])}

    if holders_of(new_copyrights) != holders_of(old_copyrights):
        delta.update(5, "copyright change")
def collect_errors(deltacode):
    """
    Return a new list combining the errors recorded on `deltacode`: its
    'new_files_errors' first, then its 'old_files_errors', then its 'errors'.
    """
    return (
        list(deltacode.new_files_errors)
        + list(deltacode.old_files_errors)
        + list(deltacode.errors)
    )
def deltas(deltacode, all_delta_types=False):
    """
    Generate the dictionary form of each Delta in deltacode.deltas for JSON
    serialized output.

    Deltas whose status is 'unmodified' are skipped unless `all_delta_types`
    is True (the '-a'/'--all' option).
    """
    include_everything = all_delta_types is True
    for delta in deltacode.deltas:
        if include_everything or not delta.status == "unmodified":
            yield delta.to_dict(deltacode)
def calculate_percent(value, total):
    """
    Express `value` as a percentage of `total`, rounded to two decimal
    places. A division by zero yields 0 instead of raising.
    """
    try:
        return round((value / total) * 100, 2)
    except ZeroDivisionError:
        return 0
class AlignmentException(Exception):
    """
    Raised when two codebases cannot be aligned.
    """
class FileError(Exception):
    """
    Named Exception for handling errors which could be raised due to
    unsupported errors in the json file.

    The first positional argument, if any, is kept as the 'message'
    attribute; when no argument is given 'message' is None.
    """

    def __init__(self, *args):
        # Initialise the base Exception with the same arguments.
        super(FileError, self).__init__(*args)
        self.message = args[0] if args else None

    def __str__(self):
        # __str__ must return a string: a missing message renders as the
        # empty string and a non-string message is converted, instead of
        # str(error) itself failing with TypeError.
        if self.message is None:
            return ""
        return str(self.message)
def align_trees(codebase1, codebase2):
    """
    Aligns the paths of the two codebases and returns a pair of offsets.

    An anchor is searched among the resources whose name occurs exactly once
    in `codebase1` and exactly once in `codebase2`: the first such pair with
    equal sha1 is used. The result is (0, 0) when both anchor resources have
    the same path; otherwise it is, for each codebase, the number of segments
    of the anchor path (per paths.split) minus the common-segment count
    reported by paths.common_path_suffix() for the two anchor paths.

    Raise AlignmentException when no anchor is found.
    """

    def unique_by_name(codebase):
        # Map each resource name seen exactly once in the walk to its resource.
        by_name = defaultdict(list)
        for resource in codebase.walk():
            by_name[resource.name].append(resource)
        return {
            name: found[0] for name, found in by_name.items() if len(found) == 1
        }

    a_uniques = unique_by_name(codebase1)
    b_uniques = unique_by_name(codebase2)

    for a_name, a_unique in a_uniques.items():
        if a_name in b_uniques:
            b_unique = b_uniques[a_name]
            if a_unique and a_unique.sha1 == b_unique.sha1:
                # Anchor found; a_unique and b_unique stay bound for use below.
                break
    else:
        raise AlignmentException

    a_path, b_path = a_unique.path, b_unique.path
    if a_path == b_path:
        return 0, 0

    _, common_segments = paths.common_path_suffix(a_path, b_path)
    return (
        len(paths.split(a_path)) - common_segments,
        len(paths.split(b_path)) - common_segments,
    )
def get_notice():
    """
    Retrieve the notice text from the NOTICE file for display in the JSON output.

    The NOTICE file is read from the directory containing this module. Only
    the text before the first run of three newlines is considered; from it,
    the part following the first occurrence of two newlines and a space is
    stripped of surrounding whitespace, passed through the final replace()
    and returned.

    Raise ValueError when either delimiter is missing from the file.
    """
    notice_path = os.path.join(os.path.abspath(os.path.dirname(__file__)), "NOTICE")
    # Close the file deterministically instead of leaving it to the
    # garbage collector.
    with open(notice_path) as notice_file:
        notice_text = notice_file.read()

    # Discard everything from the first run of three newlines onwards.
    notice_text, _ = notice_text.split("\n\n\n", 1)
    # Keep only what follows the first "two newlines and a space" delimiter.
    _, acknowledgment_text = notice_text.split("\n\n ", 1)

    # NOTE(review): as written, replace() removes every space character from
    # the returned text; confirm these literals match the indentation
    # actually used in the NOTICE file.
    return acknowledgment_text.strip().replace(" ", "")
def hamming_distance(fingerprint1, fingerprint2):
    """
    Return, as an int, the hamming distance between two given fingerprints,
    i.e. the number of bit positions in which they differ, as counted by
    count_xor().

    Files whose fingerprints are a smaller hamming distance apart tend to be
    more similar.
    """
    return int(count_xor(fingerprint1, fingerprint2))
def bitarray_from_hex(fingerprint_hex):
    """
    Decode the hex string `fingerprint_hex` into raw bytes and return the
    bitarray that bitarray_from_bytes() builds from them.
    """
    return bitarray_from_bytes(binascii.unhexlify(fingerprint_hex))
def bitarray_from_bytes(b):
    """
    Return a new bitarray filled from the byte string `b`, interpreted as
    machine values.
    """
    bits = bitarray()
    bits.frombytes(b)
    return bits