-
Notifications
You must be signed in to change notification settings - Fork 2.8k
/
show-duplicate-java-classes
executable file
·366 lines (293 loc) · 13.6 KB
/
show-duplicate-java-classes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# @Function
# Find duplicate classes among java lib dirs and class dirs.
#
# @Usage
# $ show-duplicate-java-classes # search jars from current dir
# $ show-duplicate-java-classes path/to/lib_dir1 /path/to/lib_dir2
# $ show-duplicate-java-classes -c path/to/class_dir1 -c /path/to/class_dir2
# $ show-duplicate-java-classes -c path/to/class_dir1 path/to/lib_dir1
# $ show-duplicate-java-classes -L path/to/lib_dir1 # search jars in the subdirectories of lib dir
# $ show-duplicate-java-classes -J path/to/lib_dir1 # search jars in the jar file
#
# @online-doc https://github.com/oldratlee/useful-scripts/blob/dev-2.x/docs/java.md#-show-duplicate-java-classes
# @author tg123 (farmer1992 at gmail dot com)
# @author Jerry Lee (oldratlee at gmail dot com)
__author__ = 'tg123'
import os
import re
import sys
from glob import glob
from io import BytesIO
from optparse import OptionParser
from os import walk
from os.path import exists, isdir, relpath
from zipfile import BadZipfile, ZipFile
################################################################################
# utils functions
################################################################################
# Version text appended to the program name for the --version option.
PROG_VERSION = '2.x-dev'

# How to delete line with echo?
#   https://unix.stackexchange.com/questions/26576
#
# terminal escapes: http://ascii-table.com/ansi-escape-sequences.php
# In particular, to clear from the cursor position to the beginning of the line:
#   echo -e "\033[1K"
# Or everything on the line, regardless of cursor position:
#   echo -e "\033[2K"
#
# Escape sequence "erase the whole line" followed by a carriage return;
# it is written to stderr before every transient progress message.
__clear_line = '\033[2K\r'
# Module-wide switch of the transient progress messages;
# main() overwrites it with the value of the -R/--no-find-progress option.
__show_responsive = True
def __get_terminal_columns_of_stderr():
    """
    Return the column count of the terminal attached to stderr.

    Rewritten for stderr from <shutil.get_terminal_size>.

    :return: the terminal width in columns, or 0 when it can not be determined
    """
    try:
        return os.get_terminal_size(sys.stderr.fileno()).columns
    except (AttributeError, ValueError, OSError):
        # stderr has no usable file descriptor, or it is not a terminal
        return 0
def print_responsive_message(msg):
    """
    Show a transient single-line progress message on stderr.

    Nothing is printed when progress display is switched off, when stderr is not a tty,
    or when the terminal width is unknown. The message is cut to the terminal width
    and printed without a trailing newline, so the next message overwrites it.

    :param msg: the message text
    """
    if __show_responsive and sys.stderr.isatty():
        width = __get_terminal_columns_of_stderr()
        if width > 0:
            print(__clear_line + msg[:width], end='', file=sys.stderr)
def clear_responsive_message():
    """
    Erase the transient progress message currently shown on stderr.

    Does nothing when progress display is switched off or stderr is not a tty.
    """
    if __show_responsive and sys.stderr.isatty():
        print(__clear_line, end='', file=sys.stderr)
def print_error(msg):
    """
    Print a message line to stderr.

    The transient progress message is erased first, so the two do not share a line.

    :param msg: the message to print
    """
    clear_responsive_message()
    print(msg, file=sys.stderr)
def print_box_message(msg):
    """
    Print a section title to stdout: an empty line, then the message
    framed by two rulers of 80 '=' characters.

    :param msg: the title text
    """
    ruler = '=' * 80
    print('', ruler, msg, ruler, sep='\n')
def str_len(x):
    """
    Return the number of characters of the string form of x
    (used to compute the padding width of printed numbers).
    """
    text = str(x)
    return len(text)
# issue 32790: Keep trailing zeros in precision for string format option g - Python tracker
# https://bugs.python.org/issue32790
def percent_str(num):
    """
    Format the ratio num as a percentage text with 3 significant digits
    (at most 3 decimal places; values of 1000% and above keep all integer digits).

    Each threshold sits half a unit of the last displayed digit below the power of ten,
    so a value that rounds up across the power of ten drops one decimal place
    instead of gaining a fourth digit (9.996% is shown as 10.0%, not 10.00%).

    Input num    => Output
      1.4545     =>  145%
      0.14545    =>  14.5%
      0.09996    =>  10.0%
      0.014545   =>  1.45%
      0.0014545  =>  0.145%
      0.00014545 =>  0.015%
      0.00001    =>  0.001%
      0.000001   =>  0.000%

    :param num: the ratio to format, e.g. 0.5 for 50%
    :return: the percentage text, ending with '%'
    """
    num = num * 100
    if num >= 99.95:  # shown as 100 or more
        return '%.0f%%' % num
    elif num >= 9.995:  # shown as 10.0 ~ 99.9
        return '%.1f%%' % num
    elif num >= 0.9995:  # shown as 1.00 ~ 9.99
        return '%.2f%%' % num
    else:
        return '%.3f%%' % num
def list_jar_file_under_lib_dirs(lib_dirs, recursive):
    """
    Collect the jar files denoted by the lib dir arguments.

    An argument that does not exist is reported on stderr and skipped.
    An existing argument that is not a directory is taken as a jar file itself.

    :param lib_dirs: paths of lib dirs (or jar files)
    :param recursive: if true, also search the sub-directories of each lib dir
                      (file names ending with '.jar', case-insensitive);
                      otherwise only the '*.jar' entries directly in the lib dir
    :return: set of jar file paths
    """
    # function-scope import: the module level only imports the `glob` function
    from glob import escape as glob_escape

    jar_files = set()
    max_idx_str_len = str_len(len(lib_dirs))

    for idx, lib_dir in enumerate(lib_dirs, start=1):
        print_responsive_message('list jar file under lib dir(%*s/%s): %s' % (
            max_idx_str_len, idx, len(lib_dirs), lib_dir))

        if not exists(lib_dir):
            print_error('WARN: lib dir %s not exists, ignored!' % lib_dir)
            continue
        if not isdir(lib_dir):
            jar_files.add(lib_dir)
            continue

        if not recursive:
            # escape the dir name, so glob metacharacters in it ('[', ']', '*', '?')
            # are matched literally rather than read as a pattern
            jar_files.update(glob(glob_escape(lib_dir) + '/*.jar'))
            continue

        jar_files.update(
            dir_path + '/' + filename
            for dir_path, _, file_names in walk(lib_dir)
            for filename in file_names if filename.lower().endswith('.jar'))

    return jar_files
def list_class_under_jar_file(jar_file, recursive, progress):
    """
    List the class files in a jar file, optionally descending into the jars nested in it.

    A jar (top-level or nested) that is not a valid zip file is reported on stderr and skipped.

    :param jar_file: path of the jar file
    :param recursive: if true, also list the classes of the jars contained in the jar
    :param progress: pair (index of this jar, total count), only used for the progress message
    :return: map: jar_jar_path('a.jar!b.jar!c.jar') -> classes
    """
    visited_count = 0

    def collect(archive_path, archive):
        # sequence number of the archive being listed, shown in recursive mode
        nonlocal visited_count
        visited_count += 1

        marker = ' #%3s' % visited_count if recursive else ''
        print_responsive_message('list class under jar file(%*s/%s%s): %s' % (
            str_len(progress[1]), progress[0], progress[1], marker, archive_path))

        found = {
            archive_path: {name for name in archive.namelist() if name.lower().endswith('.class')},
        }
        if not recursive:
            return found

        nested_jars = {name for name in archive.namelist() if name.lower().endswith('.jar')}
        for nested_jar in nested_jars:
            nested_path = archive_path + '!' + nested_jar
            try:
                # the nested jar is read into memory and opened from there
                with ZipFile(BytesIO(archive.read(nested_jar))) as nested_archive:
                    found.update(collect(nested_path, nested_archive))
            except BadZipfile as e:
                print_error('WARN: %s is bad zip file(%s), ignored!' % (nested_path, e))
        return found

    try:
        with ZipFile(jar_file) as zip_file:
            return collect(jar_file, zip_file)
    except BadZipfile as error:
        print_error('WARN: %s is bad zip file(%s), ignored!' % (jar_file, error))
        return {}
def list_class_under_class_dir(class_dir, progress):
    """
    List the class files under a class dir (sub-directories included).

    :param class_dir: path of the class dir
    :param progress: pair (index of this class dir, total count), only used for the progress message
    :return: set of class file paths relative to class_dir, separated by '/';
             an empty set (after a warning on stderr) when class_dir
             does not exist or is not a directory
    """
    print_responsive_message('list class under class dir(%*s/%s): %s' % (
        str_len(progress[1]), progress[0], progress[1], class_dir))

    # always return a set, so that the result type does not depend on the outcome
    if not exists(class_dir):
        print_error('WARN: class dir %s not exists, ignored!' % class_dir)
        return set()
    if not isdir(class_dir):
        print_error('WARN: class dir %s is not dir, ignored!' % class_dir)
        return set()

    # zip entry names always use '/'; normalize the OS separator so that classes of class dirs
    # are comparable with classes of jar files on every platform (no-op where os.sep is '/')
    return {relpath(dir_path + '/' + filename, class_dir).replace(os.sep, '/')
            for dir_path, _, file_names in walk(class_dir)
            for filename in file_names if filename.lower().endswith('.class')}
def collect_class_path_to_classes(class_dirs, jar_files, recursive_jar):
    """
    List the classes of every jar file and every class dir.

    :param class_dirs: paths of the class dirs
    :param jar_files: paths of the jar files
    :param recursive_jar: if true, also list the classes of the jars nested in jar files
    :return: map: class path(jar path or class dir) -> classes
    """
    jar_count = len(jar_files)
    total_count = jar_count + len(class_dirs)
    result = {}

    # jar files first; a jar may contribute several entries (one per nested jar)
    for position, jar_file in enumerate(jar_files, start=1):
        result.update(list_class_under_jar_file(
            jar_file, recursive=recursive_jar, progress=(position, total_count)))

    # then class dirs; the progress index goes on after the jar files
    for position, class_dir in enumerate(class_dirs, start=jar_count + 1):
        result[class_dir] = list_class_under_class_dir(
            class_dir, progress=(position, total_count))

    return result
def invert_as_class_to_class_paths(class_path_to_classes):
    """
    Invert the map "class path -> classes" into the map "class -> class paths".

    :param class_path_to_classes: map: class path -> classes
    :return: map: class -> set of the class paths containing it
    """
    inverted = {}
    for class_path, classes in class_path_to_classes.items():
        for clazz in classes:
            if clazz not in inverted:
                inverted[clazz] = set()
            inverted[clazz].add(class_path)
    return inverted
################################################################################
# biz functions
################################################################################
# matches module-info.class at the root or under any directory
__java9_module_file_pattern = re.compile(r'(^|.*/)module-info\.class$')


def find_duplicate_classes(class_to_class_paths):
    """
    Group the classes contained in more than one class path by their set of class paths.

    Classes with exactly one class path are not duplicates and are left out;
    java 9 module-info files are skipped as well.

    :param class_to_class_paths: map: class -> class paths
    :return: map: frozenset of class paths -> set of the classes duplicated among them
    """
    grouped = {}
    for clazz, class_paths in class_to_class_paths.items():
        if len(class_paths) == 1:
            continue
        # skip java 9 module-info files
        if __java9_module_file_pattern.match(clazz):
            continue

        key = frozenset(class_paths)
        if key not in grouped:
            grouped[key] = set()
        grouped[key].add(clazz)
    return grouped
def print_duplicate_classes_info(class_paths_to_duplicate_classes, class_path_to_classes):
    """
    Print the duplicate classes report to stdout:
    a summary line, an overview entry per class path set,
    then the duplicate classes of each class path set in detail.

    :param class_paths_to_duplicate_classes: map: class paths (as built by
           find_duplicate_classes, a frozenset) -> classes duplicated among them
    :param class_path_to_classes: map: class path -> all classes it contains;
           must have an entry for every class path of the first map
    """
    if not class_paths_to_duplicate_classes:
        print('COOL! No duplicate classes found!')
        return

    duplicate_classes_total_count = sum(len(dcs) for dcs in class_paths_to_duplicate_classes.values())
    # a class path can take part in several class path sets;
    # count every class path once instead of summing up the set sizes
    class_paths_total_count = len(set().union(*class_paths_to_duplicate_classes))
    print('Found %s duplicate classes in %s class paths and %s class path sets:' % (
        duplicate_classes_total_count, class_paths_total_count, len(class_paths_to_duplicate_classes)))

    # sort key(class_paths) and value(duplicate_classes)
    class_paths_to_duplicate_classes = [(sorted(cps), sorted(dcs))
                                        for cps, dcs in class_paths_to_duplicate_classes.items()]
    # sort kv pairs
    #
    # sort by multiple keys:
    #   1. class paths count, *descending*; aka. sort by len(item[0]) *reverse=True*
    #   2. duplicate classes count, *descending*; aka. sort by len(item[1]) *reverse=True*
    #   3. class paths, ascending; aka. sort by item[0]
    # sort also ensure output consistent for same input.
    #
    # How to sort objects by multiple keys in Python?
    #   https://stackoverflow.com/questions/1143671
    # Sort a list by multiple attributes?
    #   https://stackoverflow.com/questions/4233476
    #
    # use - operator of number key for reverse sort key
    class_paths_to_duplicate_classes.sort(key=lambda item: (-len(item[0]), -len(item[1]), item[0]))

    # overview: one entry per class path set
    max_idx_str_len = str_len(len(class_paths_to_duplicate_classes))
    for idx, (class_paths, classes) in enumerate(class_paths_to_duplicate_classes, start=1):
        # ratio against the smallest class path of the set
        duplicate_ratio = len(classes) / min((len(class_path_to_classes[cp]) for cp in class_paths))
        print('[%*s] found %s(%s) duplicate classes in %s class paths:' % (
            max_idx_str_len, idx, len(classes), percent_str(duplicate_ratio), len(class_paths)))

        max_class_path_idx_str_len = str_len(len(class_paths))
        max_classes_count_str_len = str_len(max(len(class_path_to_classes[cp]) for cp in class_paths))
        for i, cp in enumerate(class_paths, start=1):
            print('    %*s: (contain %*s classes) %s' % (
                max_class_path_idx_str_len, i, max_classes_count_str_len, len(class_path_to_classes[cp]), cp))

    # detail: the duplicate classes of each class path set
    print_box_message('Duplicate classes detail info:')
    for idx, (class_paths, classes) in enumerate(class_paths_to_duplicate_classes, start=1):
        print('[%*s] found %s duplicate classes in %s class paths %s :' % (
            max_idx_str_len, idx, len(classes), len(class_paths), ' '.join(class_paths)))

        max_class_idx_str_len = str_len(len(classes))
        for i, c in enumerate(classes, start=1):
            print('    %*s: %s' % (max_class_idx_str_len, i, c))
def print_class_paths_info(class_path_to_classes):
    """
    Print to stdout the searched class paths, sorted by path,
    each with the count of classes it contains. Prints nothing for an empty map.

    :param class_path_to_classes: map: class path -> classes
    """
    if not class_path_to_classes:
        return

    total = len(class_path_to_classes)
    idx_width = str_len(total)
    count_width = str_len(max(len(classes) for classes in class_path_to_classes.values()))

    print_box_message('Find in %s class paths:' % total)
    for idx, cp in enumerate(sorted(class_path_to_classes), start=1):
        print('%*s: (contain %*s classes) %s' % (
            idx_width, idx, count_width, len(class_path_to_classes[cp]), cp))
def main():
    """
    Command line entry: parse the options, collect the classes of the given
    lib dirs/jar files and class dirs, and print the duplicate classes report.

    When neither a lib dir nor a class dir is given, the current dir is searched for jars.

    :return: the exit code: 1 if duplicate classes are found, otherwise 0
             (also 0 when there is no jar file/class dir to search or no class file at all)
    """
    global __show_responsive

    usage_text = (
        '%prog [OPTION]...'
        ' [-c class-dir1 [-c class-dir2] ...]'
        ' [lib-dir1|jar-file1 [lib-dir2|jar-file2] ...]'
        '\nFind duplicate classes among java lib dirs and class dirs.'
        '\n\nExamples:'
        '\n %prog # search jars from current dir'
        '\n %prog path/to/lib_dir1 /path/to/lib_dir2'
        '\n %prog -c path/to/class_dir1 -c /path/to/class_dir2'
        '\n %prog -c path/to/class_dir1 path/to/lib_dir1'
        '\n %prog -L path/to/lib_dir1'
        '\n %prog -J path/to/lib_dir1'
    )
    parser = OptionParser(usage=usage_text, version='%prog ' + PROG_VERSION)
    parser.add_option('-L', '--recursive-lib', dest='recursive_lib', default=False,
                      action='store_true', help='search jars in the sub-directories of lib dir')
    parser.add_option('-J', '--recursive-jar', dest='recursive_jar', default=False,
                      action='store_true', help='search jars in the jar file')
    parser.add_option('-c', '--class-dir', dest='class_dirs', default=[],
                      action='append', help='add class dir')
    parser.add_option('-R', '--no-find-progress', dest='show_responsive', default=True,
                      action='store_false', help='do not display responsive find progress')

    opts, lib_dirs = parser.parse_args()
    class_dirs = opts.class_dirs
    if not (lib_dirs or class_dirs):
        # nothing specified: search jars from the current dir
        lib_dirs = ['.']

    __show_responsive = opts.show_responsive

    jar_files = list_jar_file_under_lib_dirs(lib_dirs, recursive=opts.recursive_lib)
    if not (jar_files or class_dirs):
        clear_responsive_message()
        print('search no jar files under lib dirs, and class dirs is absent.')
        return 0

    class_path_to_classes = collect_class_path_to_classes(class_dirs, jar_files, opts.recursive_jar)
    if not any(class_path_to_classes.values()):
        clear_responsive_message()
        print('find no class files in jar files or class dirs.')
        return 0

    print_responsive_message('find duplicate classes...')
    class_to_class_paths = invert_as_class_to_class_paths(class_path_to_classes)
    duplicates = find_duplicate_classes(class_to_class_paths)

    clear_responsive_message()
    print_duplicate_classes_info(duplicates, class_path_to_classes)
    print_class_paths_info(class_path_to_classes)

    return 1 if duplicates else 0
if __name__ == '__main__':
    # sys.exit instead of the builtin `exit`: the latter is installed by the `site` module
    # for interactive use and is missing when python runs with -S
    sys.exit(main())