-
Notifications
You must be signed in to change notification settings - Fork 53
/
fuzz_analysis.py
442 lines (379 loc) · 17.7 KB
/
fuzz_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# Copyright 2021 Fuzz Introspector Authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Performs analysis on the profiles output from fuzz introspector LLVM pass"""
import os
import copy
import cxxfilt
import logging
from typing import (
Any,
Dict,
List,
Set,
Tuple,
)
import fuzz_utils
import fuzz_cfg_load
import fuzz_data_loader
# Module-level logger shared by every analysis routine in this file.
l = logging.getLogger(__name__)
def overlay_calltree_with_coverage(
        profile: fuzz_data_loader.FuzzerProfile,
        project_profile: fuzz_data_loader.MergedProjectProfile,
        coverage_url: str,
        git_repo_url: str,
        basefolder: str) -> None:
    """
    Annotates every callsite node of the fuzzer's calltree with coverage data.

    The nodes returned by fuzz_cfg_load.extract_all_callsites are modified in
    place. After this function each node carries:
      - cov_ct_idx: index of the node in the flattened calltree
      - cov_parent: demangled name of the parent function ("EP" for the root)
      - cov_hitcount: hit count derived from the coverage data
      - cov_color: color name derived from the hit count
      - cov_link: URL into the coverage report for the destination function
      - cov_callsite_link: URL into the coverage report for the callsite
      - cov_github_url: URL to the destination function in the git repository
      - cov_forward_reds: number of consecutive unhit nodes following the node
      - cov_largest_blocked_func: name of the most complex function amongst
        those unhit nodes ("none" for nodes that are skipped, "" if no
        function matched)

    Args:
        profile: profile of the fuzzer whose calltree is annotated.
        project_profile: merged profile, used to look up function metadata.
        coverage_url: prefix of the coverage report URLs.
        git_repo_url: prefix of the repository URLs.
        basefolder: prefix stripped from source paths when building repository
            URLs. "/" is treated as "unknown", in which case the leading path
            components are dropped instead.

    The process is terminated with exit code 1 if the first node is not
    LLVMFuzzerTestOneInput or if a later node has no parent on the callstack.
    """
    # We use the callstack to keep track of all function parents. We need this
    # when looking up if a callsite was hit or not. This is because the coverage
    # information about a callsite is located in coverage data of the function
    # in which the callsite is placed.
    callstack = dict()

    def callstack_get_parent(n, c):
        return c[int(n.depth)-1]

    def callstack_has_parent(n, c):
        return int(n.depth)-1 in c

    def callstack_set_curr_node(n, name, c):
        # Uses the node passed in rather than the enclosing loop variable.
        c[int(n.depth)] = name

    # Map hitcount to color of target.
    def get_hit_count_color(hit_count):
        color_schemes = [ (0,1,"red"), (1, 10, "gold"), (10, 30, "yellow"),
                          (30, 50, "greenyellow"), (50, 1000000000000, "lawngreen") ]
        for cmin, cmax, cname in color_schemes:
            if hit_count >= cmin and hit_count < cmax:
                return cname
        return "red"

    # Demangling is independent of the node being processed, so do it once
    # instead of once per (node, function) pair.
    demangled_functions = [
        (fuzz_utils.demangle_cpp_func(fd.function_name), fd)
        for fd in project_profile.all_functions.values()
    ]

    all_callsites = fuzz_cfg_load.extract_all_callsites(profile.function_call_depths)

    for ct_idx, node in enumerate(all_callsites):
        node.cov_ct_idx = ct_idx

        demangled_name = fuzz_utils.demangle_cpp_func(node.dst_function_name)

        # Add to callstack
        callstack_set_curr_node(node, demangled_name, callstack)

        # Get hitcount for this node
        node_hitcount = 0
        if ct_idx == 0:
            # The first node is always the entry of LLVMFuzzerTestOneInput
            # LLVMFuzzerTestOneInput will never have a parent in the calltree. As such, we
            # check here if the function has been hit, and if so, make it green. We avoid
            # hardcoding LLVMFuzzerTestOneInput to be green because some fuzzers may not
            # have a single seed, and in this specific case LLVMFuzzerTestOneInput
            # will be red.
            if demangled_name != "LLVMFuzzerTestOneInput":
                l.error("LLVMFuzzerTestOneInput must be the first node in the calltree")
                exit(1)
            coverage_data = profile.get_function_coverage("LLVMFuzzerTestOneInput")
            if len(coverage_data) == 0:
                # Deliberately not fatal: the node simply ends up with hitcount 0.
                l.error("There is no coverage data (not even all negative).")
            node.cov_parent = "EP"
            for (n_line_number, hit_count_cov) in coverage_data:
                node_hitcount = max(hit_count_cov, node_hitcount)
        elif callstack_has_parent(node, callstack):
            # Find the parent function and check coverage of the node
            parent_name = callstack_get_parent(node, callstack)
            coverage_data = profile.get_function_coverage(fuzz_utils.normalise_str(parent_name), True)
            for (n_line_number, hit_count_cov) in coverage_data:
                if n_line_number == node.src_linenumber and hit_count_cov > 0:
                    node_hitcount = hit_count_cov
            node.cov_parent = parent_name
        else:
            l.error("A node should either be the first or it must have a parent")
            exit(1)
        node.cov_hitcount = node_hitcount
        node.cov_color = get_hit_count_color(node.cov_hitcount)

        # Find the descriptor of the function this node calls. It is used both
        # for the coverage report link and for the repository link.
        dst_fd = None
        for fd in project_profile.all_functions.values():
            if fd.function_name == node.dst_function_name:
                dst_fd = fd
                break

        # Get URL to coverage report for the node.
        link = "#"
        if dst_fd is not None:
            link = coverage_url + "%s.html#L%d" % (
                dst_fd.function_source_file, dst_fd.function_linenumber)
        node.cov_link = link

        # Find the parent
        callsite_link = "#"
        if callstack_has_parent(node, callstack):
            parent_fname = callstack_get_parent(node, callstack)
            # No break here: if several functions share the demangled name the
            # last one wins.
            for fd_demangled_name, fd in demangled_functions:
                if fd_demangled_name == parent_fname:
                    callsite_link = coverage_url + "%s.html#L%d" % (
                        fd.function_source_file,   # parent source file
                        node.src_linenumber)       # callsite line number
        node.cov_callsite_link = callsite_link

        # Get the Github URL to the node. However, if we got a "/" basefolder it means
        # it is a wrong basefolder and we handle this by removing the two first folders
        # in the complete path (which should be in most cases /src/NAME where NAME
        # is the project folder).
        # The URL is derived from the node's own destination function; when that
        # function is unknown we fall back to "#", like the coverage links do.
        if dst_fd is None:
            fd_github_url = "#"
        elif basefolder == "/":
            fd_github_url = "%s/%s#L%d" % (git_repo_url, "/".join(
                dst_fd.function_source_file.split("/")[3:]), dst_fd.function_linenumber)
        else:
            fd_github_url = "%s/%s#L%d" % (git_repo_url, dst_fd.function_source_file.replace(
                basefolder, ""), dst_fd.function_linenumber)
        node.cov_github_url = fd_github_url

    # Extract data about which nodes unlocks data
    prev_end = -1
    for idx1, n1 in enumerate(all_callsites):
        prev = all_callsites[idx1-1] if idx1 > 0 else None
        if n1.cov_hitcount == 0 and ((prev is not None and prev.depth <= n1.depth) or idx1 < prev_end):
            n1.cov_forward_reds = 0
            n1.cov_largest_blocked_func = "none"
            continue

        # Read forward until we see a green node. Depth must be the same or higher
        idx2 = idx1+1
        forward_red = 0
        largest_blocked_name = ""
        largest_blocked_count = 0
        while idx2 < len(all_callsites):
            # Check if we should break or increment forward_red
            n2 = all_callsites[idx2]

            # Break if the node is not at depth or deeper in the calltree than n1
            # Remember:
            #  - the lower the depth, the higher up (closer to LLVMFuzzerTestOneInput) in the calltree
            #  - the higher the depth, the lower down (further away from LLVMFuzzerTestOneInput) in the calltree
            if n2.depth < n1.depth:
                break

            # break if the node is visited. We *could* change this to another metric, e.g.
            # all nodes underneath n1 that are off, i.e. instead of breaking here we would
            # increment forward_red iff cov-hitcount != 0. This, however, would prioritise
            # blockers at the top rather than precisely locate them in the calltree.
            if n2.cov_hitcount != 0:
                break

            # NOTE(review): the demangled function name is compared against the
            # node's raw dst_function_name here - confirm this is intended.
            for fd_demangled_name, fd in demangled_functions:
                if fd_demangled_name == n2.dst_function_name and fd.total_cyclomatic_complexity > largest_blocked_count:
                    largest_blocked_count = fd.total_cyclomatic_complexity
                    largest_blocked_name = n2.dst_function_name
                    break

            forward_red += 1
            idx2 += 1
        prev_end = idx2-1
        n1.cov_forward_reds = forward_red
        n1.cov_largest_blocked_func = largest_blocked_name
def analysis_get_optimal_targets(
        merged_profile: fuzz_data_loader.MergedProjectProfile) -> Tuple[List[fuzz_data_loader.FuzzerProfile], Set[str]]:
    """
    Finds candidate functions to target with new fuzzers.

    Functions in merged_profile.all_functions are visited from the one
    reaching the most functions to the one reaching the fewest. A function is
    selected when all of the following hold:
      - it has a hitcount of 0
      - it reaches at least one other function
      - it takes at least one argument
      - its name does not contain "main2"
      - its total cyclomatic complexity is at least 20

    NOTE: no filtering on overlap between the selected functions is done, so
    the selected functions may reach each other and may reach the same
    functions.

    Returns:
        A tuple (target_fds, optimal_set) where target_fds is the list of
        selected function descriptors, in visiting order, and optimal_set is
        the set of all function names reached by the selected functions.
    """
    l.info(" - in analysis_get_optimal_targets")
    optimal_set = set()
    target_fds = list()

    # reversed(sorted(...)) rather than sorted(..., reverse=True) is kept on
    # purpose: the two order functions with equal reach differently.
    by_reach_descending = reversed(sorted(
        merged_profile.all_functions.values(),
        key=lambda x: len(x.functions_reached)))

    for fd in by_reach_descending:
        if fd.hitcount != 0:
            continue
        if len(fd.functions_reached) < 1:
            continue
        if fd.arg_count == 0:
            continue
        # We do not care about "main2" functions
        if "main2" in fd.function_name:
            continue
        if fd.total_cyclomatic_complexity < 20:
            continue

        optimal_set.update(fd.functions_reached)
        target_fds.append(fd)
    l.info(". Done")
    return target_fds, optimal_set
def _synthesis_target_budget(func_count):
    """Returns the maximum number of targets to synthesize for a project with func_count functions."""
    # Plain descending thresholds, so the boundary values 10000 and 20000 land
    # in the neighbouring bucket instead of falling through to the largest
    # budget.
    if func_count > 20000:
        return 1
    if func_count > 10000:
        return 5
    if func_count > 2000:
        return 7
    return 10


def _synthesize_target_call(tfd, var_idx):
    """
    Generates the C code declaring arguments for, and calling, the function tfd.

    var_idx is the first free variable index. Returns a tuple
    (code, next_free_var_idx).
    """
    simple_decls = {
        "char**": " char **new_var%d = af_get_double_char_p();\n",
        "char*": " char *new_var%d = ada_safe_get_char_p();\n",
        "int": " int new_var%d = ada_safe_get_int();\n",
        "int*": " int *new_var%d = af_get_int_p();\n",
    }
    declarations = []
    call_args = []
    for arg_type in tfd.arg_types:
        arg_type = arg_type.replace(" ", "")
        if arg_type in simple_decls:
            declarations.append(simple_decls[arg_type] % (var_idx))
            call_args.append("new_var%d" % (var_idx))
        elif "struct" in arg_type and "*" in arg_type and "**" not in arg_type:
            # Single pointer to a struct: allocate a zeroed instance.
            c_type = arg_type.replace(".", " ")
            declarations.append(" %s new_var%d = calloc(sizeof(%s), 1);\n" % (
                c_type, var_idx, c_type.replace("*", "")))
            call_args.append("new_var%d" % (var_idx))
        else:
            # No generator for this type; emit a placeholder declaration.
            declarations.append(" UNKNOWN_TYPE unknown_%d;\n" % (var_idx))
            call_args.append("unknown_%d" % (var_idx))
        var_idx += 1

    code = " /* target %s */\n" % (tfd.function_name)
    code += "".join(declarations)
    code += " %s(" % (tfd.function_name)
    code += ", ".join(call_args)
    code += ");\n"
    code += "\n"
    return code, var_idx


def analysis_synthesize_simple_targets(
        merged_profile: fuzz_data_loader.MergedProjectProfile) -> (
            Tuple[Dict[str, Dict[str, Any]],
                  fuzz_data_loader.MergedProjectProfile,
                  List[fuzz_data_loader.FuzzerProfile]]):
    '''
    Function for synthesizing fuzz targets. The way this one works is by
    repeatedly picking the candidate target with the most unreached complexity,
    generating a call to it, and then recomputing the candidates on a profile
    in which that function is marked as reached. The generated calls are
    grouped by the source file of the targeted function, so all functions
    targeted in each fuzzer will be from the same source file.

    Selection stops when the target budget (which depends on the number of
    functions in the project) is used up, when there are no candidates left,
    or when the best candidate has an unreached complexity of 35 or less.

    In a sense, this is more of a PoC way to do some analysis on the data we
    have. It is likely that we could do something much better.

    Returns a tuple of:
      - a dictionary mapping source file name to a dictionary with the keys
        'source_code' (the synthesized fuzzer) and 'target_fds' (the functions
        it targets)
      - the profile obtained by marking all selected targets as reached; the
        merged_profile argument is deep-copied up front and not passed on
      - the list of selected targets, in selection order

    The process is terminated with exit code 1 if a selected target still has
    a hitcount of 0 after having been added to the new profile.
    '''
    l.info(" - in analysis_synthesize_simple_targets")
    new_merged_profile = copy.deepcopy(merged_profile)
    target_fds, _ = analysis_get_optimal_targets(merged_profile)
    fuzzer_code = "#include \"ada_fuzz_header.h\"\n"
    fuzzer_code += "\n"
    fuzzer_code += "int LLVMFuzzerTestOneInput(const uint8_t *data, size_t size) {\n"
    fuzzer_code += " af_safe_gb_init(data, size);\n\n"

    target_codes = dict()
    optimal_functions_targeted = []

    # Variable indices keep increasing across targets so that names stay
    # unique when several targets end up in the same fuzzer.
    var_idx = 0
    max_count = _synthesis_target_budget(len(merged_profile.all_functions))
    curr_count = 0
    while curr_count < max_count:
        l.info(" - sorting by unreached complexity. ")
        by_unreached_complexity = sorted(
            target_fds, key=lambda x: int(x.new_unreached_complexity))
        l.info(". Done")

        if not by_unreached_complexity:
            break
        # The last element of the ascending sort is the candidate with the
        # most unreached complexity.
        tfd = by_unreached_complexity[-1]
        if tfd.new_unreached_complexity <= 35:
            break

        curr_count += 1
        optimal_functions_targeted.append(tfd)

        code, var_idx = _synthesize_target_call(tfd, var_idx)
        per_file = target_codes.setdefault(
            tfd.function_source_file, {'source_code': "", 'target_fds': list()})
        per_file['source_code'] += code
        per_file['target_fds'].append(tfd)

        l.info(" - calling add_func_t_reached_and_clone. ")
        new_merged_profile = fuzz_data_loader.add_func_to_reached_and_clone(new_merged_profile, tfd)

        # Ensure hitcount is set
        tmp_ff = new_merged_profile.all_functions[tfd.function_name]
        if tmp_ff.hitcount == 0:
            l.error("Error. Hitcount did not get set for some reason. Exiting")
            exit(1)
        l.info(". Done")

        # We need to update the optimal targets here.
        # We only need to do this operation if we are actually going to continue analysis
        if curr_count < max_count:
            target_fds, _ = analysis_get_optimal_targets(new_merged_profile)

    final_fuzzers = dict()
    for filename, per_file in target_codes.items():
        file_fuzzer_code = fuzzer_code
        file_fuzzer_code += per_file['source_code']
        file_fuzzer_code += " af_safe_gb_cleanup();\n"
        file_fuzzer_code += "}\n"
        final_fuzzers[filename] = {
            'source_code': file_fuzzer_code,
            'target_fds': per_file['target_fds'],
        }

    l.info("Found the following optimal functions: { %s }"%(
        str([f.function_name for f in optimal_functions_targeted])))

    return final_fuzzers, new_merged_profile, optimal_functions_targeted
def analysis_coverage_runtime_analysis(
        profiles : List[fuzz_data_loader.FuzzerProfile],
        merged_profile : fuzz_data_loader.MergedProjectProfile):
    """
    Identifies the functions that are hit in terms of coverage, but
    only have a low percentage of their lines covered.

    This is useful to highlight functions that need inspection and is
    in contrast to statically-extracted data which gives a hit/not-hit
    verdict on a given function entirely.

    A function is reported when it has more than 50 source lines and less
    than 20% of those lines were hit. Functions for which the hit summary
    cannot be obtained or evaluated are logged and skipped.

    Args:
        profiles: not used by this analysis.
        merged_profile: profile whose runtime_coverage is inspected.

    Returns:
        The list of names of the functions of interest.
    """
    print("In coverage optimal analysis")

    functions_of_interest = []
    runtime_coverage = merged_profile.runtime_coverage
    for funcname in runtime_coverage.get_all_hit_functions():
        try:
            total_function_lines, hit_lines = runtime_coverage.get_hit_summary(funcname)
            # The size check comes first, so the division is never done with
            # a total of zero lines.
            poorly_covered = (
                total_function_lines > 50
                and (hit_lines / total_function_lines) * 100.0 < 20
            )
        except Exception:
            l.error("Error getting hit-summary information for %s"%(funcname))
            continue
        if poorly_covered:
            functions_of_interest.append(funcname)
    return functions_of_interest