# Data Integrity
In this notebook, we do several sanity checks on our data.
If a sanity check fails, either something went wrong
or our assumptions were wrong.

## Ghidra Function to IDA Function Mapping
We use Ghidra for decompilation but our ground truth set uses IDA.
In this section, we validate that our database looks as if we had used only Ghidra and never IDA.

```sql
-- Number of rows in bsim.desctable.
SELECT COUNT(*)
FROM bsim.desctable AS description
```

```sql
-- Binaries that have been analyzed, i.e. whose md5 also occurs in bsim.exetable.
SELECT analyzed.*
FROM bsim.exetable AS exe
	INNER JOIN "binary" AS analyzed
	ON analyzed.md5 = exe.md5
``` 

```sql
-- Ensure that we have a vector for a function if and only if the function belongs to the binary.
-- Note that statically-linked or compiler-inserted functions belong to the binary.
```



```sql
-- Which functions have different names in Ghidra/IDA?
-- Only functions present on both sides (same binary, same address) are compared.
-- Rows where either name is NULL are not reported.

WITH binary_analyzed AS (
	-- Pair each of our binaries with its BSim executable via the md5 hash.
	SELECT b.id AS binary_id, e.id AS executable_id
	FROM "binary" AS b
	INNER JOIN bsim.exetable AS e
	ON b.md5 = e.md5
),
function_data AS (
	-- Functions of our data set; the address is offset + image base so that
	-- it can be compared with bsim.desctable.addr.
	SELECT f.id AS id, f.offset + b.image_base AS address, b.id AS binary_id, f.name
	FROM "function" AS f
	INNER JOIN "binary" AS b ON f.binary_id = b.id
)

SELECT *
FROM bsim.desctable AS description
	-- Join all analyzed binaries to discard linked functions
	INNER JOIN binary_analyzed ON (
		description.id_exe = binary_analyzed.executable_id
	)
	-- Inner join: a name can only differ if the function exists on both sides.
	-- (The WHERE clause below already discarded unmatched rows of the former
	-- outer join, because a comparison with NULL is never true.)
	INNER JOIN function_data ON (
		binary_analyzed.binary_id = function_data.binary_id AND
		description.addr = function_data.address
	)
WHERE NOT (
	-- Names are considered equal if they are identical, or identical once a
	-- trailing "_<counter>" is stripped from our function name.
	-- The Ghidra name is compared literally instead of being spliced into a
	-- regular expression, so names containing regex metacharacters
	-- (".", "+", "(", "*", ...) neither mis-match nor raise an error.
	function_data.name = description.name_func
	OR regexp_replace(function_data.name, '_[0-9]+$', '') = description.name_func
)
 ```

```sql
-- Which functions did Ghidra find, that IDA did not find?
-- Anti-join: keep the Ghidra functions for which no function with the same
-- binary, address and (counter-insensitive) name exists in our data set.

-- CTEs are scoped to a single statement, so they have to be defined here as well.
WITH binary_analyzed AS (
	-- Pair each of our binaries with its BSim executable via the md5 hash.
	SELECT b.id AS binary_id, e.id AS executable_id
	FROM "binary" AS b
	INNER JOIN bsim.exetable AS e
	ON b.md5 = e.md5
),
function_data AS (
	-- Functions of our data set; the address is offset + image base so that
	-- it can be compared with bsim.desctable.addr.
	SELECT f.id AS id, f.offset + b.image_base AS address, b.id AS binary_id, f.name
	FROM "function" AS f
	INNER JOIN "binary" AS b ON f.binary_id = b.id
)

SELECT *
FROM bsim.desctable AS description
	-- Join all analyzed binaries to discard linked functions
	INNER JOIN binary_analyzed ON (
		description.id_exe = binary_analyzed.executable_id
	)
	-- Outer join to not discard functions that ghidra found
	LEFT JOIN function_data ON (
		binary_analyzed.binary_id = function_data.binary_id AND
		description.addr = function_data.address
		-- In Ghidra, multiple functions (as defined by their address) can have the same name.
		-- In IDA, a counter is appended.
		-- The Ghidra name is compared literally (after stripping that counter)
		-- instead of being spliced into a regular expression, so names with
		-- regex metacharacters neither mis-match nor raise an error.
		AND (
			function_data.name = description.name_func
			OR regexp_replace(function_data.name, '_[0-9]+$', '') = description.name_func
		)
	)
-- Keep only the Ghidra functions without a counterpart.
WHERE function_data.id IS NULL
```



```sql
-- How many intra-binary edges are in Ghidra's call-graph?
-- An edge is counted unless its source or its destination function belongs
-- to an executable whose name_exec is 'unknown'.

SELECT COUNT(*)
FROM bsim.callgraphtable AS edge
	INNER JOIN bsim.desctable AS caller ON (
		caller.id = edge.src
	)
	INNER JOIN bsim.desctable AS callee ON (
		callee.id = edge.dest
	)
WHERE NOT EXISTS (
	-- A single probe covers both endpoints of the edge.
	SELECT 1
	FROM bsim.exetable AS dynlib
	WHERE dynlib.name_exec = 'unknown'
		AND (dynlib.id = caller.id_exe OR dynlib.id = callee.id_exe)
)


```

```sql
-- Does it match the number of edges in our call-graph?
-- It turns out we lose very few.

SELECT COUNT(*)
FROM call_graph_edge AS edge
```