Parse "Site Characteristics Database" LevelDB #73

Merged: 2 commits, Feb 2, 2021
4 changes: 4 additions & 0 deletions pyhindsight/analysis.py
@@ -827,6 +827,9 @@ def generate_excel(self, output_object):
w.write(row_number, 5, item.interpretation, blue_value_format) # interpretation
w.write(row_number, 6, item.profile, blue_value_format) # Profile

if friendly_date(item.timestamp) < '1970-01-02':
w.set_row(row_number, options={'hidden': True})

except Exception as e:
log.error(f'Failed to write row to XLSX: {e}')

@@ -835,6 +838,7 @@ def generate_excel(self, output_object):
# Formatting
w.freeze_panes(2, 0) # Freeze top row
w.autofilter(1, 0, row_number, 19) # Add autofilter
w.filter_column('B', 'Timestamp > 1970-01-02')

s = workbook.add_worksheet('Storage')
# Title bar
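A note on the analysis.py change above: XlsxWriter's filter_column() only stores the filter criteria in the worksheet; it does not hide the rows that fail the filter, so rows with timestamps before 1970-01-02 have to be hidden explicitly with set_row(..., options={'hidden': True}). Below is a minimal standalone sketch of that hide-plus-autofilter pattern; the workbook name, sheet layout, and sample rows are made up for illustration and are not pyhindsight's.

import xlsxwriter

# Standalone demo of the hide-plus-autofilter pattern; data is illustrative only.
workbook = xlsxwriter.Workbook('filter_demo.xlsx')
worksheet = workbook.add_worksheet('Timeline')

worksheet.write_row(0, 0, ['Timestamp', 'URL'])  # header row
rows = [
    ('1970-01-01 00:00:00', 'http://epoch.example/'),   # near-epoch timestamp
    ('2021-02-02 12:34:56', 'http://example.com/'),
]

for row_number, (timestamp, url) in enumerate(rows, start=1):
    worksheet.write(row_number, 0, timestamp)
    worksheet.write(row_number, 1, url)
    # filter_column() below only records the criteria; rows failing it must
    # be hidden manually, or Excel will keep displaying them.
    if timestamp < '1970-01-02':
        worksheet.set_row(row_number, options={'hidden': True})

worksheet.autofilter(0, 0, len(rows), 1)                 # add filter dropdowns
worksheet.filter_column('A', 'Timestamp > 1970-01-02')   # store the criteria
workbook.close()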
61 changes: 59 additions & 2 deletions pyhindsight/browsers/chrome.py
@@ -36,11 +36,11 @@ class Chrome(WebBrowser):
def __init__(self, profile_path, browser_name=None, cache_path=None, version=None, timezone=None,
parsed_artifacts=None, parsed_storage=None, storage=None, installed_extensions=None,
artifacts_counts=None, artifacts_display=None, available_decrypts=None, preferences=None,
no_copy=None, temp_dir=None):
no_copy=None, temp_dir=None, origin_hashes=None):
WebBrowser.__init__(self, profile_path, browser_name=browser_name, cache_path=cache_path, version=version,
timezone=timezone, parsed_artifacts=parsed_artifacts, parsed_storage=parsed_storage,
artifacts_counts=artifacts_counts, artifacts_display=artifacts_display,
preferences=preferences, no_copy=no_copy, temp_dir=temp_dir)
preferences=preferences, no_copy=no_copy, temp_dir=temp_dir, origin_hashes=origin_hashes)
self.profile_path = profile_path
self.browser_name = "Chrome"
self.cache_path = cache_path
@@ -52,6 +52,7 @@ def __init__(self, profile_path, browser_name=None, cache_path=None, version=Non
self.preferences = preferences
self.no_copy = no_copy
self.temp_dir = temp_dir
self.origin_hashes = origin_hashes

if self.version is None:
self.version = []
@@ -71,6 +72,9 @@ def __init__(self, profile_path, browser_name=None, cache_path=None, version=Non
if self.preferences is None:
self.preferences = []

if self.origin_hashes is None:
self.origin_hashes = {}

if self.artifacts_counts is None:
self.artifacts_counts = {}

@@ -1970,6 +1974,52 @@ def get_file_system(self, path, dir_name):
self.artifacts_counts['File System'] = len(result_list)
self.parsed_storage.extend(result_list)

def get_site_characteristics(self, path, dir_name):
result_list = []

self.build_hash_list_of_origins()

log.info('Site Characteristics:')
sc_root_path = os.path.join(path, dir_name)
log.info(f' - Reading from {sc_root_path}')

# Grab listing of 'Site Characteristics' directory
sc_root_listing = os.listdir(sc_root_path)
log.debug(f' - {len(sc_root_listing)} files in Site Characteristics directory: {str(sc_root_listing)}')

items = utils.get_ldb_records(sc_root_path)
for item in items:
try:
from pyhindsight.lib.site_data_pb2 import SiteDataProto

if item['key'] == b'database_metadata':
if item['value'] != b'1':
log.warning(f' - Expected database_metadata version 1; got {item["value"].decode()}. Trying to parse anyway.')
continue

raw_proto = item['value']

# Deleted records won't have a value
if raw_proto:
# SiteDataProto built from components/performance_manager/persistence/site_data/site_data.proto
parsed_proto = SiteDataProto.FromString(raw_proto)
last_loaded = parsed_proto.last_loaded
else:
parsed_proto = ''
last_loaded = 0

matched_url = self.origin_hashes.get(item['key'].decode(), f'MD5 of origin: {item["key"].decode()}')
result_list.append(Chrome.PreferenceItem(
self.profile_path, url=matched_url, timestamp=utils.to_datetime(last_loaded, self.timezone),
key=f'Status: {item["state"]}', value=str(parsed_proto), interpretation=''))

except Exception as e:
log.exception(f' - Exception parsing SiteDataProto ({item}): {e}')

log.info(f' - Parsed {len(result_list)} items')
self.artifacts_counts['Site Characteristics'] = len(result_list)
self.parsed_artifacts.extend(result_list)

def process(self):
supported_databases = ['History', 'Archived History', 'Media History', 'Web Data', 'Cookies', 'Login Data',
'Extension Cookies']
@@ -2146,6 +2196,13 @@ def process(self):
self.artifacts_display['Preferences'],
self.artifacts_counts.get('Preferences', '0')))

if 'Site Characteristics Database' in input_listing:
self.get_site_characteristics(self.profile_path, 'Site Characteristics Database')
self.artifacts_display['Site Characteristics'] = "Site Characteristics records"
print(self.format_processing_output(
self.artifacts_display['Site Characteristics'],
self.artifacts_counts.get('Site Characteristics', '0')))

if 'File System' in input_listing:
self.get_file_system(self.profile_path, 'File System')
self.artifacts_display['File System'] = 'File System Items'
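For readers following the new get_site_characteristics() method above, here is a condensed sketch of its core loop as a free-standing function. It assumes records shaped like the dicts the diff reads from utils.get_ldb_records() (bytes 'key' and 'value' plus a 'state' field) and the SiteDataProto message compiled from Chromium's site_data.proto, both referenced in the diff; the function name and the tuple output are illustrative only, not part of pyhindsight's API.

from pyhindsight.lib.site_data_pb2 import SiteDataProto


def parse_site_characteristics(records, origin_hashes):
    """Illustrative sketch; not pyhindsight's actual API.

    records: dicts with bytes 'key'/'value' and a 'state' field, as produced
             by pyhindsight's LevelDB helper.
    origin_hashes: md5(hostname) hex digest -> hostname, as built by
             WebBrowser.build_hash_list_of_origins().
    """
    results = []
    for record in records:
        # The metadata record only carries the database schema version.
        if record['key'] == b'database_metadata':
            continue

        if record['value']:
            proto = SiteDataProto.FromString(record['value'])
            last_loaded = proto.last_loaded
        else:
            # Deleted records keep their key but have no value left to parse.
            proto, last_loaded = None, 0

        key_hex = record['key'].decode()
        origin = origin_hashes.get(key_hex, f'MD5 of origin: {key_hex}')
        results.append((origin, last_loaded, record['state'], proto))
    return results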
18 changes: 16 additions & 2 deletions pyhindsight/browsers/webbrowser.py
@@ -1,6 +1,8 @@
import hashlib
import logging
import sqlite3
import sys
import urllib.parse
from pyhindsight import utils

log = logging.getLogger(__name__)
@@ -10,7 +12,7 @@ class WebBrowser(object):
def __init__(
self, profile_path, browser_name, cache_path=None, version=None, display_version=None,
timezone=None, structure=None, parsed_artifacts=None, parsed_storage=None, artifacts_counts=None,
artifacts_display=None, preferences=None, no_copy=None, temp_dir=None):
artifacts_display=None, preferences=None, no_copy=None, temp_dir=None, origin_hashes=None):
self.profile_path = profile_path
self.browser_name = browser_name
self.cache_path = cache_path
@@ -25,6 +27,7 @@ def __init__(self,
self.preferences = preferences
self.no_copy = no_copy
self.temp_dir = temp_dir
self.origin_hashes = origin_hashes

if self.version is None:
self.version = []
@@ -44,6 +47,9 @@ def __init__(
if self.preferences is None:
self.preferences = []

if self.origin_hashes is None:
self.origin_hashes = {}

@staticmethod
def format_processing_output(name, items):
width = 80
@@ -104,6 +110,14 @@ def dict_factory(cursor, row):
d[col[0]] = row[idx]
return d

def build_hash_list_of_origins(self):
for artifact in self.parsed_artifacts:
if isinstance(artifact, self.HistoryItem):
domain = urllib.parse.urlparse(artifact.url).hostname
# Some URLs don't have a domain, like local PDF files
if domain:
self.origin_hashes[hashlib.md5(domain.encode()).hexdigest()] = domain

class HistoryItem(object):
def __init__(self, item_type, timestamp, profile, url=None, name=None, value=None, interpretation=None):
self.row_type = item_type
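build_hash_list_of_origins() above gives get_site_characteristics() a way to turn Site Characteristics record keys back into readable origins: it MD5-hashes every hostname already parsed from history and keeps a digest-to-hostname map that record keys are looked up in. A short round-trip sketch, using only the standard library; the hostname is an arbitrary example.

import hashlib
import urllib.parse

# Any parsed HistoryItem URL would work the same way; this one is made up.
url = 'https://www.example.com/some/page'
hostname = urllib.parse.urlparse(url).hostname        # 'www.example.com'

# Same digest construction as build_hash_list_of_origins() above.
origin_hashes = {hashlib.md5(hostname.encode()).hexdigest(): hostname}

# A record key, once decoded from bytes, is resolved the way
# get_site_characteristics() does it, with an 'MD5 of origin' fallback.
record_key = hashlib.md5(b'www.example.com').hexdigest()
print(origin_hashes.get(record_key, f'MD5 of origin: {record_key}'))  # www.example.com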