Skip to content

Commit

Permalink
Merge 345f7b5 into 0371d43
Browse files Browse the repository at this point in the history
  • Loading branch information
mtlynch committed Oct 30, 2018
2 parents 0371d43 + 345f7b5 commit 3aee288
Show file tree
Hide file tree
Showing 7 changed files with 482 additions and 13 deletions.
14 changes: 13 additions & 1 deletion chat_unifier/file_iterators/pidgin.py
@@ -1,13 +1,25 @@
import os

# Chat media whose logs iterate_files skips. A log's medium is the directory
# name returned by _get_log_medium for its path.
_IGNORED_MEDIA = ['irc']


def iterate_files(directory):
    """Yield the path of each Pidgin log file found under a directory.

    Walks directory recursively and yields the joined path of every file
    that _is_log_file accepts, except logs whose medium (as reported by
    _get_log_medium) is listed in _IGNORED_MEDIA.

    Args:
        directory: Root directory to search.

    Yields:
        Paths of matching log files, in os.walk order.
    """
    for root, _, filenames in os.walk(directory):
        for filename in filenames:
            if not _is_log_file(filename):
                continue
            log_path = os.path.join(root, filename)
            # Filter before yielding so ignored media never reach the caller.
            if _get_log_medium(log_path) in _IGNORED_MEDIA:
                continue
            yield log_path


def _is_log_file(filename):
    """Return True if filename ends in the '.html' extension.

    The comparison is case-sensitive, so 'LOG.HTML' is not a log file.
    """
    _, extension = os.path.splitext(filename)
    return extension == '.html'


def _get_log_medium(log_path):
    """Return the medium component of a log path, or None if it is too short.

    The medium is the fourth path component counting from the end, i.e. the
    <medium> in <medium>/<dir>/<dir>/<file>. The path is split on
    os.path.sep only.

    Args:
        log_path: Path to a log file.

    Returns:
        The medium directory name, or None when the path has fewer than four
        components.
    """
    path_parts = log_path.split(os.path.sep)
    if len(path_parts) < 4:
        return None
    return path_parts[-4]
58 changes: 51 additions & 7 deletions chat_unifier/parsers/pidgin/html_reader.py
Expand Up @@ -60,11 +60,15 @@ def __init__(self):
def results(self):
return self._results

def feed(self, html):
    """Annotate the log HTML and pass it to HTMLParser.feed.

    Args:
        html: Contents of a Pidgin HTML log.
    """
    # Annotation replaces line-terminating <br/> tags with <message-end/> so
    # they can be told apart from line breaks inside a message.
    html_annotated = _annotate_html(html)
    HTMLParser.feed(self, html_annotated)

def handle_starttag(self, tag, attrs):
attrs_dict = dict(attrs)
if tag == 'title':
self._update_state(_STATE_PARSING_TITLE)
if ((self._state == _STATE_SEEKING_NEXT_MESSAGE) and (tag == 'font')):
elif ((self._state == _STATE_SEEKING_NEXT_MESSAGE) and (tag == 'font')):
if 'color' in attrs_dict:
font_color = attrs_dict['color']
if _is_local_user_font_color(font_color):
Expand Down Expand Up @@ -94,6 +98,9 @@ def handle_endtag(self, tag):

def handle_startendtag(self, tag, attrs):
    """Handle a self-closing tag while message contents are being parsed.

    A <br/> adds a newline to the current message contents; a
    <message-end/> switches the state back to seeking the next message.
    Self-closing tags seen in any other state are ignored.

    Args:
        tag: Name of the self-closing tag.
        attrs: Attributes of the tag (unused).
    """
    if self._state != _STATE_PARSING_CONTENTS:
        return
    if tag == 'br':
        self._add_message_contents('\n')
    elif tag == 'message-end':
        self._update_state(_STATE_SEEKING_NEXT_MESSAGE)

def handle_data(self, data):
Expand All @@ -106,22 +113,59 @@ def handle_data(self, data):
elif self._state == _STATE_PARSING_CONTENTS:
if not data.strip():
return
self._add_message_contents(data)
self._add_message_contents(data.decode('utf8'))

def handle_entityref(self, name):
    """Decode a named entity reference and add it to the current field.

    The decoded text is added to the message contents or to the display
    name, depending on the current state; in any other state it is dropped.

    Args:
        name: Entity name without the surrounding '&' and ';'.
    """
    decoded = _decode_html_entity_ref(name)
    if self._state == _STATE_PARSING_CONTENTS:
        self._add_message_contents(decoded)
    elif self._state == _STATE_PARSING_DISPLAY_NAME:
        self._add_display_name(decoded)

def handle_charref(self, name):
    """Decode a numeric character reference and add it to the current field.

    The decoded text is added to the message contents or to the display
    name, depending on the current state; in any other state it is dropped.

    Args:
        name: Reference body without the surrounding '&#' and ';'.
    """
    decoded = _decode_html_char_ref(name)
    if self._state == _STATE_PARSING_CONTENTS:
        self._add_message_contents(decoded)
    elif self._state == _STATE_PARSING_DISPLAY_NAME:
        self._add_display_name(decoded)

def _add_title(self, title):
    """Record a title result."""
    self._results.append((RESULT_TYPE_TITLE, title))

def _add_message_start(self, message_type):
    """Record the start of a message along with its message type."""
    self._results.append((RESULT_TYPE_MESSAGE_START, message_type))

def _add_timestamp(self, timestamp):
    """Record a timestamp result."""
    self._results.append((RESULT_TYPE_TIMESTAMP, timestamp))

def _add_display_name(self, display_name):
    """Record display-name text, merging it with a preceding display name.

    Coalescing keeps a name that arrives in several pieces (for example
    plain text followed by a decoded entity) as a single result.
    """
    self._append_or_coalesce_result(RESULT_TYPE_DISPLAY_NAME, display_name)

def _add_message_contents(self, message_contents):
    """Record message text, merging it with preceding message contents.

    Coalescing keeps contents that arrive in several pieces (text, line
    breaks, decoded entities) as a single result.
    """
    self._append_or_coalesce_result(RESULT_TYPE_MESSAGE_CONTENTS,
                                    message_contents)

def _append_or_coalesce_result(self, result_type, result_value):
    """Append a result, concatenating it onto the last one if types match.

    Args:
        result_type: Type tag of the result.
        result_value: Value to record; must support + with the previous
            value when the previous result has the same type.
    """
    if self._results and self._results[-1][0] == result_type:
        _, last_result_value = self._results[-1]
        # Replace the tail entry in place rather than pop and re-append.
        self._results[-1] = (result_type, last_result_value + result_value)
    else:
        self._results.append((result_type, result_value))

def _update_state(self, new_state):
    """Set the parser state to new_state."""
    self._state = new_state


def _annotate_html(html):
    """Return html with line-terminating <br/> tags marked as message ends.

    CRLF line endings are normalized to LF first, then every '<br/>' that is
    immediately followed by a newline becomes '<message-end/>'.
    """
    # We need to specially mark line-terminating <br> tags otherwise there's
    # ambiguity in where the message ends (<br> can appear within messages).
    normalized = html.replace('\r\n', '\n')
    return normalized.replace('<br/>\n', '<message-end/>\n')


def _decode_html_entity_ref(entity_ref):
    """Return the text for a named HTML entity.

    Args:
        entity_ref: Entity name without the surrounding '&' and ';'.
    """
    return HTMLParser().unescape('&' + entity_ref + ';')


def _decode_html_char_ref(entity_ref):
    """Return the text for a numeric HTML character reference.

    Args:
        entity_ref: Reference body without the surrounding '&#' and ';'.
    """
    return HTMLParser().unescape('&#' + entity_ref + ';')
154 changes: 154 additions & 0 deletions chat_unifier/parsers/pidgin/parser.py
@@ -0,0 +1,154 @@
from __future__ import absolute_import

import datetime
import re

from chat_unifier import models
from chat_unifier.parsers.pidgin import html_reader

# Matches a conversation title string and captures the remote username, the
# start date and time (12-hour clock plus AM/PM), the local username, and the
# chat medium.
_TITLE_PATTERN = re.compile(
r'^Conversation with (?P<remote_username>.+) at (?P<start_date>\d{1,2}/\d{1,2}/\d{4}) (?P<start_time>\d{1,2}:\d{1,2}:\d{1,2}) (?P<am_pm>[AP]M) on (?P<local_username>.+) \((?P<medium>.+)\)$'
)


class Error(Exception):
    """Base class for errors raised by this parser module."""


class UnexpectedResultType(Error):
    """The next reader result did not have the expected result type."""


class UnexpectedMessageDirection(Error):
    """A message start carried an unrecognized message direction."""


class InvalidMetadata(Error):
    """The conversation title did not match the expected format."""


class Parser(object):
    """Parses the contents of a Pidgin HTML log into a chat history."""

    def parse(self, log_contents):
        """Parse a log and return the converted history.

        Args:
            log_contents: Full contents of a Pidgin HTML log file.

        Returns:
            The value produced by _ResultsToHistoryConverter.convert() for
            the results read from the log.
        """
        reader = html_reader.Reader()
        reader.feed(log_contents)
        converter = _ResultsToHistoryConverter(reader.results)
        return converter.convert()


class _ResultsToHistoryConverter(object):
    """Converts a list of html_reader results into a models.History.

    Results are consumed from the front of the list given to the
    constructor, so that list is emptied as the conversion proceeds.
    """

    # Formats used when parsing and rebuilding message timestamps.
    _DATE_FORMAT = '%m/%d/%Y'
    _DATETIME_FORMAT = '%m/%d/%Y %I:%M:%S %p'

    def __init__(self, results):
        """Store the (result_type, value) tuples to convert."""
        self._results = results
        self._metadata = None
        self._last_timestamp = None

    def convert(self):
        """Consume all results and return the resulting models.History.

        Raises:
            UnexpectedResultType: If results do not arrive in the expected
                order (title, then repeated message start / timestamp /
                display name / contents).
            InvalidMetadata: If the title has an unexpected format.
            IndexError: If the results run out in the middle of a message.
        """
        self._process_metadata()
        return models.History(
            local_username=self._metadata['local_username'],
            remote_username=self._metadata['remote_username'],
            messages=self._process_messages())

    def _process_metadata(self):
        """Pop the title result and derive conversation metadata from it."""
        title = self._pop_result_with_type(html_reader.RESULT_TYPE_TITLE)
        self._metadata = _metadata_from_title(title)
        # Time-only message timestamps are resolved relative to this value.
        self._last_timestamp = self._metadata['start_timestamp']

    def _process_messages(self):
        """Convert every remaining result into a list of messages."""
        messages = []
        while self._results:
            messages.append(self._process_next_message())
        return messages

    def _process_next_message(self):
        """Pop the four results that make up one message and convert them."""
        message_direction = self._pop_result_with_type(
            html_reader.RESULT_TYPE_MESSAGE_START)
        timestamp_raw = self._pop_result_with_type(
            html_reader.RESULT_TYPE_TIMESTAMP)
        # TODO(mtlynch): Save the display name.
        self._pop_result_with_type(html_reader.RESULT_TYPE_DISPLAY_NAME)
        contents = self._pop_result_with_type(
            html_reader.RESULT_TYPE_MESSAGE_CONTENTS)

        return models.Message(
            sender=self._sender_from_message_direction(message_direction),
            timestamp=self._parse_message_timestamp(timestamp_raw),
            contents=contents)

    def _sender_from_message_direction(self, message_direction):
        """Map a message direction to the local or remote username.

        Raises:
            UnexpectedMessageDirection: If the direction is neither outgoing
                nor incoming.
        """
        if message_direction == html_reader.MESSAGE_DIRECTION_OUTGOING:
            return self._metadata['local_username']
        elif message_direction == html_reader.MESSAGE_DIRECTION_INCOMING:
            return self._metadata['remote_username']
        else:
            raise UnexpectedMessageDirection(
                'Unrecognized message direction: %s' % message_direction)

    def _parse_message_timestamp(self, time_string):
        """Parse a parenthesized message timestamp into a datetime.

        A timestamp without a date takes the date of the previously seen
        timestamp, and is moved to the following day if that would put it
        before the previous timestamp. The parsed value becomes the new
        previous timestamp.
        """
        # Strip parens from timestamp.
        time_string = time_string[1:-1]
        if _timestamp_includes_date(time_string):
            timestamp = datetime.datetime.strptime(time_string,
                                                   self._DATETIME_FORMAT)
        else:
            datetime_string = (
                self._last_timestamp.strftime(self._DATE_FORMAT) + ' ' +
                time_string)
            timestamp = datetime.datetime.strptime(datetime_string,
                                                   self._DATETIME_FORMAT)
            if self._timestamp_rolled_over_to_next_day(timestamp):
                timestamp += datetime.timedelta(days=1)

        self._last_timestamp = timestamp
        return timestamp

    def _timestamp_rolled_over_to_next_day(self, timestamp):
        """Return True if timestamp is earlier than the previous timestamp."""
        return timestamp < self._last_timestamp

    def _peek_next_result_type(self):
        """Return the type of the next result, or None if none remain."""
        if not self._results:
            return None
        result_type, _ = self._results[0]
        return result_type

    def _pop_result_with_type(self, result_type_expected):
        """Remove the next result and return its value.

        Raises:
            UnexpectedResultType: If the next result has a different type.
            IndexError: If no results remain.
        """
        result_type, result_value = self._results.pop(0)
        if result_type != result_type_expected:
            raise UnexpectedResultType(
                'Expected result type %s, but got %s:%s' %
                (result_type_expected, result_type, result_value))
        return result_value


def _metadata_from_title(title):
    """Extract conversation metadata from a log title.

    Args:
        title: Title string, expected to match _TITLE_PATTERN.

    Returns:
        A dict with keys 'local_username' (with any '/'-suffix removed),
        'remote_username', 'medium' and 'start_timestamp' (a datetime).

    Raises:
        InvalidMetadata: If the title does not match _TITLE_PATTERN.
    """
    match = _TITLE_PATTERN.match(title)
    if not match:
        raise InvalidMetadata('Unexpected metadata format: %s' % title)
    local_username = _strip_username_suffix(match.group('local_username'))
    start_timestamp = _parse_timestamp_parts(
        match.group('start_date'), match.group('start_time'),
        match.group('am_pm'))
    return {
        'local_username': local_username,
        'remote_username': match.group('remote_username'),
        'medium': match.group('medium'),
        'start_timestamp': start_timestamp,
    }


def _strip_username_suffix(username):
    """Return username up to, but not including, its first '/'.

    A username without a '/' is returned unchanged.
    """
    # split() returns the whole string when there is no separator, so no
    # separate membership check is needed.
    return username.split('/', 1)[0]


def _timestamp_includes_date(timestamp):
    """Return True if the timestamp string contains a '/' (a date part)."""
    return '/' in timestamp


def _parse_timestamp_parts(date_string, time_string, am_pm):
    """Combine date, time and AM/PM strings into a datetime.

    Args:
        date_string: Date as month/day/year, e.g. '10/30/2018'.
        time_string: 12-hour time as hour:minute:second, e.g. '1:02:03'.
        am_pm: 'AM' or 'PM'.

    Returns:
        The corresponding naive datetime.datetime.

    Raises:
        ValueError: If the combined string does not match the format.
    """
    timestamp_string = '%s %s %s' % (date_string, time_string, am_pm)
    return datetime.datetime.strptime(timestamp_string, '%m/%d/%Y %I:%M:%S %p')
12 changes: 7 additions & 5 deletions main.py
Expand Up @@ -7,6 +7,7 @@

from chat_unifier import json_serializer
from chat_unifier import history_merger
from chat_unifier.parsers.pidgin import parser as pidgin_parser
from chat_unifier.parsers.trillian_xml import parser as trillian_parser
from chat_unifier.file_iterators import pidgin as pidgin_iterator
from chat_unifier.file_iterators import trillian_xml as trillian_xml_iterator
Expand All @@ -31,7 +32,7 @@ def main(args):
merger = history_merger.Merger()
processors = [
(args.trillian, trillian_xml_iterator, trillian_parser.Parser()),
(args.pidgin, pidgin_iterator, None),
(args.pidgin, pidgin_iterator, pidgin_parser.Parser()),
]
for dir_roots, file_iterator, log_parser in processors:
if dir_roots:
Expand All @@ -50,11 +51,12 @@ def _process_log_dirs(dir_roots, file_iterator, log_parser, merger):
def _process_log_dir(dir_root, file_iterator, log_parser, merger):
    """Parse every log under dir_root and add the results to merger.

    A log that fails to parse is logged as an error and skipped, so one bad
    file does not abort the whole run.

    Args:
        dir_root: Directory to search for logs.
        file_iterator: Module or object exposing iterate_files(dir_root).
        log_parser: Object exposing parse(log_contents).
        merger: Object exposing add(history).
    """
    logger.info('Searching for logs in %s', dir_root)
    for log_path in file_iterator.iterate_files(dir_root):
        logger.info('Parsing %s', log_path)
        with open(log_path) as log_handle:
            log_contents = log_handle.read()
        try:
            merger.add(log_parser.parse(log_contents))
        except Exception as ex:
            # Log the exception itself: ex.message is deprecated and is empty
            # for many exception types.
            logger.error('Failed to parse: %s', ex)
            continue
        # Only reached on success, so failures are not reported as parsed.
        logger.info('Parsed %s', os.path.basename(log_path))


Expand Down
20 changes: 20 additions & 0 deletions tests/file_iterators/test_pidgin.py
Expand Up @@ -31,3 +31,23 @@ def test_picks_correct_log_files(self):
'/log/aim/LocalUser123/RemoteUser456/2006-11-19.195755-0500EST.html',
'/log/aim/LocalUser123/RemoteUser456/2006-11-22.112333-0500EST.html',
], [f for f in pidgin.iterate_files('/logs')])

def test_ignores_irc_log_files(self):
    """iterate_files yields the AIM logs and skips the log under irc."""
    with mock.patch.object(os, 'walk') as mock_walk:
        mock_walk.return_value = [
            ('/logs', ('aim', 'irc'), ('README.txt',)),
            ('/logs/aim', ('LocalUser123',), ()),
            ('/log/aim/LocalUser123', ('RemoteUser345',), ()),
            ('/log/aim/LocalUser123/RemoteUser345', (),
             ('2007-02-24.020826-0500EST.html',
              '2007-02-25.154550-0500EST.html')),
            ('/log/irc', ('localuser123@irc.freenode.net',), ()),
            ('/log/irc/localuser123@irc.freenode.net', ('#dummy.chat',),
             ()),
            ('/log/irc/localuser123@irc.freenode.net/#dummy.chat', (),
             ('2006-06-21.200806-0400EST.html',)),
        ]
        # Consume the generator while os.walk is still patched.
        self.assertEqual([
            '/log/aim/LocalUser123/RemoteUser345/2007-02-24.020826-0500EST.html',
            '/log/aim/LocalUser123/RemoteUser345/2007-02-25.154550-0500EST.html',
        ], list(pidgin.iterate_files('/logs')))

0 comments on commit 3aee288

Please sign in to comment.