-
Notifications
You must be signed in to change notification settings - Fork 0
/
html_reader.py
171 lines (131 loc) · 6 KB
/
html_reader.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
try:
    from HTMLParser import HTMLParser  # Python 2
except ImportError:
    from html.parser import HTMLParser  # Python 3
# Tags used as the first element of each (type, value) tuple in
# Reader.results.
RESULT_TYPE_TITLE = 'title'
RESULT_TYPE_MESSAGE_START = 'message-start'
RESULT_TYPE_TIMESTAMP = 'timestamp'
RESULT_TYPE_DISPLAY_NAME = 'display-name'
RESULT_TYPE_MESSAGE_CONTENTS = 'message-contents'

# Values paired with RESULT_TYPE_MESSAGE_START.
MESSAGE_DIRECTION_INCOMING = 'incoming'
MESSAGE_DIRECTION_OUTGOING = 'outgoing'

# Internal parser states, numbered 1 through 8 in the order listed.
(
    _STATE_SEEKING_TITLE,
    _STATE_PARSING_TITLE,
    _STATE_SEEKING_NEXT_MESSAGE,
    _STATE_PARSING_TIMESTAMP,
    _STATE_SEEKING_DISPLAY_NAME,
    _STATE_PARSING_DISPLAY_NAME,
    _STATE_SEEKING_CONTENTS,
    _STATE_PARSING_CONTENTS,
) = range(1, 9)
class Error(Exception):
    """Base class for the exceptions defined in this module."""
class UnexpectedFontColor(Error):
    """Raised when a <font> tag carries a color this module does not know."""
_LOCAL_USER_FONT_COLOR = '#16569E'


def _is_local_user_font_color(color):
    """Return True if color is exactly the local user's font color.

    The comparison is case-sensitive.
    """
    return color == _LOCAL_USER_FONT_COLOR
_REMOTE_USER_FONT_COLOR = '#A82F2F'


def _is_remote_user_font_color(color):
    """Return True if color is exactly the remote user's font color.

    The comparison is case-sensitive.
    """
    return color == _REMOTE_USER_FONT_COLOR
_SYSTEM_MESSAGE_FONT_COLOR = '#FF0000'


def _is_system_message_font_color(color):
    """Return True if color is exactly the system-message font color.

    The comparison is case-sensitive.
    """
    return color == _SYSTEM_MESSAGE_FONT_COLOR
_PIDGIN_MESSAGE_FONT_COLOR = '#062585'


def _is_pidgin_message_font_color(color):
    """Return True if color is exactly the Pidgin-message font color.

    The comparison is case-sensitive.
    """
    return color == _PIDGIN_MESSAGE_FONT_COLOR
class Reader(HTMLParser):
    """Read relevant elements from Pidgin HTML log file.

    Read a Pidgin HTML log file, pulling out the relevant elements with minimal
    parsing. This is meant only to scrape the screen and leave more complex
    parsing to other components.

    The parser is a small state machine. As it walks the document it appends
    (result_type, value) tuples to `results`, where result_type is one of the
    RESULT_TYPE_* constants.
    """

    # (current state, closing tag) -> state to move to. Any closing tag that
    # is not listed for the current state leaves the state unchanged.
    _END_TAG_TRANSITIONS = {
        (_STATE_PARSING_TITLE, 'title'): _STATE_SEEKING_NEXT_MESSAGE,
        (_STATE_PARSING_TIMESTAMP, 'font'): _STATE_SEEKING_DISPLAY_NAME,
        (_STATE_PARSING_DISPLAY_NAME, 'b'): _STATE_SEEKING_CONTENTS,
        (_STATE_SEEKING_CONTENTS, 'font'): _STATE_PARSING_CONTENTS,
    }

    def __init__(self):
        # The base initializer is called explicitly (not through super()),
        # exactly as the original code did.
        HTMLParser.__init__(self)
        self._state = _STATE_SEEKING_TITLE
        self._results = []

    @property
    def results(self):
        """The list of (result_type, value) tuples collected so far."""
        return self._results

    def feed(self, html):
        """Annotate message-terminating <br/> tags, then parse the HTML.

        Args:
            html: Contents of the log file (or a chunk of it).
        """
        HTMLParser.feed(self, _annotate_html(html))

    def handle_starttag(self, tag, attrs):
        """React to an opening tag according to the current state."""
        if tag == 'title':
            self._update_state(_STATE_PARSING_TITLE)
        elif self._state == _STATE_SEEKING_NEXT_MESSAGE and tag == 'font':
            self._handle_message_font(dict(attrs))
        elif self._state == _STATE_SEEKING_DISPLAY_NAME and tag == 'b':
            self._update_state(_STATE_PARSING_DISPLAY_NAME)

    def _handle_message_font(self, attrs_dict):
        """Start a message if a <font> tag's color identifies its sender.

        Args:
            attrs_dict: Attributes of the <font> tag as a dict.

        Raises:
            UnexpectedFontColor: The tag has a color attribute whose value is
                none of the four known colors.
        """
        if 'color' not in attrs_dict:
            return
        font_color = attrs_dict['color']
        if _is_local_user_font_color(font_color):
            direction = MESSAGE_DIRECTION_OUTGOING
        elif _is_remote_user_font_color(font_color):
            direction = MESSAGE_DIRECTION_INCOMING
        elif (_is_system_message_font_color(font_color) or
              _is_pidgin_message_font_color(font_color)):
            # System and Pidgin lines are skipped; keep seeking a message.
            return
        else:
            raise UnexpectedFontColor(
                'Font color %s is unexpected' % font_color)
        self._add_message_start(direction)
        self._update_state(_STATE_PARSING_TIMESTAMP)

    def handle_endtag(self, tag):
        """Advance the state machine when an expected closing tag is seen."""
        next_state = self._END_TAG_TRANSITIONS.get((self._state, tag))
        if next_state is not None:
            self._update_state(next_state)

    def handle_startendtag(self, tag, attrs):
        """Handle self-closing tags while inside a message body.

        <br/> adds a newline to the message contents; <message-end/> (inserted
        by feed() before parsing) ends the message.
        """
        if self._state != _STATE_PARSING_CONTENTS:
            return
        if tag == 'br':
            self._add_message_contents('\n')
        elif tag == 'message-end':
            self._update_state(_STATE_SEEKING_NEXT_MESSAGE)

    def handle_data(self, data):
        """Record a run of text under whatever element is being parsed."""
        state = self._state
        if state == _STATE_PARSING_TITLE:
            self._add_title(data)
        elif state == _STATE_PARSING_TIMESTAMP:
            self._add_timestamp(data)
        elif state == _STATE_PARSING_DISPLAY_NAME:
            self._add_display_name(data)
        elif state == _STATE_PARSING_CONTENTS:
            # Whitespace-only runs inside a message are dropped.
            if not data.strip():
                return
            # Only byte strings need decoding. Text that is already decoded
            # (unicode on Python 2, str on Python 3) is used as is; calling
            # .decode() on it would raise.
            if isinstance(data, bytes):
                data = data.decode('utf8')
            self._add_message_contents(data)

    def handle_entityref(self, name):
        """Record a decoded named entity reference such as &amp;."""
        self._add_decoded_reference(_decode_html_entity_ref(name))

    def handle_charref(self, name):
        """Record a decoded numeric character reference such as &#65;."""
        self._add_decoded_reference(_decode_html_char_ref(name))

    def _add_decoded_reference(self, decoded):
        """Append decoded reference text to the contents or display name.

        References seen in any other state are ignored.
        """
        if self._state == _STATE_PARSING_CONTENTS:
            self._add_message_contents(decoded)
        elif self._state == _STATE_PARSING_DISPLAY_NAME:
            self._add_display_name(decoded)

    def _add_title(self, title):
        self._results.append((RESULT_TYPE_TITLE, title))

    def _add_message_start(self, message_type):
        self._results.append((RESULT_TYPE_MESSAGE_START, message_type))

    def _add_timestamp(self, timestamp):
        self._results.append((RESULT_TYPE_TIMESTAMP, timestamp))

    def _add_display_name(self, display_name):
        self._append_or_coalesce_result(RESULT_TYPE_DISPLAY_NAME, display_name)

    def _add_message_contents(self, message_contents):
        self._append_or_coalesce_result(RESULT_TYPE_MESSAGE_CONTENTS,
                                        message_contents)

    def _append_or_coalesce_result(self, result_type, result_value):
        """Append a result, merging it into the previous one if same type.

        Merging means a value that arrives in several pieces (text runs,
        entity references, newlines) ends up as a single result.
        """
        if self._results and self._results[-1][0] == result_type:
            _, previous_value = self._results.pop()
            result_value = previous_value + result_value
        self._results.append((result_type, result_value))

    def _update_state(self, new_state):
        self._state = new_state
def _annotate_html(html):
    """Replace each line-terminating <br/> with a <message-end/> marker.

    Windows line endings are converted to '\\n' first. A <br/> that is not
    immediately followed by a newline is left untouched.
    """
    # <br> can also appear inside a message, so without a dedicated marker on
    # the line-terminating one it would be ambiguous where a message ends.
    unix_newlines = html.replace('\r\n', '\n')
    return '<message-end/>\n'.join(unix_newlines.split('<br/>\n'))
def _decode_html_entity_ref(entity_ref):
    """Decode a named HTML entity reference.

    Args:
        entity_ref: Entity name without the leading '&' and trailing ';'
            (for example 'amp').

    Returns:
        The text the reference decodes to, as produced by the standard
        library's unescape routine.
    """
    reference = '&' + entity_ref + ';'
    if hasattr(HTMLParser, 'unescape'):
        # Interpreters that still ship the legacy method: same call as before.
        return HTMLParser().unescape(reference)
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # documented replacement.
    import html
    return html.unescape(reference)
def _decode_html_char_ref(entity_ref):
    """Decode a numeric HTML character reference.

    Args:
        entity_ref: The reference without the leading '&#' and trailing ';'
            (for example '65' or 'x41').

    Returns:
        The text the reference decodes to, as produced by the standard
        library's unescape routine.
    """
    reference = '&#' + entity_ref + ';'
    if hasattr(HTMLParser, 'unescape'):
        # Interpreters that still ship the legacy method: same call as before.
        return HTMLParser().unescape(reference)
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is the
    # documented replacement.
    import html
    return html.unescape(reference)