author.py

"""
Parsing Authors from html meta-tags and strings
This module was adapted from newspaper: http://github.com/codelucas/newspaper
"""
from bs4 import BeautifulSoup
from newslynx.lib.common import make_soup
from newslynx.lib import html
from newslynx.lib.regex import (
re_by, re_name_token, re_digits,
re_initial, re_prefix_suffix
)
MIN_NAME_TOKENS = 2 # how short can a name be?
MAX_NAME_TOKENS = 3 # how long can a name be?
DELIM = ['and', '|', '&', '']
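# note: the empty string is treated as a delimiter because tokenizing a
# byline on punctuation (which re_name_token appears to do, following the
# upstream newspaper code) leaves empty tokens at commas and pipes.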

PESSIMISTIC_TAGS = ['meta']
OPTIMISTIC_TAGS = [
    'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'span', 'p', 'meta', 'div'
]

TAG_ATTRS = [
    'name', 'rel', 'itemprop', 'class', 'id', 'property'
]

TAG_VALS = [
    'author', 'byline', 'byl', 'byline-author', 'post-byline',
    'parsely-author', 'storybyline'
]

# tokens indicative of non-authors (usually photographers)
BAD_TOKENS = [
    'getty', 'images', 'photo', 'january', 'february', 'march',
    'april', 'may', 'june', 'july', 'august', 'september', 'october',
    'november', 'december'
]


def extract(
        soup,
        tags=PESSIMISTIC_TAGS,
        attrs=TAG_ATTRS,
        vals=TAG_VALS):
    """
    Extract author attributes from meta tags.
    Only works for English articles.
    """
    # soupify the input if it's a raw html string
    if not isinstance(soup, BeautifulSoup):
        soup = make_soup(soup)

    # search popular author tags for authors
    matches = []
    _authors = []
    for tag in tags:
        for attr in attrs:
            for val in vals:
                found = soup.find_all(tag, {attr: val})
                matches.extend(found)

    for match in matches:
        content = u''
        m = match.attrs.get('content', None)
        if m:
            content = m
        else:
            # any other tag: fall back on its text content
            content = match.text or u''
        if len(content) > 0:
            _authors.extend(parse(content))

    return _format(_authors)
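
# Minimal usage sketch (not part of the original module; the sample html
# and names are hypothetical):
#
#   raw = '<meta name="author" content="Jane Doe and John Smithson">'
#   extract(raw)
#   # -> ['JANE DOE', 'JOHN SMITHSON']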


def parse(search_str):
    """
    Takes a candidate string and extracts the name(s) in list form.
    >>> string = 'By: Brian Abelson, Michael H. Keller and Dr. Stijn Debrouwere IV'
    >>> parse(string)
    ['BRIAN ABELSON', 'MICHAEL H KELLER', 'DR STIJN DEBROUWERE IV']
    """
    # set initial counter
    initial_count = 0

    # clean string
    search_str = html.strip_tags(search_str)
    search_str = re_by.sub('', search_str)
    search_str = search_str.strip()

    # tokenize
    name_tokens = [s.strip() for s in re_name_token.split(search_str)]

    _authors = []
    curname = []  # list of first/last name tokens
    for token in name_tokens:
        # check if the length of the name
        # and the token suggest an initial
        if _is_initial(curname, token):
            # upper-case the initial & increment
            token = token.upper()
            initial_count += 1

        # if we're at a delimiter, check if the name is complete
        if token.lower() in DELIM:
            # check for a valid name based on the initial count
            if _end_name(curname, initial_count):
                name = ' '.join(curname)
                if not any([t in name.lower() for t in BAD_TOKENS]):
                    _authors.append(name)
            # reset
            initial_count = 0
            curname = []

        # otherwise, append the token
        elif not re_digits.search(token):
            curname.append(token)

    # one last check at the end of the string
    valid_name = (len(curname) >= MIN_NAME_TOKENS)
    if valid_name:
        name = ' '.join(curname)
        if not any([t in name.lower() for t in BAD_TOKENS]):
            _authors.append(name)

    return _format(_authors)
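
# Worked example of the loop above (the exact token stream is an assumption
# based on newspaper's split-on-punctuation behavior):
#   'Brian', 'Abelson', ''             <- the '' from the comma ends 'Brian Abelson'
#   'Michael', 'H.', 'Keller', 'and'   <- 'and' ends 'Michael H. Keller'
#   'Dr.', 'Stijn', 'Debrouwere', 'IV' <- end of string closes the final name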


# format parsed authors
def _format(authors):
    """
    Final formatting / deduping steps for parsed authors.
    """
    _authors = []
    uniq = list(set([a.lower().replace('.', '')
                     for a in authors if a != '']))
    seen = []
    for name in sorted(uniq, key=len):
        # dedupe multiple html tags with the same author info
        if not any([n in name for n in seen]):
            seen.append(name)
            _authors.append(name.upper())
    return _authors


def _match_initial(token):
    """
    Check if a token looks like an initial / prefix / suffix.
    """
    return re_initial.match(token) or re_prefix_suffix.match(token)


def _valid_initial(curname):
    """
    Only include an initial if we haven't passed
    the max name-token range.
    """
    return (len(curname) < MAX_NAME_TOKENS + 1)


def _is_initial(curname, token):
    """
    Combination of the above two functions.
    """
    return _valid_initial(curname) and _match_initial(token)


def _end_name(curname, initial_count):
    """
    Check whether we should end the name.
    """
    est_count = MAX_NAME_TOKENS + initial_count
    return (len(curname) <= est_count)
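

# A small, hypothetical smoke test (not part of the original module):
# running `python author.py` tries the parser on a sample byline.
if __name__ == '__main__':
    byline = 'By Jane Doe and John Smithson | Photo by Getty Images'
    print(parse(byline))
    # expected: ['JANE DOE', 'JOHN SMITHSON'] -- the photo credit is dropped
    # because 'photo', 'getty' and 'images' are BAD_TOKENS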