Skip to content

Commit

Permalink
Merge cdda703 into a80db5c
Browse files Browse the repository at this point in the history
  • Loading branch information
jmhobbs committed Aug 29, 2015
2 parents a80db5c + cdda703 commit fe93f12
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 9 deletions.
27 changes: 20 additions & 7 deletions lassie/core.py
Expand Up @@ -40,7 +40,8 @@ def merge_settings(fetch_setting, class_setting):
class Lassie(object):
__attrs__ = [
'open_graph', 'twitter_card', 'touch_icon', 'favicon',
'all_images', 'parser', '_retrieve_content', 'client'
'canonical', 'all_images', 'parser', '_retrieve_content',
'client'
]

def __init__(self):
Expand All @@ -49,6 +50,7 @@ def __init__(self):
self.twitter_card = True
self.touch_icon = True
self.favicon = True
self.canonical = False
self.all_images = False
self.parser = 'html5lib'
self.handle_file_content = False
Expand All @@ -72,7 +74,8 @@ def __repr__(self):
return '<Lassie [parser: %s]>' % (self.parser)

def fetch(self, url, open_graph=None, twitter_card=None, touch_icon=None,
favicon=None, all_images=None, parser=None, handle_file_content=None):
favicon=None, all_images=None, parser=None, handle_file_content=None,
canonical=None):
"""Retrieves content from the specified url, parses it, and returns
a beautifully crafted dictionary of important information about that
web page.
Expand All @@ -91,6 +94,8 @@ def fetch(self, url, open_graph=None, twitter_card=None, touch_icon=None,
:type touch_icon: bool
:param favicon: (optional) If ``True``, retrieves any favicon images and includes them in the response ``images`` array
:type favicon: bool
:param canonical: (optional) If ``True``, retrieves the canonical url from the page's ``<link rel="canonical">`` tag and includes it in the response as ``url``. Default: False
:type canonical: bool
:param all_images: (optional) If ``True``, retrieves images inside web pages body and includes them in the response ``images`` array. Default: False
:type all_images: bool
:param parser: (optional) String reference for the parser that BeautifulSoup will use
Expand All @@ -105,6 +110,7 @@ def fetch(self, url, open_graph=None, twitter_card=None, touch_icon=None,
twitter_card = merge_settings(twitter_card, self.twitter_card)
touch_icon = merge_settings(touch_icon, self.touch_icon)
favicon = merge_settings(favicon, self.favicon)
canonical = merge_settings(canonical, self.canonical)
all_images = merge_settings(all_images, self.all_images)
parser = merge_settings(parser, self.parser)
handle_file_content = merge_settings(handle_file_content, self.handle_file_content)
Expand Down Expand Up @@ -152,6 +158,9 @@ def fetch(self, url, open_graph=None, twitter_card=None, touch_icon=None,
if favicon:
self._filter_link_tag_data('favicon', soup, data, url)

if canonical:
self._filter_link_tag_data('canonical', soup, data, url)

if all_images:
# Maybe filter out 1x1, no "good" way to do this if image doesn't supply width/height
self._find_all_images(soup, data, url)
Expand Down Expand Up @@ -275,11 +284,15 @@ def _filter_link_tag_data(self, source, soup, data, url):

html = soup.find_all('link', {link['key']: link['pattern']})

for line in html:
data['images'].append({
'src': urljoin(url, line.get('href')),
'type': link['type'],
})
if link['type'] == 'url':
for line in html:
data['url'] = line.get('href')
else:
for line in html:
data['images'].append({
'src': urljoin(url, line.get('href')),
'type': link['type'],
})

def _find_all_images(self, soup, data, url):
"""This method finds all images in the web page content
Expand Down
5 changes: 5 additions & 0 deletions lassie/filters/generic.py
Expand Up @@ -32,5 +32,10 @@
'key': 'rel',
'type': str('favicon'),
},
'canonical': {
'pattern': 'canonical',
'key': 'rel',
'type': 'url'
}
},
}
4 changes: 3 additions & 1 deletion tests/templates/generic/all_properties.html
Expand Up @@ -7,8 +7,10 @@
<meta name="description" content="Just a random description of a web page." />
<meta name="keywords" content="one, two, three, four, five" />
<meta name="title" content="Lassie Generic Test | all_properties" />

<link rel="canonical" href="http://example.com/canonical/path" />
</head>
<body>

</body>
</html>
</html>
12 changes: 12 additions & 0 deletions tests/templates/generic/canonical.html
@@ -0,0 +1,12 @@
<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Lassie Generic Test | Canonical</title>

<link rel="canonical" href="http://example.com/canonical/path" />
</head>
<body>

</body>
</html>
9 changes: 8 additions & 1 deletion tests/test_generic.py
Expand Up @@ -6,11 +6,12 @@
class LassieTwitterCardTestCase(LassieBaseTestCase):
def test_generic_all_properties(self):
url = 'http://lassie.it/generic/all_properties.html'
data = lassie.fetch(url)
data = lassie.fetch(url, canonical=True)

self.assertEqual(data['locale'], 'en_US')
self.assertEqual(data['title'], 'Lassie Generic Test | all_properties')
self.assertEqual(data['description'], 'Just a random description of a web page.')
self.assertEqual(data['url'], 'http://example.com/canonical/path')
self.assertEqual(len(data['keywords']), 5)

def test_generic_bad_locale(self):
Expand All @@ -33,3 +34,9 @@ def test_no_title(self):
data = lassie.fetch(url)

self.assertTrue(not 'title' in data)

def test_canonical(self):
    """Check that ``fetch(..., canonical=True)`` reports the expected
    canonical address under the ``url`` key of the returned data.
    """
    page_url = 'http://lassie.it/generic/canonical.html'
    expected_canonical = 'http://example.com/canonical/path'

    result = lassie.fetch(page_url, canonical=True)

    self.assertEqual(result['url'], expected_canonical)

0 comments on commit fe93f12

Please sign in to comment.