Showing 3 changed files with 69 additions and 3 deletions.
@@ -1 +1 @@
-from .feed_seeker import FeedSeeker, generate_feed_urls, find_feed_url  # noqa
+from .feed_seeker import FeedSeeker, generate_feed_urls, find_feed_url, find_feedly_feeds  # noqa
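With this change the new helper is re-exported from the package root, not just the submodule. A minimal sketch of what that enables, assuming the installed package is importable as feed_seeker (the package name itself is not shown in this diff):

# Package name "feed_seeker" is an assumption; after this commit both
# imports should resolve to the same function.
from feed_seeker import find_feedly_feeds
from feed_seeker.feed_seeker import find_feedly_feeds as _same

assert find_feedly_feeds is _same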
@@ -5,13 +5,15 @@
from contextlib import contextmanager
import signal
from urllib.parse import urljoin, urlparse, urlunparse

from typing import Iterable
from bs4 import BeautifulSoup
import requests
import sys
from requests.adapters import HTTPAdapter
from requests.exceptions import InvalidSchema, RetryError
from urllib3.util.retry import Retry

import publicsuffix
import time


@contextmanager
def timeout(seconds=None):

@@ -244,6 +246,8 @@ def _generate_feed_urls(self, spider=0, seen=None, max_links=None):

        if spider > 0:
            for internal_link in self.find_internal_links():
                print("Internal Link: {}".format(internal_link))
                # sys.exit()
                spider_seeker = FeedSeeker(internal_link, html=None, fetcher=self.fetcher)
                kwargs = {
                    'spider': spider - 1,

@@ -366,6 +370,48 @@ def guess_feed_links(self):
        for suffix in suffixes:
            yield urljoin(base=self.clean_url(), url=suffix)

    def find_feedly_feeds(self,
                          max_links: int = None,
                          throttle: int = 5):
        """Instance-level implementation behind the module-level
        find_feedly_feeds function below; see that function's docstring
        for details on how to use it.
        """
        search_url = "https://cloud.feedly.com/v3/search/feeds"

        # Fetch the current public suffix list and determine the root
        # domain of the url
        psl = publicsuffix.fetch()
        ps = publicsuffix.PublicSuffixList(psl)
        self.uri_root_domain = ps.get_public_suffix(self.url)
        self.uri_hostname = urlparse(self.url).hostname
        self.uri_domain_only = self.uri_root_domain.split('.', 1)[0]

        found_hostnames = set()   # Hostnames found during the search
        checked_queries = set()   # Queries / urls already checked
        found_feeds = set()       # Feed urls already yielded
        queries = [self.uri_hostname, self.uri_root_domain, self.uri_domain_only]
        for query in queries:
            if query in checked_queries:
                continue
            params = {'query': query, 'count': 500}
            response = requests.get(search_url, params=params)
            if response.status_code == 200:
                checked_queries.add(query)
                feeds = response.json()
                for feed in feeds['results']:
                    feed_url = feed['feedId'][5:]  # Strip the leading "feed/"
                    hostname = urlparse(feed_url).hostname
                    if hostname.endswith(self.uri_hostname):
                        if hostname not in found_hostnames:
                            # Queue newly seen hostnames as further queries
                            # to surface more results for the main site
                            queries.append(hostname)
                            found_hostnames.add(hostname)
                        if feed_url not in found_feeds:
                            found_feeds.add(feed_url)
                            yield feed_url
            time.sleep(throttle)  # Throttle requests
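The method leans on two external pieces: the publicsuffix package to derive the query variants, and Feedly's public search endpoint. A standalone sketch of both, not part of the commit, using a made-up hostname (the "results" and "feedId" fields match Feedly's documented search API, and feedId values carry a "feed/" prefix, which is why the code above strips five characters):

# Standalone sketch with an invented hostname; assumes the
# fetch()/PublicSuffixList API of the publicsuffix package the commit uses.
from urllib.parse import urlparse
import publicsuffix
import requests

hostname = urlparse("https://www.example-news.com/politics").hostname
psl = publicsuffix.PublicSuffixList(publicsuffix.fetch())
root_domain = psl.get_public_suffix(hostname)   # "example-news.com"
domain_only = root_domain.split('.', 1)[0]      # "example-news"

response = requests.get("https://cloud.feedly.com/v3/search/feeds",
                        params={'query': domain_only, 'count': 5})
response.raise_for_status()
for result in response.json()['results']:
    print(result['feedId'][5:])  # drop the leading "feed/"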
def find_feed_url(url, html=None, spider=0, max_time=None, max_links=None):
    """Find the single most likely feed url for a page.

@@ -423,3 +469,18 @@ def generate_feed_urls(url, html=None, spider=0, max_time=None, max_links=None):
    with timeout(max_time):
        for feed in FeedSeeker(url, html).generate_feed_urls(spider=spider, max_links=max_links):
            yield feed
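The context above shows the pre-existing generate_feed_urls wrapper running the generator inside the timeout() context manager defined earlier in the file. A hedged usage sketch (the url is hypothetical; max_time bounds the whole search):

# Hypothetical usage of the pre-existing wrapper; import path assumed.
from feed_seeker import generate_feed_urls

for feed_url in generate_feed_urls("https://example.com", spider=1, max_time=30):
    print(feed_url)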

def find_feedly_feeds(url: str,
                      max_links: int = None,
                      throttle: int = 5) -> Iterable[str]:
    """Use Feedly to discover feeds.

    There are a few gotchas here. Sometimes searching with the top-level
    domain attached doesn't yield as many results (e.g. washingtonpost.com),
    while searching by just the domain (e.g. washingtonpost) turns up many
    more. Also, an API key is not required for this endpoint, but it
    occasionally returns a 403 response, which may come from an internal,
    undocumented throttle or from other issues. The default delay between
    requests is 5 seconds and can be set with the throttle parameter.
    """
    for feed in FeedSeeker(url).find_feedly_feeds(max_links=max_links, throttle=throttle):
        yield feed
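A hedged usage sketch of the new helper (the feed_seeker import path is an assumption, as above); islice caps the number of results, since the helper is a generator and a single Feedly query can return up to 500 feeds:

# Hypothetical usage; package import path assumed.
from itertools import islice
from feed_seeker import find_feedly_feeds

# Take at most five feeds, pausing 5 seconds between Feedly requests.
for feed_url in islice(find_feedly_feeds("https://www.washingtonpost.com", throttle=5), 5):
    print(feed_url)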
@@ -1,3 +1,8 @@
 beautifulsoup4>=4.6.0
 lxml>=4.1.1
 requests>=2.18.4
+publicsuffix>=1.1.0
+urllib3>=1.24.1
+pytest==4.5.0
+responses>=0.10.6
+typing>=3.6.6
Review comment: is this going to pull down http://blogspot.com/feeds (or whatever) for http://halroberts.blogspot.com?
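A quick check of the string semantics behind the hostname.endswith(self.uri_hostname) filter the question points at, with hypothetical hostnames:

# endswith() is a plain suffix test with no dot-boundary check.
uri_hostname = "halroberts.blogspot.com"

print("blogspot.com".endswith(uri_hostname))                   # False: parent domain is filtered out
print("feeds.halroberts.blogspot.com".endswith(uri_hostname))  # True: subdomains pass
print("evilhalroberts.blogspot.com".endswith(uri_hostname))    # True: unrelated host slips through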