In [4]:
import re
import requests
import pandas as pd
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from tkinter import Tk

In [5]:
r = Tk()
text = r.clipboard_get()
r.withdraw()
r.update()
r.destroy()

In [6]:
s = BeautifulSoup(text, "lxml")
links = s.select('td > a')
urls = (l.get('href') for l in links)
names = (n.text for n in links)
named_urls = zip(names, urls)

In [8]:
def get_feed_links(res):
    if (res):
        pattern = re.compile(r"(rss)|(atom)|(rdf)")
        try:
            print('checking for feeds')
            resultset = BeautifulSoup(res.content, "lxml").findAll("link", attrs={"type": pattern})
            rss_links = [urljoin(res.url, link.get("href")) for link in resultset]
            print('%d feeds' % (len(rss_links)))
            return rss_links

        except Exception as e:
            print(e)
    else:
        return []
    
        
def get_response(url):

    print("Getting: %s" % (url))
    try:
        response = requests.get(url, timeout = 3)
        response.url = url
        if (response.ok):
            return response
        else:
            response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print (e)
        return
    
    
def get_feeds(urls):
    names = (url[0] for url in urls)
    domains = (url[1] for url in urls)
    results = (get_response(url[1]) for url in urls)
    links = (filter(None, get_feed_links(res)) for res in results)
    zipped = zip(names, domains, links)
    
    result = [{'name':n, 'domain':d, 'feeds':list(f)} for n, d, f in zipped]
    return result
    

%time links = get_feeds(named_urls)

Getting: http://www.romandie.com
checking for feeds
0 feeds
Getting: http://www.spiegel.de
checking for feeds
2 feeds
Getting: http://www.ft.com
checking for feeds
1 feeds
Getting: http://www.smh.com.au
checking for feeds
1 feeds
Getting: http://www.heilpraxisnet.de
checking for feeds
2 feeds
CPU times: user 1.24 s, sys: 28 ms, total: 1.27 s
Wall time: 2.25 s


In [12]:
df = pd.DataFrame(links)
df

Unnamed: 0,domain,feeds,name
0,http://www.abola.pt,[],香港新浪網
1,http://www.ilsole24ore.com,"[http://www.spiegel.de/schlagzeilen/index.rss,...",Sport.cz
2,http://www.telegraaf.nl,[http://www.ft.com/rss/home/uk],Irish Independent
3,http://news.google.com,[http://feeds.smh.com.au/rssheadlines/top.xml],The Hindu
4,http://www.independent.ie,"[http://www.heilpraxisnet.de/feed, http://www....",El País.com (España)


In [16]:
f = df.apply(lambda x: pd.Series(x['feeds']),axis=1).stack().reset_index(level=1, drop=True)
f.name = 'feed'

In [24]:
df = df.drop('feeds', axis=1).join(f)

ValueError: labels ['feeds'] not contained in axis

In [26]:
df = df.dropna()

Unnamed: 0,domain,name,feed
1,http://www.ilsole24ore.com,Sport.cz,http://www.spiegel.de/schlagzeilen/index.rss
1,http://www.ilsole24ore.com,Sport.cz,http://www.spiegel.de/index.rss
2,http://www.telegraaf.nl,Irish Independent,http://www.ft.com/rss/home/uk
3,http://news.google.com,The Hindu,http://feeds.smh.com.au/rssheadlines/top.xml
4,http://www.independent.ie,El País.com (España),http://www.heilpraxisnet.de/feed
4,http://www.independent.ie,El País.com (España),http://www.heilpraxisnet.de/comments/feed


In [23]:
df.to_csv('feeds.csv', sep="\t", index=False)