In [37]:
import requests
from requests_futures.sessions import FuturesSession
import re
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urldefrag, urlparse, urljoin, urlsplit, urlunsplit
# Asynchronous HTTP session from requests_futures.
# NOTE(review): no visible cell uses `session` (the helpers below call
# `requests.get` directly) -- confirm whether it is still needed.
session = FuturesSession()

In [74]:
def get_response(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without it a host that never answers would block the crawl forever.

    Returns
    -------
    requests.Response or None
        The response when the status is not an error status; ``None`` when
        the request raised or the server answered with a 4xx/5xx status.
        The failure is printed together with the URL rather than raised.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # Turn 4xx/5xx answers into an exception so that every kind of
        # failure is reported by the single handler below.
        res.raise_for_status()
    except Exception as e:
        # Deliberately broad: one bad site must not abort a whole crawl.
        print(url, e)
        return None
    print("Status %s" % (res.status_code))
    return res


def parse_content(res_object):
    """Parse the body of a response into a trimmed BeautifulSoup tree.

    Only ``<a>``, ``<link>``, ``<meta>`` and ``<title>`` elements are kept,
    using the "lxml" parser.

    Returns ``False`` when ``res_object`` is falsy, otherwise the
    ``BeautifulSoup`` object built from ``res_object.content``.
    """
    if not res_object:
        return False
    wanted_tags = SoupStrainer(["a", "link", "meta", "title"])
    return BeautifulSoup(res_object.content, "lxml", parse_only=wanted_tags)

def get_urls(parsed_html, element=""):
    """Collect the ``href`` of every ``element`` tag that looks like a feed.

    A tag is kept when the feed pattern ("rss", a string ending in
    ".xml" / "/xml" / "+xml", or "xml;...") is found in the tag's serialized
    markup or in its ``href`` / ``type`` attribute value, and the markup does
    not contain "/comments/". All matching is case-insensitive.

    Parameters
    ----------
    parsed_html : BeautifulSoup or falsy
        Parsed document; any falsy value yields an empty list.
    element : str, optional
        Tag name passed to ``find_all`` (e.g. "link" or "a").

    Returns
    -------
    list of str
        ``href`` values in document order. Tags without an ``href`` are
        skipped.
    """
    if not parsed_html:
        return []
    # Raw strings: "\+" and "\/" are invalid escapes in ordinary literals.
    feed_pattern = re.compile(r"(rss)|([./\+]xml$)|(xml;.*)", flags=re.I)
    reject_pattern = re.compile(r"(\/comments\/)", flags=re.I)

    links = []
    for tag in parsed_html.find_all(element):
        markup = str(tag)
        if reject_pattern.search(markup):
            continue
        href = tag.get("href")
        # Serialized markup ends in ">" so the "$"-anchored alternative can
        # only ever match on an attribute value; test those as well.
        candidates = [markup]
        candidates.extend(
            value for value in (href, tag.get("type")) if isinstance(value, str)
        )
        if not any(feed_pattern.search(text) for text in candidates):
            continue
        if href:
            links.append(href)
    return links

def parse_meta_attrs(parsed_html):
    """Extract social/SEO ``<meta>`` values and the page title.

    A ``<meta>`` tag is considered when its serialized markup matches
    "twitter:", "fb:", "description", "og:" or "keywords"
    (case-insensitive) and it has a ``content`` attribute. Its content is
    stored under the tag's ``property`` and/or ``name`` attribute value.

    Parameters
    ----------
    parsed_html : BeautifulSoup or falsy
        Parsed document.

    Returns
    -------
    dict or None
        ``None`` when ``parsed_html`` is falsy. Otherwise a dict of the
        collected values plus a "title" entry, which is ``None`` when the
        document has no ``<title>``. A single malformed tag no longer
        discards the whole result.
    """
    if not parsed_html:
        return None
    pattern = re.compile(u"twitter:|fb:|description|og:|keywords", flags=re.I)

    property_metas = {}
    named_metas = {}
    for meta in parsed_html.find_all("meta"):
        attrs = meta.attrs
        # A tag without content has nothing to record; skipping it here
        # replaces the KeyError that used to abort the whole extraction.
        if "content" not in attrs or not pattern.search(str(meta)):
            continue
        if "property" in attrs:
            property_metas[attrs["property"]] = attrs["content"]
        if "name" in attrs:
            named_metas[attrs["name"]] = attrs["content"]

    # Name-keyed entries take precedence over property-keyed ones on a clash.
    metas = dict(property_metas)
    metas.update(named_metas)

    title_tag = parsed_html.find("title")
    metas["title"] = title_tag.string if title_tag is not None else None
    return metas

    
def check_feed_response(url, timeout=10):
    """Request ``url`` and report what kind of document answers.

    Parameters
    ----------
    url : str
        Address to request; a falsy value short-circuits to ``None``.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.

    Returns
    -------
    str, int or None
        The ``content-type`` header when the request succeeds. The HTTP
        status code when a response was received but it was an error status
        or lacked that header. ``None`` when ``url`` is falsy or no response
        was obtained at all (connection error, timeout, ...).
    """
    if not url:
        return None
    # Bound before the try block so the handler can tell "no response" apart
    # from "bad response" instead of failing on an unbound name.
    res = None
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        return res.headers["content-type"]
    except Exception as e:
        # Deliberately broad: one bad URL must not abort a whole column apply.
        print(e)
        return res.status_code if res is not None else None
    
def get_site_info(url):
    """Download ``url`` and summarise the feeds and metadata it advertises.

    Parameters
    ----------
    url : str
        Page to inspect; also the base for resolving relative links.

    Returns
    -------
    dict
        "url": the input URL.
        "rss_links": sorted absolute URLs taken from feed-like ``<link>`` tags.
        "rss_leads": sorted absolute URLs taken from feed-like ``<a>`` tags,
        excluding anything already in "rss_links".
        "meta": the result of ``parse_meta_attrs`` for the page.
    """
    site_info = {"url": url}
    print("%s: \nGetting response..." % (url))
    res = get_response(url)
    print("Parsing content...")
    html = parse_content(res)

    def resolve(hrefs):
        # Empty hrefs are dropped because urljoin would turn them into the
        # bare site URL. De-duplicating after resolution also merges
        # different relative spellings of the same address.
        return {urljoin(url, href, allow_fragments=False) for href in hrefs if href}

    rss_links = resolve(get_urls(html, "link"))
    rss_leads = resolve(get_urls(html, "a")) - rss_links
    # Sorted so repeated runs produce the same ordering.
    site_info["rss_links"] = sorted(rss_links)
    site_info["rss_leads"] = sorted(rss_leads)
    print("found %d RSS feed(s) and %d link(s) to possible feeds" %
          (len(site_info["rss_links"]), len(site_info["rss_leads"])))
    site_info["meta"] = parse_meta_attrs(html)

    return site_info

In [75]:
def get_domain_link(url):
    """Reduce ``url`` to its scheme and network location.

    Path, query string and fragment are discarded, so
    "https://host/a?b#c" becomes "https://host".
    """
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, "", "", ""))

# Seed list: every external site linked from the Google News UK front page,
# reduced to scheme + host and de-duplicated.
res = requests.get(
    "https://news.google.com/news?cf=all&pz=1&ned=uk&siidp=d99c9ccef53edd7975a8b314decb9a979877&ict=ln",
    timeout=10,  # do not hang the notebook on a stalled connection
)
s = BeautifulSoup(res.content, "lxml")
# str() turns a missing href (None) into "None", which fails the http test.
hrefs = (str(a.get("href")) for a in s.find_all("a"))
links = [
    href for href in hrefs
    if href.startswith("http") and not re.search(r"google\.com", href)
]
links = pd.Series([get_domain_link(link) for link in links])
links = links.drop_duplicates()
# Compare with a str: the previous bytes literal b'' never equals a str, so
# that filter removed nothing.
links = links.loc[links != ""]

In [33]:
# Persist the seed domains as a tab-separated, UTF-8 encoded file.
links.to_csv("glinks.csv", sep="\t", encoding="UTF-8")

In [111]:
# Lazy generator: no site is fetched until it is consumed, and it can be
# consumed only once.
dict_list = (get_site_info(url) for url in links)

In [112]:
# Consuming the generator runs get_site_info for every seed domain; %time
# reports how long the whole crawl takes.
%time dicts = list(dict_list)

https://www.youtube.com: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 0 link(s) to possible feeds
http://www.mirror.co.uk: 
Getting response...
Status 200
Parsing content...
found 1 RSS feed(s) and 2 link(s) to possible feeds
http://www.newstatesman.com: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 1 link(s) to possible feeds
https://www.theguardian.com: 
Getting response...
Status 200
Parsing content...
found 1 RSS feed(s) and 0 link(s) to possible feeds
http://www.walesonline.co.uk: 
Getting response...
Status 200
Parsing content...
found 1 RSS feed(s) and 4 link(s) to possible feeds
http://www.dailymail.co.uk: 
Getting response...
Status 200
Parsing content...
found 2 RSS feed(s) and 1 link(s) to possible feeds
http://www.bbc.co.uk: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 0 link(s) to possible feeds
http://www.ft.com: 
Getting response...
Status 200
Parsing content...
found 1 RSS feed(s) a

In [126]:
from itertools import chain
# Keep only the non-empty per-site feed lists. Built as a list rather than a
# one-shot filter/map iterator so cells that read it can be re-run without
# silently getting an empty result.
rss_links = [site["rss_links"] for site in dicts if site["rss_links"]]

In [127]:
# Flatten the per-site lists into a single flat list of feed URLs.
list_of_rss_links = list(chain.from_iterable(rss_links))

In [None]:
# NOTE(review): `urls` is not defined in any visible cell; presumably a
# DataFrame with a "url" column -- confirm before running on a fresh kernel.
df = urls[10:20].copy()
# Fetch each page, then parse it (one HTTP request per row).
df["content"] = df["url"].apply(get_response).apply(parse_content)
# Feed-like hrefs from <link> tags and from <a> tags, materialized as lists.
df["links"] = df["content"].apply(get_urls, element = "link").apply(list)
df["leads"] = df["content"].apply(get_urls, element = "a").apply(list)

In [None]:
# Turn the list-valued "links" and then "leads" columns into one row per
# element: expand each list into columns, stack them into a long Series
# aligned on the original index, and join that back in place of the column.
for column in ('links', 'leads'):
    f = (
        df.apply(lambda row: pd.Series(row[column]), axis=1)
        .stack()
        .reset_index(level=1, drop=True)
    )
    f.name = column
    df = df.drop(column, axis=1).join(f)

In [None]:
# Remove repeated rows and replace missing values with empty strings, then
# strip trailing slashes from every URL column.
df = df.drop_duplicates().fillna("")
for column in ("url", "links", "leads"):
    df[column] = df[column].str.rstrip("/")

In [None]:
# Resolve each link/lead against its row's site URL so relative hrefs become
# absolute. An empty string resolves to the site URL itself.
df["links"] = df.apply(lambda x: urljoin(x["url"], str(x["links"])), axis=1)
df["leads"] = df.apply(lambda x: urljoin(x["url"], str(x["leads"])), axis=1)

In [None]:
# Record what check_feed_response reports for every resolved link; this
# issues one HTTP request per row.
df["link_type"] = df["links"].apply(check_feed_response) 

In [None]:
# Save the table as a tab-separated, UTF-8 encoded file.
df.to_csv("test.csv", sep="\t", encoding="UTF-8")

In [None]:
# Boolean flag: True where the resolved link and the resolved lead differ.
df["test"] = df["links"] != df["leads"]

In [None]:
# NOTE(review): `parse_opengraph_attrs` is not defined in any visible cell, so
# this raises NameError on a fresh kernel unless it is defined elsewhere.
# `parse_meta_attrs` looks like the closest existing helper -- confirm.
df["og"] = df["content"].apply(parse_opengraph_attrs)