In [8]:
import requests
from requests_futures.sessions import FuturesSession
import re
import pandas as pd
from bs4 import BeautifulSoup, SoupStrainer
from urllib.parse import urldefrag, urlparse, urljoin, urlsplit, urlunsplit
# Asynchronous HTTP session from requests-futures; it is not referenced by
# any of the cells visible in this notebook.
session = FuturesSession()

# Tab-separated input; `names` supplies the single column label "url", so the
# first line of the file is read as data rather than as a header.
urls = pd.read_csv(
    "urls.csv",
    sep="\t",
    names=["url"],
)

In [2]:
def get_response(url, timeout=10):
    """Fetch ``url`` with HTTP GET and return the response, or ``None`` on failure.

    Parameters
    ----------
    url : str
        Address to request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        ``requests`` applies no timeout by default, so without this a single
        unresponsive host would block the whole crawl indefinitely.

    Returns
    -------
    requests.Response or None
        The response when its status is OK. ``None`` when the request could
        not be completed or the server answered with an error status; in that
        case the URL and the error are printed.
    """
    try:
        res = requests.get(url, timeout=timeout)
        # No-op for OK responses; raises for 4xx/5xx so that they are reported
        # through the same path as connection errors.
        res.raise_for_status()
    except Exception as e:
        # Best effort: report the failure and let the caller continue with None.
        print(url, e)
        return None
    print("Status %s" % (res.status_code))
    return res


def parse_content(res_object):
    """Parse the ``a``, ``link`` and ``meta`` tags of a response body.

    Returns a ``BeautifulSoup`` object built with the lxml parser, or
    ``False`` when ``res_object`` is falsy (for example the ``None`` that a
    failed fetch produces).
    """
    if not res_object:
        return False
    # Only these three tag types are inspected by the rest of the notebook,
    # so everything else is skipped while parsing.
    wanted_tags = SoupStrainer(["a", "link", "meta"])
    return BeautifulSoup(res_object.content, "lxml", parse_only=wanted_tags)

def get_urls(parsed_html, element=""):
    """Return the ``href`` of every ``element`` tag that looks like a feed link.

    A tag qualifies when its serialised markup matches the feed pattern
    ("rss", an ``xml`` suffix, or ``xml;...``), does not contain a
    ``/comments/`` path, and actually carries a non-empty ``href``.

    Parameters
    ----------
    parsed_html : BeautifulSoup-like or falsy
        Parsed document exposing ``find_all``; a falsy value means there is
        no document to search.
    element : str, optional
        Tag name to look for (the notebook passes ``"link"`` and ``"a"``).

    Returns
    -------
    generator of str
        Lazily produced ``href`` values when ``parsed_html`` is truthy.
    list
        An empty list when ``parsed_html`` is falsy.
    """
    if not parsed_html:
        return []

    # Raw strings: "\+" and "\/" are regex escapes, not Python string escapes,
    # and trigger invalid-escape warnings when written in ordinary literals.
    # NOTE(review): the "$"-anchored alternative is tested against the whole
    # tag markup, which normally ends in ">" - confirm whether it was meant to
    # be applied to the href value instead.
    feed_pattern = re.compile(r"(rss)|([./\+]xml$)|(xml;.*)", flags=re.I)
    reject_pattern = re.compile(r"(\/comments\/)", flags=re.I)

    return (
        tag.get("href")
        for tag in parsed_html.find_all(element)
        # Tags without an href would yield None, which urljoin() later turns
        # into the site's own URL and reports as a feed.
        if tag.get("href")
        and feed_pattern.search(str(tag))
        and not reject_pattern.search(str(tag))
    )

def parse_properties_attrs(parsed_html):
    """Collect ``<meta property="..." content="...">`` pairs from a parsed page.

    Parameters
    ----------
    parsed_html : BeautifulSoup-like or falsy
        Parsed document exposing ``find_all``; a falsy value means there is
        no document.

    Returns
    -------
    dict or None
        ``{property: content}`` for every meta tag carrying both attributes
        (a later duplicate property overwrites an earlier one), or ``None``
        when ``parsed_html`` is falsy.
    """
    if not parsed_html:
        return None

    metas = {}
    for meta in parsed_html.find_all("meta"):
        attrs = meta.attrs
        # Check the parsed attributes rather than searching the markup for
        # "property=": a tag that lacks "content" is skipped on its own
        # instead of raising KeyError and discarding every other result.
        if "property" in attrs and "content" in attrs:
            metas[attrs["property"]] = attrs["content"]
    return metas
    
def parse_named_attrs(parsed_html):
    """Collect ``<meta name="..." content="...">`` pairs from a parsed page.

    Parameters
    ----------
    parsed_html : BeautifulSoup-like or falsy
        Parsed document exposing ``find_all``; a falsy value means there is
        no document.

    Returns
    -------
    dict or None
        ``{name: content}`` for every meta tag carrying both attributes
        (a later duplicate name overwrites an earlier one), or ``None`` when
        ``parsed_html`` is falsy.
    """
    if not parsed_html:
        return None

    metas = {}
    for meta in parsed_html.find_all("meta"):
        attrs = meta.attrs
        # Check the parsed attributes rather than searching the markup for
        # "name=": a tag that lacks "content" is skipped on its own instead
        # of raising KeyError and discarding every other result.
        if "name" in attrs and "content" in attrs:
            metas[attrs["name"]] = attrs["content"]
    return metas
    
def check_feed_response(url, timeout=10):
    """Request ``url`` and report what kind of content it serves.

    Parameters
    ----------
    url : str or falsy
        Address to check; a falsy value is skipped without any request.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.

    Returns
    -------
    str
        The ``content-type`` header when the response status is OK.
    int
        The HTTP status code when the server answered with an error status.
    None
        When ``url`` is falsy, when the request could not be completed at
        all, or when an OK response carries no ``content-type`` header.
    """
    if not url:
        return None

    # Bound before the try block so the handler can tell "request never
    # completed" apart from "server answered with an error status".
    res = None
    try:
        res = requests.get(url, timeout=timeout)
        print(res)
        res.raise_for_status()
        return res.headers.get("content-type")
    except Exception as e:
        print(e)
        return res.status_code if res is not None else None
    
def _absolute_unique(base, hrefs):
    """Resolve ``hrefs`` against ``base`` and drop duplicates, keeping first-seen order."""
    seen = set()
    resolved = []
    for href in hrefs:
        if not href:
            # urljoin(base, None) returns base itself, which would list the
            # site's own address as a feed.
            continue
        link = urljoin(base, href, allow_fragments=False)
        if link not in seen:
            seen.add(link)
            resolved.append(link)
    return resolved


def get_site_info(url):
    """Fetch ``url`` and summarise its feed links and meta tags.

    Parameters
    ----------
    url : str
        Address of the site to inspect.

    Returns
    -------
    dict
        ``url``: the address as given.
        ``rss_links``: absolute URLs taken from matching ``<link>`` tags.
        ``rss_leads``: absolute URLs taken from matching ``<a>`` tags that
        are not already in ``rss_links``.
        ``meta_properties`` / ``meta_named``: results of
        ``parse_properties_attrs`` and ``parse_named_attrs``.
        Both URL lists are de-duplicated and keep document order.
    """
    site_info = {"url": url}
    print("%s: \nGetting response..." % (url))
    res = get_response(url)
    print("Parsing content...")
    html = parse_content(res)

    # De-duplicate after resolving to absolute form, so that "/rss" and
    # "http://host/rss" collapse into one entry.
    site_info["rss_links"] = _absolute_unique(url, get_urls(html, "link"))
    known_feeds = set(site_info["rss_links"])
    site_info["rss_leads"] = [
        lead
        for lead in _absolute_unique(url, get_urls(html, "a"))
        if lead not in known_feeds
    ]
    print("found %d RSS feed(s) and %d link(s) to possible feeds" %
          (len(site_info["rss_links"]), len(site_info["rss_leads"])))
    site_info["meta_properties"] = parse_properties_attrs(html)
    site_info["meta_named"] = parse_named_attrs(html)

    return site_info

In [3]:
def get_domain_link(url):
    """Reduce ``url`` to its scheme and network location only."""
    parts = urlsplit(url)
    # Path, query and fragment are blanked so that only "scheme://netloc" remains.
    return urlunsplit((parts[0], parts[1], "", "", ""))

# Harvest the outbound (non-Google) links of a Google News page and reduce
# each one to its site root.
GOOGLE_NEWS_URL = "https://news.google.com/news?cf=all&pz=1&ned=uk&siidp=d99c9ccef53edd7975a8b314decb9a979877&ict=ln"

# A timeout keeps a stalled connection from hanging the notebook.
res = requests.get(GOOGLE_NEWS_URL, timeout=10)
s = BeautifulSoup(res.content, "lxml")

# str() turns a missing href (None) into "None", which fails the "http" test.
hrefs = (str(a.get("href")) for a in s.find_all("a"))
links = [
    href
    for href in hrefs
    if href.startswith("http") and not re.search(r"google\.com", href)
]
links = pd.Series([get_domain_link(link) for link in links])
links = links.drop_duplicates()
# Compare against a str: the values are str, so the former comparison with
# b'' was always unequal and never filtered anything.
links = links.loc[links != ""]

In [4]:
# Save the de-duplicated site links as tab-separated text.
links.to_csv("glinks.csv", sep="\t", encoding="UTF-8")

In [6]:
# Lazy generator over a 20-item slice of the site links: no site is fetched
# until it is consumed. It can be consumed only once, so this cell has to be
# re-run before the list() call that drains it is run again.
dict_list = (get_site_info(url) for url in links[79:99])

In [7]:
# Draining the generator is what triggers the get_site_info calls, so %time
# here measures the whole crawl of the selected sites.
%time dicts = list(dict_list)

http://www.techtimes.com: 
Getting response...
Status 200
Parsing content...
found 1 RSS feed(s) and 1 link(s) to possible feeds
http://www.stroudnewsandjournal.co.uk: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 1 link(s) to possible feeds
http://www.gazetteseries.co.uk: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 1 link(s) to possible feeds
http://www.bristolpost.co.uk: 
Getting response...
http://www.bristolpost.co.uk 404 Client Error: Not Found for url: http://www.bristolpost.co.uk/
Parsing content...
found 0 RSS feed(s) and 0 link(s) to possible feeds
http://www.dorsetecho.co.uk: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 1 link(s) to possible feeds
http://www.miltonkeynes.co.uk: 
Getting response...
Status 200
Parsing content...
found 4 RSS feed(s) and 0 link(s) to possible feeds
http://www.weather.com: 
Getting response...
Status 200
Parsing content...
found 0 RSS feed(s) and 0 link(s) t

In [62]:
# Display the site-info dictionaries built by get_site_info.
dicts

[{'meta_named': None,
  'meta_properties': None,
  'rss_leads': [],
  'rss_links': [],
  'url': 'http://www.sevenoakschronicle.co.uk'},
 {'meta_named': {'description': 'The Bolton News, sport, Wanderers',
   'google-site-verification': '1fkS6UJOMYtA_lZ3vedEj4vPB0rUa4YXGe5SYvO7VgQ',
   'keywords': 'Bolton, News, Sport, Wanderers, Crime, Politics, Health, Education, Business, Leisure, Video, Videos, Photos, Photographs, Pictures, Neil Lennon, Phil Gartside',
   'msvalidate.01': '7C7F57AF0404D89F1BA6816F067C009A',
   'title': 'The Bolton News. First for news and sport in Bolton.',
   'viewport': 'width=device-width, initial-scale=1.0, user-scalable=yes',
   'vr:category': 'News',
   'vr:type': 'Text',
   'y_key': 'y_key" content="809215ec9b977b45'},
  'meta_properties': {'fb:app_id': '100403180042168'},
  'rss_leads': ['http://www.theboltonnews.co.uk/rss/'],
  'rss_links': [],
  'url': 'http://www.theboltonnews.co.uk'},
 {'meta_named': {'description': 'Medical news and health news headlin

In [None]:
# Work on an independent copy of a ten-row slice of the URL table.
df = urls[10:20].copy()
# Fetch each page, then parse it. get_response yields None when a fetch fails
# and parse_content maps falsy input to False, so "content" holds either a
# parsed document or False.
df["content"] = df["url"].apply(get_response).apply(parse_content)
# get_urls can return a generator, so each result is materialised into a list.
# "links" comes from <link> tags, "leads" from <a> tags.
df["links"] = df["content"].apply(get_urls, element = "link").apply(list)
df["leads"] = df["content"].apply(get_urls, element = "a").apply(list)

In [None]:
# Turn the list-valued "links" column into one row per link: every list
# becomes a Series, stack() flattens them into a single Series, and dropping
# the inner index level leaves each value labelled with its original row.
f = df.apply(lambda x: pd.Series(x['links']),axis=1).stack().reset_index(level=1, drop=True)
f.name = 'links'
# Joining on the index repeats the remaining columns once per link.
df = df.drop('links', axis=1).join(f)
# Same expansion for "leads". NOTE(review): df already has repeated index
# labels at this point, so this join presumably produces every link/lead
# combination per original row - confirm that this is intended.
f = df.apply(lambda x: pd.Series(x['leads']),axis=1).stack().reset_index(level=1, drop=True)
f.name = 'leads'
df = df.drop('leads', axis=1).join(f)

In [None]:
# Remove repeated rows and replace missing values with empty strings.
df = df.drop_duplicates().fillna("")

# Strip trailing slashes so that otherwise identical addresses compare equal.
for column in ("url", "links", "leads"):
    df[column] = df[column].str.rstrip("/")

In [None]:
# Resolve each link and lead against the URL of the page it was found on.
for column in ("links", "leads"):
    df[column] = df.apply(
        lambda row, column=column: urljoin(row["url"], str(row[column])),
        axis=1,
    )

In [None]:
# Request every link and record what check_feed_response reports for it
# (a content type, an HTTP status code, or None).
df["link_type"] = df["links"].apply(check_feed_response) 

In [None]:
# Inspect the frame after the link_type column has been added.
df

In [None]:
# Save the frame as tab-separated text.
# NOTE(review): the "content" column holds parsed documents, which are
# presumably written out as their full markup - confirm that this is wanted.
df.to_csv("test.csv", sep="\t", encoding="UTF-8")

In [None]:
# Flag the rows whose "links" value differs from their "leads" value.
df["test"] = df["links"] != df["leads"]

In [None]:
# parse_opengraph_attrs is not defined anywhere in this notebook, so the cell
# raised NameError on a fresh kernel. Open Graph data is published through
# <meta property="og:..."> tags, which parse_properties_attrs collects.
# NOTE(review): the result also includes non-"og:" properties - confirm that
# this matches what the missing helper returned.
df["og"] = df["content"].apply(parse_properties_attrs)

In [None]:
# Inspect the frame after the "og" column has been added.
df

In [None]:
# Display the site-info dictionaries collected earlier in the notebook.
dicts