Required libraries:
lxml,
requests,
html_to_etree,
six,
pandas

In [1]:
from __future__ import unicode_literals
import re
import six

__author__ = 'Johannes Ahlmann'
__email__ = 'johannes@fluquid.com'
__version__ = '0.4.0'


# FIXME: still a lot wrong with the below
# - too permissive
# - likely too slow

# All fragments below are raw strings so that regex escapes such as `\.`
# and `\d` are not treated as (invalid) Python string escapes.

# Scheme plus an optional "www." prefix, shared by every site.
PREFIX = r'https?://(?:www\.)?'
# One regex fragment per supported site. Dots are escaped so that they only
# match a literal "." (an unescaped dot would accept e.g. "twitterXcom/").
# The Google entry uses `\+` so that only "google.com/+Name" style URLs
# match, rather than "google.com" followed by one or more slashes.
SITES = [r'twitter\.com/', r'youtube\.com/',
         r'(?:[a-z]{2}\.)?linkedin\.com/(?:company/|in/|pub/)',
         r'github\.com/', r'(?:[a-z]{2}-[a-z]{2}\.)?facebook\.com/',
         r'fb\.co',
         r'plus\.google\.com/', r'pinterest\.com/', r'instagram\.com/',
         r'snapchat\.com/', r'flipboard\.com/', r'flickr\.com',
         r'google\.com/\+', r'weibo\.com/', r'periscope\.tv/',
         r'telegram\.me/', r'soundcloud\.com', r'feeds\.feedburner\.com',
         r'vimeo\.com', r'slideshare\.net', r'vkontakte\.ru']
# Optional path segments that may sit between the site and the account name.
BETWEEN = ['user/', 'add/', 'pages/', '#!/', 'photos/',
           'u/0/']
# Characters accepted in the account/handle part of the URL.
ACCOUNT = r'[\w\+_@\.\-/%]+'
PATTERN = (
    r'%s(?:%s)(?:%s)?%s' %
    (PREFIX, '|'.join(SITES), '|'.join(BETWEEN), ACCOUNT))
# Case-insensitive matcher for social media profile URLs.
SOCIAL_REX = re.compile(PATTERN, flags=re.I)
# URLs that match SOCIAL_REX but are share/search/status links rather than
# profiles. Compiled case-insensitively so it agrees with SOCIAL_REX
# (otherwise "TWITTER.COM/intent/..." would slip past the blacklist).
BLACKLIST_RE = re.compile(
    r"""
    sharer\.php|
    /photos/.*\d{6,}|
    google\.com/(?:ads/|
                  analytics$|
                  chrome$|
                  intl/|
                  maps/|
                  policies/|
                  search$
               )|
    instagram\.com/p/|
    /share\?|
    /status/|
    /hashtag/|
    home\?status=|
    twitter\.com/intent/|
    twitter\.com/share|
    search\?|
    /search/|
    pinterest\.com/pin/create/|
    vimeo\.com/\d+$|
    /watch\?""",
    flags=re.VERBOSE | re.I)

import requests
from html_to_etree import parse_html_bytes

def _from_url(url, timeout=10):  # pragma: no cover
    """Fetch ``url`` and return the set of social media links/handles found.

    :param url: address passed straight to ``requests.get``.
    :param timeout: seconds to wait for the server before giving up, so a
        single unresponsive site cannot hang the caller indefinitely.
    :return: ``set`` of links/handles produced by ``find_links_tree``; on
        any failure (network error, parse error, ...) the sentinel set
        ``{None}`` is returned instead of raising.
    """
    try:
        res = requests.get(url, timeout=timeout)
        tree = parse_html_bytes(res.content, res.headers.get('content-type'))
        return set(find_links_tree(tree))
    except Exception:
        # Best-effort lookup: report failure through the sentinel value.
        # `Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit propagate, so a long run can still be interrupted.
        return {None}
    


    

def matches_string(string):
    """
    Tell whether `string` starts with a known social media url pattern
    and is not excluded by the blacklist.

    Returns None when the url pattern does not match at all, otherwise
    True/False depending on whether the blacklist leaves it alone.
    """
    hit = SOCIAL_REX.match(string)
    if hit is None:
        return None
    blacklisted = BLACKLIST_RE.search(string)
    return not blacklisted


def find_links_tree(tree):
    """
    find social media links/handles given an lxml etree.

    Three sources are scanned, in this order:
    1. `href` / `data-href` attributes of any element,
    2. the text of inline `<script>` elements (those without `src`),
    3. `twitter:site` / `twitter:creator` `<meta>` tags.

    :param tree: object exposing `xpath()` (e.g. an lxml etree).
    :return: generator of strings; a value found in several places is
        yielded once per occurrence.

    TODO:
    - `<fb:like href="http://www.facebook.com/elDiarioEs"`
    - `<g:plusone href="http://widgetsplus.com/"></g:plusone>`
    - <a class="reference external" href="https://twitter.com/intent/follow?screen_name=NASA">
    """
    for link in tree.xpath('//*[@href or @data-href]'):
        href = link.get('href') or link.get('data-href')
        if (href and
                isinstance(href, (six.string_types, six.text_type)) and
                matches_string(href)):
            yield href

    for script in tree.xpath('//script[not(@src)]/text()'):
        # finditer + group(0) always gives the full matched url, even if a
        # capturing group is ever added to the pattern (findall would then
        # return the group contents instead).
        for match in SOCIAL_REX.finditer(script):
            url = match.group(0)
            if not BLACKLIST_RE.search(url):
                yield url

    for meta in tree.xpath('//meta[contains(@name, "twitter:")]'):
        name = meta.get('name')
        if name in ('twitter:site', 'twitter:creator'):
            # FIXME: track fact that source is twitter
            content = meta.get('content')
            # A meta tag without (or with an empty) `content` attribute
            # carries no handle; skip it rather than yielding None.
            if content:
                yield content

In [2]:
from urllib.parse import urlparse,ParseResult
def format_url(url):
    """Normalise a website address into a full ``http(s)://www.<host>`` URL.

    - Surrounding whitespace is stripped.
    - A missing scheme defaults to ``http``; an explicit ``https`` is kept
      instead of being downgraded. Any other scheme is replaced by ``http``.
    - ``www.`` is prepended unless the host already starts with it. The
      check is case-insensitive, so ``WWW.EXAMPLE.ORG`` is left alone rather
      than becoming ``www.WWW.EXAMPLE.ORG``.

    :param url: address with or without scheme, e.g. ``WWW.EXAMPLE.ORG``.
    :return: the normalised URL as a string.
    """
    p = urlparse(url.strip(), 'http')
    if p.netloc:
        netloc, path = p.netloc, p.path
    else:
        # Without "//" urlparse puts "host/path" entirely into `path`;
        # split it so host and path land in the right components.
        host, slash, rest = p.path.partition('/')
        netloc, path = host, slash + rest
    if not netloc.lower().startswith('www.'):
        netloc = 'www.' + netloc
    scheme = 'https' if p.scheme == 'https' else 'http'

    return ParseResult(scheme, netloc, path, *p[3:]).geturl()
    

In [3]:
import requests
from html_to_etree import parse_html_bytes

In [4]:
def getSocialMedia(url, timeout=10):
    """Download a web page and return the social media links found on it.

    :param url: website address. If it carries no ``scheme://`` prefix
        (e.g. ``WWW.EXAMPLE.ORG``), ``http://`` is prepended, because
        ``requests`` rejects scheme-less URLs.
    :param timeout: seconds to wait for the server before giving up, so one
        unresponsive site cannot stall a whole batch run.
    :return: ``set`` of the values yielded by ``find_links_tree``.
    :raises requests.exceptions.RequestException: on network failures,
        timeouts or invalid URLs (errors are not swallowed here).
    """
    url = url.strip()
    if not re.match(r'[a-z][a-z0-9+.\-]*://', url, flags=re.I):
        url = 'http://' + url
    res = requests.get(url, timeout=timeout)
    tree = parse_html_bytes(res.content, res.headers.get('content-type'))

    return set(find_links_tree(tree))

In [5]:
import pandas as pd

In [6]:
# Read only the website column and discard rows where it is missing.
df = (
    pd.read_csv('sample_data_2017_M4.csv', usecols=['WebsiteAddressTxt'])
    .dropna()
)

In [7]:
# Work on an independent copy of the first 20 rows, so that adding columns
# to `df_top` later does not trigger pandas' SettingWithCopyWarning and
# never touches `df`.
df_top = df.head(20).copy()

In [8]:
# Display the working sample taken from `df`.
df_top

Unnamed: 0,WebsiteAddressTxt
1,WWW.NOAHCDC.ORG
2,RELEAFMICHIGAN.ORG
4,WWW.KARLASMITHFOUNDATION.ORG
5,www.developerie.com
6,WWW.HCANJ.ORG
9,www.greatergalileelearningcenter.com
11,http://www.alabamaprobatejudges.org
12,WWW.ANOKACOUNTYHISTORY.ORG
13,www.bsa-mdsc.org
14,WWW.WESTRHODERIVERKEEPER.ORG


In [15]:
# Look up the social media links for every website in the sample.
# `assign` builds a new frame instead of writing a column into what may be
# a slice of `df` (avoids SettingWithCopyWarning), and the function is
# passed directly rather than through a redundant lambda.
df_top = df_top.assign(
    socialMedia=df_top['WebsiteAddressTxt'].apply(getSocialMedia)
)

KeyboardInterrupt: 

In [None]:
# Peek at the first rows of the full website-address frame.
df.head()

In [47]:
# Spot-check the extractor on a single site, using a URL that already
# includes its scheme.
getSocialMedia('http://WWW.TOUCHALIFEKIDS.ORG')

{'http://twitter.com/touchalife',
 'http://www.facebook.com/pages/Touch-A-Life-Foundation/54920679807',
 'http://www.linkedin.com/company/touch-a-life-foundation',
 'http://www.youtube.com/user/TouchALifeFoundation',
 'https://instagram.com/touchalife/',
 'https://www.facebook.com/54920679807/',
 'https://www.facebook.com/54920679807/posts/10157117875939808'}