In [15]:
import sqlite3
import logging
import pandas as pd
from urllib.parse import urlparse
from tld import get_tld, get_fld
import networkx as nx

In [3]:
# Dedicated logger for this analysis notebook; messages at INFO level and above pass through.
logger = logging.getLogger('analysis')
logger.setLevel(logging.INFO)

In [4]:
# Local path of the SQLite database that holds the crawl data (adjust to your machine).
SQLITE_LOCAL_PATH = '/home/berra/Documents/Mercator/gemeente-social/data/crawl-data-copy.sqlite'

# Social media names that are searched for in the first-level domains of crawled links.
# Further candidates, currently not tracked: 'whatsapp', 'snapchat', 'amazon', 'reddit'.
SM_FLDS = [
    'facebook',
    'instagram',
    'linkedin',
    'google',
    'youtube',
    'twitter',
]

In [5]:
# Read the crawl tables from SQLite into pandas DataFrames.
# The connection is only needed while the queries run, so it is closed in a
# `finally` clause: the database handle is released even if a query fails.
# (No cursor is created: pandas drives the connection directly.)
con = sqlite3.connect(SQLITE_LOCAL_PATH)
try:
    # Full contents of the `javascript` table
    javascript = pd.read_sql_query("SELECT * from javascript", con)

    # Full contents of the `site_visits` table
    site_visits = pd.read_sql_query("SELECT * from site_visits", con)
finally:
    con.close()

In [6]:
# Helper: reduce a crawled link to its first-level domain (FLD, i.e. TLD+1).
def parse_tld_url(url):
    """Return the first-level domain of ``url`` as computed by ``tld.get_fld``.

    ``get_fld`` is called with ``fail_silently=True``, so a URL it cannot
    resolve does not raise; per the ``tld`` documentation the result is then
    ``None`` (NOTE(review): confirm against the installed ``tld`` version).
    """
    first_level_domain = get_fld(url, fail_silently=True)
    return first_level_domain

# Helper: test whether a social media name occurs in a first-level domain.
def sm_match(url, sm_name):
    """Report whether ``sm_name`` is a substring of ``url``.

    Returns ``None`` when ``url`` is ``None`` (no domain available),
    otherwise the boolean result of the substring test.
    """
    if url is None:
        return None
    return sm_name in url

In [7]:
# First-level domain of every visited page, i.e. of the pages that are
# hyperlinked from the first visible page of a gemeente website.
site_visits['tld_url'] = site_visits['site_url'].apply(parse_tld_url)

# First-level domain of the gemeente website the visit is keyed on.
site_visits['tld_parent'] = site_visits['parent_url'].apply(parse_tld_url)

# One boolean flag column per social media site of interest.
flag_columns = ['is_' + platform for platform in SM_FLDS]
for flag_column, platform in zip(flag_columns, SM_FLDS):
    site_visits[flag_column] = site_visits['tld_url'].apply(sm_match, sm_name=platform)

# Aggregate per gemeente and platform: sum the flags of all visited links,
# then reduce the totals to "at least one link found" booleans.
flags_per_visit = site_visits[['parent_url'] + flag_columns]
flag_totals = flags_per_visit.groupby(['parent_url']).agg(['sum'])
social_media_pressence = flag_totals.applymap(lambda total: bool(total >= 1))
social_media_pressence.head()

Unnamed: 0_level_0,is_facebook,is_instagram,is_linkedin,is_google,is_youtube,is_twitter
Unnamed: 0_level_1,sum,sum,sum,sum,sum,sum
parent_url,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
http://gemeente.groningen.nl,True,True,False,False,False,True
http://www.aaenhunze.nl,True,False,False,False,True,True
http://www.aalsmeer.nl,True,False,True,False,True,True
http://www.aalten.nl,True,False,True,False,False,True
http://www.achtkarspelen.nl,True,False,False,False,False,True


In [172]:
### Generate Gephi network file (gemeente to social media)
#
# Nodes: every gemeente (labelled by its host name without scheme and leading
# "www.") and every social media platform in SM_FLDS.
# Edges: gemeente -- platform, only when at least one crawled link of that
# gemeente was flagged for the platform. The edge weight is the number of
# flagged rows in site_visits for that gemeente/platform pair.


def _gemeente_label(url):
    """Return the host name of ``url`` without scheme and leading ``www.``.

    ``str.strip('http://www.')`` must not be used for this: it removes any of
    the characters ``h t p : / w .`` from both ends of the string, which eats
    the first letters of names such as ``haarlem.nl`` or ``tilburg.nl``.
    """
    # Fall back to the raw value when the URL has no scheme (empty netloc).
    host = urlparse(url).netloc or url
    if host.startswith('www.'):
        host = host[len('www.'):]
    return host


sm_columns = ['is_' + sm for sm in SM_FLDS]
gem2sm = site_visits[['parent_url'] + sm_columns]

# Rows without a parent URL cannot be attributed to a gemeente.
gem2sm_linked = gem2sm.dropna(subset=['parent_url'])
gem_labels = gem2sm_linked['parent_url'].map(_gemeente_label)

# Count the flagged links per gemeente and platform in one pass.
# `.eq(True)` treats missing flags (None) as "not linked" and works for both
# Python and numpy booleans, unlike an `is True` identity test.
link_counts = gem2sm_linked[sm_columns].eq(True).groupby(gem_labels).sum()

gem2sm_graph = nx.Graph()

# Adding nodes from list of gemeente and social media; gemeente nodes use the
# same labels as the edges below so that no duplicate/isolated nodes appear.
gem2sm_graph.add_nodes_from(link_counts.index)
gem2sm_graph.add_nodes_from(SM_FLDS)

# Adding edges if social media is linked in gemeente website
for gem, counts in link_counts.iterrows():
    for sm in SM_FLDS:
        # Plain int keeps the attribute type simple for the GEXF export.
        n_is_sm = int(counts['is_' + sm])
        if n_is_sm > 0:
            gem2sm_graph.add_edge(sm, gem, weight=n_is_sm)
        
        

In [174]:
# Export the gemeente/social-media graph in .gexf format so Gephi can open it.
GEXF_OUTPUT_PATH = '/home/berra/Documents/Mercator/gemeente-social/analysis/gem2sm.gexf'
nx.write_gexf(gem2sm_graph, GEXF_OUTPUT_PATH)