In [1]:
import networkx as nx
from tqdm.notebook import tqdm
from networkx_query import search_nodes
from typing import Iterable,Dict,Optional

### Reading the Graph

In [2]:
# Load the crawled link graph from disk.
# A forward slash keeps the path portable across operating systems and avoids the
# invalid "\o" escape sequence that the backslash form put inside the string literal.
# NOTE: pickle files can execute arbitrary code while loading; only open graphs you produced yourself.
# NOTE(review): `nx.read_gpickle` was removed in NetworkX 3.0; on newer versions load the
# file with `pickle.load` instead.
G = nx.read_gpickle('output/out_graph_14_05_2022_22_20.pkl')

### Graph Analysis 

To modify the PageRank algorithm so that it favors URLs that contain more email addresses, we weight each edge by the number of email addresses in its target node (an edge whose target has no email addresses gets weight 1).

In [3]:
# Weight every edge by the number of emails found on its target node.
for _, target, edge_attrs in tqdm(G.edges(data=True)):
    email_count = len(G.nodes[target].get('emails', ()))
    # A target with no 'emails' attribute, or with an empty one, still gets weight 1.
    edge_attrs['num_of_emails'] = max(email_count, 1)

  0%|          | 0/536761 [00:00<?, ?it/s]

In [4]:
# Email-weighted PageRank: edges are weighted by their 'num_of_emails' attribute.
nx.set_node_attributes(
    G,
    values=nx.pagerank(G, weight='num_of_emails'),
    name='email_weighted_pagerank',
)

In [5]:
# PageRank with the library's default arguments, stored next to the email-weighted variant.
nx.set_node_attributes(
    G,
    values=nx.pagerank(G),
    name='normal_pagerank',
)

We can also take any other metric and turn it into one that is weighted by the number of emails in the node. For example, we can weight the in-degree of each node.
For that purpose, we can use the function `set_email_weighted_attribute`, which in turn uses the function `emails_weighted_metric`.

In [6]:
# Store each node's in-degree and out-degree as node attributes.
nx.set_node_attributes(G, values=dict(G.in_degree()), name='in_degree')
nx.set_node_attributes(G, values=dict(G.out_degree()), name='out_degree')

In [7]:
def emails_weighted_metric(node_data:dict,metric_field_name:str,emails_weight:float = 1.0) -> float:
    """Combine a node metric with the number of emails stored on the node.

    The result is ``metric + number_of_emails * emails_weight``.

    Args:
        node_data: Attribute dictionary of a single node.
        metric_field_name: Key in ``node_data`` that holds the base metric.
        emails_weight: Multiplier applied to the number of emails.

    Returns:
        The weighted metric. A node whose ``'emails'`` entry is missing, empty or
        ``None`` counts as having zero emails, so its base metric is returned
        unchanged. ``0`` is returned when the base metric itself is missing.
    """
    if metric_field_name not in node_data:
        # Nothing to weight without the base metric.
        return 0
    # Missing 'emails' means "no emails found": it must not wipe out the base metric.
    email_count = len(node_data.get('emails') or ())
    return node_data[metric_field_name] + email_count * emails_weight

In [8]:
def set_email_weighted_attribute(G:nx.Graph,attribute_name:str,weighted_attribute_name:Optional[str] = None,email_weight:float = 1.0):
    """Write an email-weighted version of ``attribute_name`` onto every node of ``G``.

    Args:
        G: Graph whose node attribute dictionaries are updated in place.
        attribute_name: Name of the existing node attribute to weight.
        weighted_attribute_name: Name of the attribute to write. When empty or
            ``None`` it defaults to ``email_weighted_<attribute_name>``.
        email_weight: Forwarded to ``emails_weighted_metric`` as ``emails_weight``.
    """
    target_name = weighted_attribute_name or f"email_weighted_{attribute_name}"
    for _, attrs in G.nodes(data=True):
        attrs[target_name] = emails_weighted_metric(
            attrs,
            metric_field_name=attribute_name,
            emails_weight=email_weight,
        )

In [9]:
# Adds a new attribute named 'email_weighted_in_degree' to every node.
set_email_weighted_attribute(G, attribute_name='in_degree')

### What are the most important URLs per domain (top 5)?

In [10]:
def get_top_n_important_urls_per_domain(domains_list:Iterable[str],importance_field:str,tie_breaker_field:Optional[str] = None,n:int =5) -> Dict[str,list]:
    """Rank the nodes of each requested domain and keep the ``n`` highest-ranked ones.

    Reads the module-level graph ``G``. Nodes are selected with ``search_nodes`` by
    comparing their ``'domain'`` attribute to the requested domain.

    Args:
        domains_list: Domains to look up.
        importance_field: Node attribute used as the primary sort key.
        tie_breaker_field: Node attribute used as the secondary sort key. When
            empty or ``None``, ``importance_field`` is used for both keys.
        n: Maximum number of node keys kept per domain.

    Returns:
        A dictionary mapping each domain to a list of at most ``n`` node keys,
        sorted from highest to lowest rank. A domain for which the search yields
        no nodes maps to an empty list.

    Raises:
        KeyError: If a selected node lacks ``importance_field`` or ``tie_breaker_field``.
    """
    if not tie_breaker_field:
        tie_breaker_field = importance_field

    def rank_key(node_key):
        # Primary key first, tie breaker second; both are compared in descending order.
        attrs = G.nodes[node_key]
        return (attrs[importance_field], attrs[tie_breaker_field])

    result_dict = {}
    for domain in tqdm(domains_list):
        domain_nodes_keys = search_nodes(G, {"eq": [("domain",), domain]})
        result_dict[domain] = sorted(domain_nodes_keys, key=rank_key, reverse=True)[:n]
    return result_dict

In [11]:
# Collect every distinct domain stored on the nodes, skipping nodes without a
# 'domain' attribute and domains that are at most one character long.
all_domains = {
    attrs['domain']
    for _, attrs in G.nodes(data=True)
    if 'domain' in attrs and len(attrs['domain']) > 1
}

In [12]:
# Top URLs for a handful of domains, ranked by the email-weighted PageRank.
get_top_n_important_urls_per_domain(
    domains_list=['wikipedia', 'github', 'stackoverflow', 'protonmail'],
    importance_field='email_weighted_pagerank',
)

  0%|          | 0/4 [00:00<?, ?it/s]

{'wikipedia': ['https://en.wikipedia.org/wiki/Email_address',
  'https://zh.wikipedia.org/wiki/%E9%9B%BB%E5%AD%90%E9%83%B5%E4%BB%B6%E5%9C%B0%E5%9D%80',
  'https://uk.wikipedia.org/wiki/%D0%90%D0%B4%D1%80%D0%B5%D1%81%D0%B0_%D0%B5%D0%BB%D0%B5%D0%BA%D1%82%D1%80%D0%BE%D0%BD%D0%BD%D0%BE%D1%97_%D0%BF%D0%BE%D1%88%D1%82%D0%B8',
  'https://fi.wikipedia.org/wiki/S%C3%A4hk%C3%B6posti',
  'https://eu.wikipedia.org/wiki/Posta_elektroniko'],
 'github': ['https://github.com/mdn/content/blob/main/README.md',
  'https://github.com/contact',
  'https://github.com/ietf-tools/datatracker/releases/tag/8.2.0',
  'https://github.com/ietf-tools/datatracker/issues/new/choose',
  'https://tc39.github.io/ecma262/#sec-tostring'],
 'stackoverflow': ['https://stackoverflow.com/tags/email-address',
  'https://stackoverflow.blog/2022/05/12/an-unfiltered-look-back-at-2022-april-fools/',
  'http://stackoverflow.com/questions/1335851/what-does-use-strict-do-in-javascript-and-what-is-the-reasoning-behind-it'],
 'protonma

In [14]:
# Rank the URLs of every collected domain by the email-weighted PageRank.
top_5_imporant_urls_for_all_domains = get_top_n_important_urls_per_domain(
    domains_list=all_domains,
    importance_field='email_weighted_pagerank',
)

  0%|          | 0/399 [00:00<?, ?it/s]