In [1]:
import stanza
stanza.download('en')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/master/resources_1.0.0.json: 120kB [00:00, 940kB/s]                                         
2020-08-15 04:18:18 INFO: Downloading default packages for language: en (English)...
2020-08-15 04:18:23 INFO: File exists: C:\Users\meena\stanza_resources\en\default.zip.
2020-08-15 04:18:31 INFO: Finished downloading models and saved to C:\Users\meena\stanza_resources.


In [44]:
import logging
import stanza
from bs4 import BeautifulSoup
#logging.basicConfig(level=logging.INFO)
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='mylog.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.INFO)
# This class gives the match rule for traversal for template given parsed beautiful soup object and org_name
# and given a traversal rule identify the org
class TraversalRule:
    NER_TAG_PERSON = "PERSON" 
    NER_TAG_ORG = "ORG"
    nlp = stanza.Pipeline(lang='en', processors='tokenize,ner')

    def __init__(self, parsed_page, org_name, traversal_rule):
        self.parsed_page = parsed_page
        self.org_name = org_name
        self.traversal_rule = list()
        if traversal_rule:
            self.traversal_rule = traversal_rule

        # https://stackoverflow.com/questions/54265391/find-all-end-nodes-that-contain-text-using-beautifulsoup4
    def mark_if_leaf_with_text(self, node):
        if node.name in ["style", "script", "link", "meta"]:
            return False
        if not node.text:
            return False
        elif len(node.find_all(text=False)) > 0:  # no other tags inside other than text
            return False
        node.leaf = True
        return False

    def is_leaf_nodes_with_org(self, node):
        if not node.leaf:
            return False
        text = node.text
        processed_text = self.nlp(text)
        no_of_entities = len(processed_text.entities)
        org = [ent for ent in processed_text.entities if ent.type in [self.NER_TAG_ORG]]
        no_of_org = len(org)
        self.candidates = list()
        if no_of_org > 0:
            print(no_of_org, processed_text.entities)
            node.no_of_org = no_of_org
            node.no_of_entities = no_of_entities
            node.org = org
            return True
        return False

    def find_leaf_nodes_with_org(self, node):
        self.mark_if_leaf_with_text(node)
        return self.is_leaf_nodes_with_org(node)

    def find_candidates(self):
        logging.info("Finding Candidate Org")
        # Use stanza to identify org in text, the nodes containing them and traversal for the leaf nodes
        candidate_org_nodes_details = self.parsed_page.find_all(self.find_leaf_nodes_with_org)
        candidate_orgs = list()
        for node in candidate_org_nodes_details:
            
            candidate_org = dict()
            # This condition may not hold
            if node.no_of_org == 1:
                candidate_org['org_entity'] = node.org[0].text
                candidate_org['ancestors'] = [parent.name for parent in node.parents]
                candidate_org['node_name'] = node.name
                candidate_orgs.append(candidate_org)
        self.candidates = candidate_orgs

    def pick_traversal_from_org(self):
        logging.info("Picking the traversal rule given org")
        self.find_candidates()
        for candidate in self.candidates:
            candidate_org_name = candidate['org_entity']
            #candidate_anc = candidate['ancestors']
            #candidate_nod = candidate['node_name']
            normal_name = self.get_normal_name()
            #print("\n candidate org nmae is "+candidate['ancestors'])
            #logging.info("\n Candidate Org Name: {}, ancestor name: {}, node  name: {} \n".format(candidate_org_name, candidate_anc, candidate_nod))
            if candidate_org_name == self.org_name or candidate_org_name == normal_name:
                self.traversal_rule = candidate['ancestors']
                print(candidate_org_name)

    def get_org_from_traversal(self):
        logging.info("Picking the org from candidates based on the traversal rule")
        self.find_candidates()
        candidates = self.candidates
        if candidates:
            for candidate in candidates:
                if candidate['ancestors'] == self.traversal_rule:
                    return candidate['org_entity']

        return None


    def get_normal_name(self):
        if "," in self.org_name:
            split = self.org_name.split(",")
            not_last_name = split[1]
            last_name = split[0]
            normal_name = not_last_name + " " + last_name
            return normal_name.strip()
        return "None"

2020-08-29 09:12:43 INFO: Loading these models for language: en (English):
| Processor | Package   |
-------------------------
| tokenize  | ewt       |
| ner       | ontonotes |

2020-08-29 09:12:43 INFO: Use device: cpu
2020-08-29 09:12:43 INFO: Loading: tokenize
2020-08-29 09:12:43 INFO: Loading: ner
2020-08-29 09:12:44 INFO: Done loading processors!


In [45]:
with open("./resources/linkedin-origin-ab-testing-nicolai-kramer-jakobsen.html", "r", encoding="UTF-8") as fp:
    html_content = fp.read()

soup = BeautifulSoup(html_content, 'lxml')
soup = BeautifulSoup(soup.prettify('utf-8'), 'lxml')  # some inputs are so messy that they affect the output
t_rule = TraversalRule(soup, "Facebook", None)

t_rule.pick_traversal_from_org()
print(t_rule.traversal_rule)

2020-08-29 09:12:52 INFO: Picking the traversal rule given org
2020-08-29 09:12:52 INFO: Finding Candidate Org


1 [{
  "text": "Facebook",
  "type": "ORG",
  "start_char": 13,
  "end_char": 21
}]
3 [{
  "text": "A/B",
  "type": "ORG",
  "start_char": 6,
  "end_char": 9
}, {
  "text": "Fintech",
  "type": "ORG",
  "start_char": 219,
  "end_char": 226
}, {
  "text": "Big Data",
  "type": "ORG",
  "start_char": 250,
  "end_char": 258
}]
1 [{
  "text": "James Lind",
  "type": "PERSON",
  "start_char": 16,
  "end_char": 26
}, {
  "text": "18th century",
  "type": "DATE",
  "start_char": 29,
  "end_char": 41
}, {
  "text": "British East India",
  "type": "LOC",
  "start_char": 52,
  "end_char": 70
}, {
  "text": "the Jenner Institute",
  "type": "ORG",
  "start_char": 84,
  "end_char": 104
}, {
  "text": "A/B",
  "type": "PRODUCT",
  "start_char": 157,
  "end_char": 160
}, {
  "text": "two",
  "type": "CARDINAL",
  "start_char": 208,
  "end_char": 211
}, {
  "text": "one",
  "type": "CARDINAL",
  "start_char": 253,
  "end_char": 256
}, {
  "text": "two",
  "type": "CARDINAL",
  "start_char": 303,
  "e

In [47]:
import urllib3
from bs4 import BeautifulSoup
import json
import tldextract
import certifi
import ssl

ssl_context = ssl.SSLContext()
ssl_context.load_verify_locations(certifi.where())
http = urllib3.PoolManager(ssl_context=ssl_context)

class OrgTraversalRules:
    persistence_type = "json"

    def __init__(self, filename):
        self.filename = filename
        self.org_traversal_rules = dict()
        self.load_org_traversal_rules()

    def load_org_traversal_rules(self):
        with open(self.filename, "r") as fp:
            self.org_traversal_rules =  json.load(fp)
            #print(json.load(fp))

    def get_org_traversal_for_url(self, url):
        extract_result = tldextract.extract(url)
        print("\n URL IS" + url)
        host_url = extract_result.registered_domain
        if host_url in self.org_traversal_rules:
            return self.org_traversal_rules[host_url]
        return None

class FindOrgWithTraversal:

    def __init__(self, url, org_traversal_rule_for_site):
        self.url = url
        self.org_traversal_rule = org_traversal_rule_for_site
        self.page_content = None

    def load_page_content(self):
        self.page_content = http.request('GET', self.url).data
        #print(self.page_content)

    def get_org(self):
        self.load_page_content()
        soup = BeautifulSoup(self.page_content, 'lxml')
        soup = BeautifulSoup(soup.prettify('utf-8'), 'lxml')
        t = TraversalRule(soup, None, self.org_traversal_rule)
        return t.get_org_from_traversal()


class FindOrg:
    domain_traversal_file = "./resources/domain_traversal_rules-500.json"
    domain_traversal = OrgTraversalRules(domain_traversal_file)

    def __init__(self, url):
        self.url = url
        extracted = tldextract.extract(url)
        site = extracted.registered_domain
        #print("\n site is"+ site)
        #if "." in site:
        #    split = site.split(".")
          #  domain_name = split[0]
            #last_name = split[0]
         #   site = domain_name.strip()
        print("\n corrected site is"+site)
        self.find_org = FindOrgWithTraversal(self.url, self.domain_traversal.org_traversal_rules[site])
        print(self.find_org.get_org())

    def get_org(self):       
        print("\n in self.find_org.get_org function as"+ self.find_org.get_org())
        return self.find_org.get_org()

In [48]:
#
print(FindOrg("https://www.Facebook.com/TechRadar").get_org())
print(FindOrg("https://www.linkedin.com/pulse/automating-user-creation-aws-sftp-service-transfer-arjun-dandagi/").get_org())
#print(FindAuthor("https://www.linkedin.com/pulse/automating-user-creation-aws-sftp-service-transfer-arjun-dandagi/").get_author())


 corrected site islinkedin.com


2020-08-29 09:13:44 INFO: Picking the org from candidates based on the traversal rule
2020-08-29 09:13:44 INFO: Finding Candidate Org


1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 32,
  "end_char": 35
}]
1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 34,
  "end_char": 37
}]
3 [{
  "text": "DevOps AWS",
  "type": "ORG",
  "start_char": 9,
  "end_char": 19
}, {
  "text": "CDA",
  "type": "ORG",
  "start_char": 24,
  "end_char": 27
}, {
  "text": "CSA",
  "type": "ORG",
  "start_char": 29,
  "end_char": 32
}]
1 [{
  "text": "Facebook",
  "type": "ORG",
  "start_char": 13,
  "end_char": 21
}]
1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 28,
  "end_char": 31
}]
1 [{
  "text": "one",
  "type": "CARDINAL",
  "start_char": 136,
  "end_char": 139
}, {
  "text": "0",
  "type": "CARDINAL",
  "start_char": 237,
  "end_char": 238
}, {
  "text": "2012-10-17",
  "type": "DATE",
  "start_char": 835,
  "end_char": 845
}, {
  "text": "Sid\"",
  "type": "WORK_OF_ART",
  "start_char": 902,
  "end_char": 906
}, {
  "text": "AllowListingOfUserFolder",
  "type": "WORK_OF_ART",
  "start_char": 909,
  "end_char": 9

2020-08-29 09:14:11 INFO: Picking the org from candidates based on the traversal rule
2020-08-29 09:14:11 INFO: Finding Candidate Org


1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 32,
  "end_char": 35
}]
1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 34,
  "end_char": 37
}]
3 [{
  "text": "DevOps AWS",
  "type": "ORG",
  "start_char": 9,
  "end_char": 19
}, {
  "text": "CDA",
  "type": "ORG",
  "start_char": 24,
  "end_char": 27
}, {
  "text": "CSA",
  "type": "ORG",
  "start_char": 29,
  "end_char": 32
}]
1 [{
  "text": "Facebook",
  "type": "ORG",
  "start_char": 13,
  "end_char": 21
}]
1 [{
  "text": "AWS",
  "type": "ORG",
  "start_char": 28,
  "end_char": 31
}]
1 [{
  "text": "one",
  "type": "CARDINAL",
  "start_char": 136,
  "end_char": 139
}, {
  "text": "0",
  "type": "CARDINAL",
  "start_char": 237,
  "end_char": 238
}, {
  "text": "2012-10-17",
  "type": "DATE",
  "start_char": 835,
  "end_char": 845
}, {
  "text": "Sid\"",
  "type": "WORK_OF_ART",
  "start_char": 902,
  "end_char": 906
}, {
  "text": "AllowListingOfUserFolder",
  "type": "WORK_OF_ART",
  "start_char": 909,
  "end_char": 9

TypeError: must be str, not NoneType