# Install the relevant libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install transformers wikipedia newspaper3k GoogleNews
!pip install pyvis==0.3.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import math
import torch
import wikipedia
from newspaper import Article, ArticleException
from GoogleNews import GoogleNews
import IPython
from pyvis.network import Network

# Load the REBEL model
REBEL is a text2text model trained by Babelscape by fine-tuning BART to translate a raw input sentence containing entities and implicit relations into a set of triplets that explicitly refer to those relations. It has been trained on more than 200 different relation types.

The authors created a custom dataset for REBEL pre-training, using entities and relations found in Wikipedia abstracts and Wikidata, and filtering them using a RoBERTa Natural Language Inference model (similar to this model).

In [None]:
# Load the tokenizer and the seq2seq model from the "Babelscape/rebel-large"
# checkpoint. Both are notebook-level globals read by from_text_to_kb.
tokenizer = AutoTokenizer.from_pretrained("Babelscape/rebel-large")
model = AutoModelForSeq2SeqLM.from_pretrained("Babelscape/rebel-large")

# From short text to KB

In [None]:
def extract_relations_from_model_output(text):
    """Parse one decoded model generation into relation dictionaries.

    The text is read as a stream of whitespace-separated tokens laid out as
    ``<triplet> head <subj> tail <obj> relation type``. Several
    ``<subj> ... <obj> ...`` groups may follow a single ``<triplet>`` and
    then share its head.

    Args:
        text: decoded output string; ``<s>``, ``<pad>`` and ``</s>`` markers
            are removed before parsing.

    Returns:
        A list of ``{'head': ..., 'type': ..., 'tail': ...}`` dicts, in the
        order the relations were completed.
    """
    found = []
    # Token buffers for the three fields; insertion order is the key order
    # of every emitted relation dict.
    buffers = {'head': [], 'type': [], 'tail': []}
    active = None  # field currently receiving tokens, None until a marker is seen

    def flush():
        found.append({field: ' '.join(tokens)
                      for field, tokens in buffers.items()})

    cleaned = text.strip()
    for marker in ("<s>", "<pad>", "</s>"):
        cleaned = cleaned.replace(marker, "")

    for token in cleaned.split():
        if token == "<triplet>":
            # A new head starts: emit the pending relation, if any.
            active = 'head'
            if buffers['type']:
                flush()
                buffers['type'] = []
            buffers['head'] = []
        elif token == "<subj>":
            # A new tail for the same head: emit the pending relation, if any.
            active = 'tail'
            if buffers['type']:
                flush()
            buffers['tail'] = []
        elif token == "<obj>":
            active = 'type'
            buffers['type'] = []
        elif active is not None:
            buffers[active].append(token)

    # Emit the trailing relation only when all three fields were filled.
    if all(buffers.values()):
        flush()
    return found

# Filter and normalize entities with Wikipedia

- remove all entities that don't have a page on Wikipedia
- merge entities if they have the same Wikipedia page

In [None]:
class KB():
    """In-memory knowledge base of Wikipedia-normalised entities and relations.

    Attributes:
        entities: dict mapping a Wikipedia page title to a dict with the
            page's ``url`` and ``summary``.
        relations: list of ``{"head", "type", "tail", "meta"}`` dicts, where
            ``meta["spans"]`` lists the spans the relation was found in.

    Note: a later cell in this notebook defines another ``KB`` class whose
    ``add_relation`` takes extra article arguments; once that cell has run,
    the name ``KB`` refers to that later class.
    """

    def __init__(self):
        self.entities = {}
        self.relations = []

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r1):
        """Add the spans of ``r1`` to the stored relation equal to it.

        Raises:
            IndexError: if no stored relation equals ``r1``.
        """
        r2 = [r for r in self.relations
              if self.are_relations_equal(r1, r)][0]
        spans_to_add = [span for span in r1["meta"]["spans"]
                        if span not in r2["meta"]["spans"]]
        r2["meta"]["spans"] += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` on Wikipedia.

        Returns:
            A dict with ``title``, ``url`` and ``summary`` of the page, or
            ``None`` when the lookup fails for any ordinary reason.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best-effort lookup: any library or network error means "no
            # usable page". Catching Exception rather than using a bare
            # except lets KeyboardInterrupt / SystemExit still stop a run.
            return None

    def add_entity(self, e):
        """Store entity ``e`` under its title, keeping all other fields."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def add_relation(self, r):
        """Validate ``r`` against Wikipedia and store or merge it.

        The relation is dropped when its head or tail has no Wikipedia data.
        Otherwise both entities are stored, ``r["head"]`` / ``r["tail"]`` are
        rewritten in place to the page titles, and ``r`` is appended or merged
        into the equal stored relation.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]

        # manage new relation
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def print(self):
        """Print every stored entity and relation, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")

# Extract KB from web article

In [None]:
def from_text_to_kb(text, article_url, span_length=128, article_title=None,
                    article_publish_date=None, verbose=False):
    """Build a ``KB`` from ``text`` by running the model over token spans.

    The text is tokenized once, cut into ``span_length``-token windows that
    overlap so the windows fit inside the token sequence, and every window
    is passed to ``model.generate``. Each decoded generation is parsed into
    relations, tagged with the window it came from, and added to a new KB.

    Args:
        text: text to extract relations from.
        article_url: key under which spans are recorded in each relation's
            ``meta``.
        span_length: number of tokens per window.
        article_title: forwarded to ``KB.add_relation``.
        article_publish_date: forwarded to ``KB.add_relation``.
        verbose: when True, print token/span counts and span boundaries.

    Returns:
        The populated ``KB`` instance.
    """
    # tokenize whole text
    encoded = tokenizer([text], return_tensors="pt")

    # compute span boundaries
    num_tokens = len(encoded["input_ids"][0])
    if verbose:
        print(f"Input has {num_tokens} tokens")
    num_spans = math.ceil(num_tokens / span_length)
    if verbose:
        print(f"Input has {num_spans} spans")
    overlap = math.ceil((num_spans * span_length - num_tokens) /
                        max(num_spans - 1, 1))
    # Each window starts (span_length - overlap) tokens after the previous one.
    stride = span_length - overlap
    spans_boundaries = [[stride * k, stride * k + span_length]
                        for k in range(num_spans)]
    if verbose:
        print(f"Span boundaries are {spans_boundaries}")

    # transform input with spans: one batch row per window
    token_ids = encoded["input_ids"][0]
    id_rows = [token_ids[lo:hi] for lo, hi in spans_boundaries]
    attention = encoded["attention_mask"][0]
    mask_rows = [attention[lo:hi] for lo, hi in spans_boundaries]
    batch = {
        "input_ids": torch.stack(id_rows),
        "attention_mask": torch.stack(mask_rows)
    }

    # generate relations
    num_return_sequences = 3
    generated_tokens = model.generate(
        **batch,
        max_length=256,
        length_penalty=0,
        num_beams=3,
        num_return_sequences=num_return_sequences,
    )

    # decode relations (special tokens are kept: the parser relies on them)
    decoded_preds = tokenizer.batch_decode(generated_tokens,
                                           skip_special_tokens=False)

    # create kb
    kb = KB()
    for pred_index, sentence_pred in enumerate(decoded_preds):
        # Every window yields num_return_sequences consecutive predictions.
        span = spans_boundaries[pred_index // num_return_sequences]
        for relation in extract_relations_from_model_output(sentence_pred):
            relation["meta"] = {article_url: {"spans": [span]}}
            kb.add_relation(relation, article_title, article_publish_date)

    return kb

In [None]:
class KB():
    """Knowledge base of Wikipedia-normalised entities, relations and sources.

    Attributes:
        entities: ``{entity_title: {"url": ..., "summary": ...}}``.
        relations: list of ``{"head", "type", "tail", "meta"}`` dicts where
            ``meta`` maps ``article_url -> {"spans": [...]}``.
        sources: ``{article_url: {"article_title": ...,
            "article_publish_date": ...}}``.
    """

    def __init__(self):
        self.entities = {} # { entity_title: {...} }
        self.relations = [] # [ head: entity_title, type: ..., tail: entity_title,
          # meta: { article_url: { spans: [...] } } ]
        self.sources = {} # { article_url: {...} }

    def merge_with_kb(self, kb2):
        """Merge every relation of ``kb2`` (and its sources) into this KB.

        Each relation is re-validated against Wikipedia exactly as in
        ``add_relation``. All article URLs found in a relation's ``meta`` are
        carried over together with their entry in ``kb2.sources``, so
        relations that were seen in several articles keep all of them.

        Raises:
            KeyError: if a relation references a URL missing from
                ``kb2.sources``.
        """
        for r in kb2.relations:
            relation_sources = {url: kb2.sources[url] for url in r["meta"]}
            if not self._resolve_entities(r):
                continue
            for url, source_data in relation_sources.items():
                self._register_source(url, source_data["article_title"],
                                      source_data["article_publish_date"])
            self._store_relation(r)

    def are_relations_equal(self, r1, r2):
        """Return True when both relations share head, type and tail."""
        return all(r1[attr] == r2[attr] for attr in ["head", "type", "tail"])

    def exists_relation(self, r1):
        """Return True when a relation equal to ``r1`` is already stored."""
        return any(self.are_relations_equal(r1, r2) for r2 in self.relations)

    def merge_relations(self, r2):
        """Fold the ``meta`` of ``r2`` into the stored relation equal to it.

        For every article URL in ``r2["meta"]``: an unknown URL is attached
        as-is; for a known URL only the spans not already present are added.

        Raises:
            IndexError: if no stored relation equals ``r2``.
        """
        r1 = [r for r in self.relations
              if self.are_relations_equal(r2, r)][0]

        for article_url, article_meta in r2["meta"].items():
            # if different article
            if article_url not in r1["meta"]:
                r1["meta"][article_url] = article_meta

            # if existing article
            else:
                known_spans = r1["meta"][article_url]["spans"]
                spans_to_add = [span for span in article_meta["spans"]
                                if span not in known_spans]
                known_spans += spans_to_add

    def get_wikipedia_data(self, candidate_entity):
        """Look up ``candidate_entity`` on Wikipedia.

        Returns:
            A dict with ``title``, ``url`` and ``summary`` of the page, or
            ``None`` when the lookup fails for any ordinary reason.
        """
        try:
            page = wikipedia.page(candidate_entity, auto_suggest=False)
            entity_data = {
                "title": page.title,
                "url": page.url,
                "summary": page.summary
            }
            return entity_data
        except Exception:
            # Best-effort lookup: any library or network error means "no
            # usable page". Catching Exception rather than using a bare
            # except lets KeyboardInterrupt / SystemExit still stop a run.
            return None

    def add_entity(self, e):
        """Store entity ``e`` under its title, keeping all other fields."""
        self.entities[e["title"]] = {k:v for k,v in e.items() if k != "title"}

    def _resolve_entities(self, r):
        """Normalise head/tail of ``r`` via Wikipedia; False if either is unknown.

        On success both entities are stored and ``r`` is rewritten in place to
        use the Wikipedia page titles.
        """
        # check on wikipedia
        candidate_entities = [r["head"], r["tail"]]
        entities = [self.get_wikipedia_data(ent) for ent in candidate_entities]

        # if one entity does not exist, stop
        if any(ent is None for ent in entities):
            return False

        # manage new entities
        for e in entities:
            self.add_entity(e)

        # rename relation entities with their wikipedia titles
        r["head"] = entities[0]["title"]
        r["tail"] = entities[1]["title"]
        return True

    def _register_source(self, article_url, article_title, article_publish_date):
        """Record the article metadata for ``article_url`` unless already known."""
        if article_url not in self.sources:
            self.sources[article_url] = {
                "article_title": article_title,
                "article_publish_date": article_publish_date
            }

    def _store_relation(self, r):
        """Append ``r`` or merge it into the equal stored relation."""
        if not self.exists_relation(r):
            self.relations.append(r)
        else:
            self.merge_relations(r)

    def add_relation(self, r, article_title, article_publish_date):
        """Validate ``r`` against Wikipedia, record its source and store it.

        The relation is dropped when its head or tail has no Wikipedia data.
        The source is registered under the first article URL in ``r["meta"]``.
        """
        if not self._resolve_entities(r):
            return

        # add source if not in kb
        article_url = list(r["meta"].keys())[0]
        self._register_source(article_url, article_title, article_publish_date)

        # manage new relation
        self._store_relation(r)

    def print(self):
        """Print every stored entity, relation and source, one per line."""
        print("Entities:")
        for e in self.entities.items():
            print(f"  {e}")
        print("Relations:")
        for r in self.relations:
            print(f"  {r}")
        print("Sources:")
        for s in self.sources.items():
            print(f"  {s}")

In [None]:
def get_article(url):
    """Create an ``Article`` for ``url``, download it and parse it.

    Returns:
        The ``Article`` object after ``download()`` and ``parse()`` have run.
    """
    fetched = Article(url)
    fetched.download()
    fetched.parse()
    return fetched

def from_url_to_kb(url):
    """Fetch the article at ``url`` and extract a KB from its text.

    The article's title and publish date are forwarded to
    ``from_text_to_kb`` as source metadata; the article's own ``url``
    attribute is used as the source key.
    """
    parsed = get_article(url)
    title, published = parsed.title, parsed.publish_date
    return from_text_to_kb(
        parsed.text,
        parsed.url,
        article_title=title,
        article_publish_date=published,
    )


# Get URLs from submission JSON files



In [None]:
import json

# Read the submissions from the mounted Drive.
with open("/content/drive/MyDrive/consented_submissions.json",'r') as submissions_file:
    submissions = json.load(submissions_file)

length = len(submissions)
print(submissions)

# Parallel lists: position k in each list comes from submissions[k].
highlighted_texts = []
explanations = []
source_urls = []

for submission in submissions:
    highlighted_texts.append(submission["highlighted_text"])
    explanations.append(submission["explanation"])
    source_urls.append(submission["source_url"])

# print(source_urls)


# Named Entity Recognition and Entity Linking

In [None]:
!pip install tagme
import tagme
import logging
import sys
import os.path
import requests
import json
from urllib.parse import urlparse

endpoint_url = "https://query.wikidata.org/sparql"

# The TagMe token is read from the GCUBE_TOKEN environment variable when set.
# NOTE(review): the literal fallback below is a credential committed to the
# notebook; it is kept only so existing runs keep working. Rotate it and drop
# the fallback once GCUBE_TOKEN is configured in the runtime.
tagme.GCUBE_TOKEN = os.environ.get(
    "GCUBE_TOKEN", "59f2ad7b-fd30-4a28-9746-bcc1b10163f1-843339462")

# Logger named after the running program; used by Annotate to report
# annotations it cannot parse.
program = os.path.basename(sys.argv[0])
logger = logging.getLogger(program)
logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s')

def Annotate(txt, language="en", theta=0.1):
    """
    text entities -> wiki entities, using TagMe

    :param txt: str, the text to annotate
    :param language: language passed to TagMe; 'en' means English
    :param theta: [0, 1] entity score threshold passed to get_annotations
    :return: dict {(A, B): score} where A is the entity as it appears in the
        text, B is the wiki entity it was linked to, and score is the score
        string parsed from the annotation. Annotations whose string form
        cannot be parsed are logged and skipped.
    """
    annotations = tagme.annotate(txt, lang=language)
    dic = dict()
    if annotations is None:
        # NOTE(review): tagme.annotate appears to return None when the service
        # gives no usable response -- confirm against the tagme package.
        logger.error('no annotation response for the given text')
        return dic
    for ann in annotations.get_annotations(theta):
        # Expected string form: "<A> -> <B> (score: <score>)"
        rendered = str(ann)
        try:
            parts = rendered.split(" -> ")
            A, remainder = parts[0], parts[1]
            B = remainder.split(" (score: ")[0]
            score = remainder.split(" (score: ")[1].split(")")[0]
        except IndexError:
            # Lazy %-formatting: the previous `'...' + ann` raised TypeError
            # because ann is not a str, hiding the real parsing problem.
            logger.error('error annotation about %s', rendered)
            continue
        dic[(A, B)] = score
    return dic

def _wikidata_id_from_uri(uri):
    """Return the last path segment of ``uri`` without its final character."""
    path = urlparse(uri)[2]  # index 2 is the path element
    # NOTE(review): the final character is dropped as in the original code;
    # presumably the API wraps URIs in angle brackets -- confirm.
    return path.split("/")[-1][0:-1]


def find_id(entity_list):
    """Find Wikidata ids for the given names through the Falcon 2.0 API.

    One POST request is sent per name. For each response, the first entry of
    ``entities_wikidata`` and the first entry of ``relations_wikidata`` (when
    present and non-empty) contribute one id each.

    :param entity_list: iterable of str, the names/texts to look up
    :return: (entity_id_list, relation_id_list), two lists of id strings
    :raises ValueError: if a response body is not valid JSON
    """
    #find the id in wikidata
    url="https://labs.tib.eu/falcon/falcon2/api?mode=long"
    headers = {'Content-type': 'application/json'}
    entity_id_list = []
    relation_id_list = []
    targets = (('entities_wikidata', entity_id_list),
               ('relations_wikidata', relation_id_list))
    for name in entity_list:
        response = requests.post(url, data=json.dumps({"text": name}),
                                 headers=headers)
        # Parse the body as JSON. The previous eval() executed whatever the
        # server returned and failed on JSON literals such as true/false/null.
        payload = json.loads(str(response.content, 'utf-8'))
        for key, id_list in targets:
            hits = payload[key] if key in payload else []
            if hits:
                id_list.append(_wikidata_id_from_uri(hits[0]['URI']))
    return entity_id_list, relation_id_list


## 相关 ## L1.1



Traceback (most recent call last):
  File "/usr/local/bin/pip3", line 5, in <module>
    from pip._internal.cli.main import main
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main.py", line 9, in <module>
    from pip._internal.cli.autocompletion import autocomplete
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/autocompletion.py", line 10, in <module>
    from pip._internal.cli.main_parser import create_main_parser
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/main_parser.py", line 9, in <module>
    from pip._internal.build_env import get_runnable_pip
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/build_env.py", line 19, in <module>
    from pip._internal.cli.spinners import open_spinner
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/cli/spinners.py", line 9, in <module>
    from pip._internal.utils.logging import get_indentation
  File "/usr/local/lib/python3.10/dist-packages/pip/_internal/utils

# Find keywords for one topic

In [None]:
topic = "#L3"
def find_topic_keywords(topic):
    """Count the wiki entities annotated in submissions that mention ``topic``.

    For every submission index, the highlighted text and then the explanation
    are checked; each non-empty text containing ``topic`` is annotated with
    ``Annotate(..., theta=0.2)`` and every resulting wiki entity (second
    element of each annotation key) is counted once.

    Reads the notebook-level ``length``, ``highlighted_texts`` and
    ``explanations``.

    Returns:
        dict mapping wiki entity title to the number of times it was annotated.
    """
    keywords_dic = {}
    for idx in range(length):
        candidate_texts = (highlighted_texts[idx], explanations[idx])
        for candidate in candidate_texts:
            if topic in candidate and candidate != '':
                for annotation_key in Annotate(candidate, theta=0.2).keys():
                    wiki_entity = annotation_key[1]
                    keywords_dic[wiki_entity] = keywords_dic.get(wiki_entity, 0) + 1

    return keywords_dic

keywords_dic = find_topic_keywords(topic)
    #     entity_id_list, relation_id_list = find_id(entity_list)      

# Ranking keywords

In [None]:
keywords_dic_sorted = sorted(keywords_dic.items(), key=lambda x: x[1], reverse=True)
print(keywords_dic_sorted)

[('Information retrieval', 21), ('Probability', 19), ('BM-25 (MRL)', 15), ('Okapi BM25', 5), ('Generative model', 3), ('Document', 3), ('Infrared', 3), ('Language model', 3), ('Maximum likelihood', 3), ('Conceptual model', 2), ('Tf–idf', 2), ('Python (programming language)', 2), ('Lagrangian point', 2), ('Dirichlet distribution', 2), ('Relevance', 2), ('Statistical model', 2), ('Query likelihood model', 2), ('Skewness', 2), ('Poisson distribution', 2), ('Mixture model', 2), ('Discriminative model', 1), ('Mathematics', 1), ('Israel Defense Forces', 1), ('Scientific modelling', 1), ('Understanding', 1), ('Difference (philosophy)', 1), ('Generative grammar', 1), ('Description', 1), ('Computer programming', 1), ('Smoothing', 1), ('Logistic regression', 1), ('Logit', 1), ('Apollo Lunar Module', 1), ('Wikipedia', 1), ('Web page', 1), ('Condensed matter physics', 1), ('PageRank', 1), ('Algorithm', 1), ('Information theory', 1), ('Machine learning', 1), ('Ranking (information retrieval)', 1), 

# Find URLs for one topic


In [None]:
topic = "#L3"

def find_topic_urls(topic):
    """Return the source URLs of submissions that mention ``topic``.

    A submission matches when ``topic`` occurs in its highlighted text or in
    its explanation. Reads the notebook-level ``length``,
    ``highlighted_texts``, ``explanations`` and ``source_urls``.

    Returns:
        list of URLs in submission order (duplicates are kept).
    """
    return [
        source_urls[idx]
        for idx in range(length)
        if topic in highlighted_texts[idx] or topic in explanations[idx]
    ]

urls_list = find_topic_urls(topic)
print(urls_list)     
print(len(urls_list)) 



# Extract KB from multiple articles

In [None]:
def get_news_links(query, lang="en", region="US", pages=1, max_links=100000):
    """Search GoogleNews for ``query`` and return the distinct result links.

    Args:
        query: search string.
        lang: language passed to ``GoogleNews``.
        region: region passed to ``GoogleNews``.
        pages: number of result pages requested (page indices 0..pages-1).
        max_links: maximum number of links returned.

    Returns:
        A list of at most ``max_links`` distinct links. Links are
        de-duplicated through a ``set``, so their order is not defined.
    """
    client = GoogleNews(lang=lang, region=region)
    client.search(query)
    collected = []
    for page_number in range(pages):
        client.get_page(page_number)
        collected.extend(client.get_links())
    distinct_links = list(set(collected))
    return distinct_links[:max_links]

def from_urls_to_kb(urls, verbose=False):
    """Build one KB from several article URLs.

    Each URL is turned into its own KB with ``from_url_to_kb`` and merged
    into a fresh ``KB``. URLs that raise ``ArticleException`` are skipped.

    Args:
        urls: sequence of article URLs.
        verbose: when True, print progress and skipped URLs.

    Returns:
        The merged ``KB``.
    """
    merged_kb = KB()
    if verbose:
        print(f"{len(urls)} links to visit")
    for link in urls:
        if verbose:
            print(f"Visiting {link}...")
        try:
            merged_kb.merge_with_kb(from_url_to_kb(link))
        except ArticleException:
            if verbose:
                print(f"  Couldn't download article at url {link}")
    return merged_kb

In [None]:
import pickle

def save_kb(kb, filename):
    """Pickle ``kb`` into ``filename`` (opened for binary writing)."""
    with open(filename, "wb") as out_file:
        pickle.Pickler(out_file).dump(kb)

def load_kb(filename):
    """Return the object unpickled from ``filename``.

    Unpickling can execute arbitrary code, so only load files you trust.
    """
    with open(filename, "rb") as in_file:
        return pickle.load(in_file)

In [None]:
def from_urls_to_kb_new(urls, verbose=False, kb=None):
    """Merge the KBs extracted from ``urls`` into an existing KB.

    Args:
        urls: sequence of article URLs.
        verbose: when True, print progress and skipped URLs.
        kb: KB to extend, e.g. ``load_kb("/content/drive/MyDrive/kb0_70")``
            to resume from a saved checkpoint. When omitted, a notebook-level
            ``kb`` variable is used if one exists (the behaviour this function
            previously relied on implicitly); otherwise a new ``KB`` is
            created, so the function no longer fails with ``NameError`` on a
            fresh kernel.

    Returns:
        The KB that was extended. It is modified in place.
    """
    if kb is None:
        # Legacy fallback: earlier runs depended on a global `kb` left behind
        # by another cell. Prefer passing `kb=` explicitly.
        kb = globals().get("kb")
    if kb is None:
        kb = KB()
    if verbose:
        print(f"{len(urls)} links to visit")
    for url in urls:
        if verbose:
            print(f"Visiting {url}...")
        try:
            kb_url = from_url_to_kb(url)
            kb.merge_with_kb(kb_url)
        except ArticleException:
            if verbose:
                print(f"  Couldn't download article at url {url}")
    return kb

urls_list_new = urls_list[70:75]

kb_new = from_urls_to_kb_new(urls_list_new, verbose=True)
kb_new.print()

# kb = from_urls_to_kb(urls_list_1, verbose=True)
# kb.print()


5 links to visit
Visiting https://www.youtube.com/watch?v=XFIKE34HafY...
Visiting https://nlp.stanford.edu/IR-book/pdf/11prob.pdf...
Visiting https://towardsdatascience.com/how-to-build-a-smart-search-engine-a86fca0d0795...
Visiting https://nlp.stanford.edu/IR-book/html/htmledition/language-models-for-information-retrieval-1.html...
Visiting https://www.youtube.com/watch?v=dfCrWzyUq1U...
Entities:
  ('B', {'url': 'https://en.wikipedia.org/wiki/B', 'summary': 'B, or b, is the second letter of the Latin-script alphabet, used in the modern English alphabet, the alphabets of other western European languages and others worldwide. Its name in English is bee (pronounced ), plural bees. It represents the voiced bilabial stop in many languages, including English. In some other languages, it is used to represent other bilabial consonants.\n\n'})
  ('Elasticsearch', {'url': 'https://en.wikipedia.org/wiki/Elasticsearch', 'summary': 'Elasticsearch is a search engine based on the Lucene library. It 

In [None]:
save_kb(kb_new,"/content/drive/MyDrive/kb0_75")