## Step 1. Import and connect to drive

In [None]:
# Mount Google Drive at /content/gdrive; later cells read and write files under that path.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Pin spaCy to 2.1.0 and download its English model (loaded later with spacy.load('en')).
!pip install -U spacy==2.1.0 
!python -m spacy download en
# Reinstall neuralcoref from source (--no-binary) rather than from a prebuilt wheel;
# presumably so it is compiled against the spaCy version pinned above -- TODO confirm.
!pip install Cython --install-option="--no-cython-compile"
!pip uninstall -y neuralcoref 
!pip install neuralcoref --no-binary neuralcoref
# Shallow-clone OpenNRE, install its requirements, then install the package in editable mode.
!git clone https://github.com/thunlp/OpenNRE.git --depth 1
!pip install -r OpenNRE/requirements.txt
!pip install -e OpenNRE/.

Collecting spacy==2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/78/0f/ca790def675011f25bce8775cf9002b5085cd2288f85e891f70b32c18752/spacy-2.1.0-cp37-cp37m-manylinux1_x86_64.whl (27.7MB)
[K     |████████████████████████████████| 27.7MB 131kB/s 
[?25hCollecting preshed<2.1.0,>=2.0.1
[?25l  Downloading https://files.pythonhosted.org/packages/bc/2b/3ecd5d90d2d6fd39fbc520de7d80db5d74defdc2d7c2e15531d9cc3498c7/preshed-2.0.1-cp37-cp37m-manylinux1_x86_64.whl (82kB)
[K     |████████████████████████████████| 92kB 11.2MB/s 
Collecting plac<1.0.0,>=0.9.6
  Downloading https://files.pythonhosted.org/packages/9e/9b/62c60d2f5bc135d2aa1d8c8a86aaf84edb719a59c7f11a4316259e61a298/plac-0.9.6-py2.py3-none-any.whl
Collecting blis<0.3.0,>=0.2.2
[?25l  Downloading https://files.pythonhosted.org/packages/fa/5f/47b7b29ad202b2210020e2f33bfb06d1db2abe0e709c2a84736e8a9d1bd5/blis-0.2.4-cp37-cp37m-manylinux1_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 41.3MB/s 
[?25hC

In [None]:
import re
import itertools
import json
import urllib
from string import punctuation
import nltk
import spacy
import neuralcoref
import opennre

## Step 2: Read data from Google Drive and use a regex to remove special characters

In [None]:
path = "/content/gdrive/MyDrive/Colab Notebooks/input/vn_tourism.txt"

In [None]:
# Read the whole corpus into memory as text.
# The encoding is explicit because the file contains non-ASCII characters
# (the cleanup step removes '©' and an emoji), so decoding must not depend on
# the platform's default locale.
with open(path, 'r', encoding='utf-8') as file:
  filedata = file.read()

In [None]:
# Replace special characters and newlines with a single space each.
# A raw string is used so that `\[` and `\]` reach the regex engine without
# triggering Python's invalid-escape-sequence warning; `\n` is still read as a
# newline by `re`. The set of matched characters is unchanged.
filedata = re.sub(r'[~@#$%^&*(){}\[\]`\n+©🙂]', ' ', filedata)

## Step 3: Initialize all models

In [None]:
cd /content/gdrive/MyDrive/Colab Notebooks/model

/content/gdrive/MyDrive/Colab Notebooks/model


In [None]:
# Wikidata class labels an annotation must carry to be kept by the entity-linking step.
ENTITY_TYPES = [
    "human",
    "person",
    "company",
    "enterprise",
    "business",
    "geographic region",
    "human settlement",
    "geographic entity",
    "territorial entity type",
    "organization",
]

# spaCy English pipeline, extended with the NeuralCoref component.
nlp = spacy.load('en')
neuralcoref.add_to_pipe(nlp)

# OpenNRE relation extractor -- VN Tourism data is quite similar to wiki data.
relation_model = opennre.get_model('wiki80_bert_softmax')

# NLTK 'punkt' tokenizer data (the corpus is later split with nltk.sent_tokenize).
nltk.download('punkt')

2021-06-10 10:36:16,298 - root - INFO - Loading BERT pre-trained checkpoint.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Step 4: Use neural coreference resolution to improve the text

In [None]:
def coref_resolution(text):
    """Return ``text`` with coreferent mentions replaced by their cluster's main mention.

    The text is parsed with the module-level ``nlp`` pipeline. A mention is
    rewritten only if it is not the cluster's main mention, its text differs
    from the main mention's text, and it shares no space-separated word with
    the main mention (this guards against nested coreference cases).
    """
    doc = nlp(text)
    # One output piece per token, each carrying its trailing whitespace.
    pieces = [tok.text_with_ws for tok in doc]

    for cluster in doc._.coref_clusters:
        main_mention = cluster.main
        main_text = main_mention.text
        main_words = set(main_text.split(' '))

        for mention in cluster:
            # The representative mention is left as it is.
            if mention == main_mention:
                continue
            # Same surface text: nothing to substitute.
            if mention.text == main_text:
                continue
            # Overlapping words indicate a nested mention; leave it untouched.
            if not main_words.isdisjoint(mention.text.split(' ')):
                continue

            # The first token of the mention becomes the main mention's text,
            # followed by the whitespace that trailed the mention's last token.
            pieces[mention.start] = main_text + doc[mention.end - 1].whitespace_
            # The remaining tokens of the mention are blanked out.
            blanks = mention.end - mention.start - 1
            pieces[mention.start + 1:mention.end] = [""] * blanks

    return "".join(pieces)

In [None]:
corpus = coref_resolution(filedata)

In [None]:
corpus

'Dive into Hanoi\'s pulsating city streets, and you\'ll capture the essence of Vietnamese life. The country\'s capital is a burgeoning economic center that still clings strongly to traditional culture, managing to be a showcase of both old and modern Vietnam. The old quarter district hums with street vendor action; the cafés and restaurants are vibrant, contemporary scenes; and just trying to cross a road here can end up being an adrenaline-fueled escapade. When the crowds begin to wear you down, Hanoi has a bundle of places to visit where you can escape for some peace. Hoan Kiem Lake is a relaxing respite right within the city, while the Temple of Literature and Vietnam Museum of Ethnology provide plenty of opportunities to reflect on Vietnam\'s grand history. For more ideas on things to do, see our list of the top attractions in Hanoi. See also: Where to Stay in Hanoi Note: Some businesses may be temporarily closed due to recent global health and safety issues.   For many visitors to

## Step 5: Named Entity Linking

In [None]:
def wikifier(text, lang="en", threshold=0.8):
    """Fetch entity-linking annotations for ``text`` from the Wikifier web API.

    Args:
        text: Text to annotate.
        lang: Language code sent as the ``lang`` request parameter.
        threshold: Value sent as the ``pageRankSqThreshold`` request parameter.

    Returns:
        A list with one dict per kept annotation, holding the keys ``title``,
        ``wikiId``, ``label`` (``'Person'``, ``'Organization'``, ``'Location'``
        or ``None``) and ``characters`` (a list of ``(chFrom, chTo)`` tuples
        taken from the annotation's ``support`` entries). An annotation is kept
        only if at least one of its Wikidata class labels is in ``ENTITY_TYPES``.

    Raises:
        urllib.error.URLError: If the HTTP request fails or times out.
        KeyError: If the response lacks a field this function reads
            (e.g. ``annotations``).
    """
    # A bare `import urllib` does not guarantee that the `parse` and `request`
    # submodules are loaded, so import them explicitly here.
    import os
    import urllib.parse
    import urllib.request

    # NOTE(review): an API key is committed in this notebook. Prefer supplying it
    # through the WIKIFIER_USER_KEY environment variable; the literal is kept
    # only as a fallback so existing runs keep working.
    user_key = os.environ.get("WIKIFIER_USER_KEY", "tgbdmkpmkluegqfbawcwjywieevmza")

    # Output labels in priority order: the first group that matches wins.
    label_groups = (
        ("Person", {"human", "person"}),
        ("Organization", {"company", "enterprise", "business", "organization"}),
        ("Location", {"geographic region", "human settlement",
                      "geographic entity", "territorial entity type"}),
    )

    # Prepare the request payload.
    data = urllib.parse.urlencode([
        ("text", text), ("lang", lang),
        ("userKey", user_key),
        ("pageRankSqThreshold", "%g" % threshold),
        ("applyPageRankSqThreshold", "true"),
        ("nTopDfValuesToIgnore", "100"), ("nWordsToIgnoreFromList", "100"),
        ("wikiDataClasses", "true"), ("wikiDataClassIds", "false"),
        ("support", "true"), ("ranges", "false"), ("minLinkFrequency", "2"),
        ("includeCosines", "false"), ("maxMentionEntropy", "3")
    ])
    url = "http://www.wikifier.org/annotate-article"

    # Call the Wikifier and decode the JSON response.
    req = urllib.request.Request(url, data=data.encode("utf8"), method="POST")
    with urllib.request.urlopen(req, timeout=60) as f:
        response = json.loads(f.read().decode("utf8"))

    results = []
    for annotation in response["annotations"]:
        # Collect the class labels once instead of rescanning the list per check.
        class_labels = {el['enLabel'] for el in annotation.get('wikiDataClasses', [])}

        # Keep only annotations belonging to a desired entity class.
        if class_labels.isdisjoint(ENTITY_TYPES):
            continue

        label = next(
            (name for name, classes in label_groups if class_labels & classes),
            None,
        )

        results.append({
            'title': annotation['title'],
            'wikiId': annotation['wikiDataItemId'],
            'label': label,
            'characters': [(el['chFrom'], el['chTo']) for el in annotation['support']],
        })
    return results

## Step 6: Relationship extraction

In [None]:
def strip_punctuation(s):
    """Return ``s`` with every character found in ``string.punctuation`` dropped."""
    kept = []
    for ch in s:
        if ch in punctuation:
            continue
        kept.append(ch)
    return ''.join(kept)

In [None]:
def deduplicate_dict(d):
    """Remove duplicate dictionaries from ``d``, keeping first-seen order.

    Two dictionaries are duplicates when they hold the same key/value pairs,
    regardless of the order in which the keys were inserted. Values must be
    hashable.

    Args:
        d: Iterable of dictionaries.

    Returns:
        A list of new dictionaries, one per distinct input dictionary, in the
        order of first appearance (so the result is reproducible across runs).
    """
    unique = {}
    for item in d:
        # frozenset of the items is hashable and ignores key insertion order;
        # setdefault keeps the first occurrence.
        unique.setdefault(frozenset(item.items()), dict(item))
    return list(unique.values())

In [None]:
# relation_threshold: minimum score a predicted relation must exceed to be kept.
# entities_threshold: value passed to wikifier() as its `threshold` argument.
relation_threshold, entities_threshold = 0.4, 0.8

# Accumulators filled by the extraction loop.
relations_list, entities_list = [], []

In [None]:
# Build the knowledge graph sentence by sentence: link entities with wikifier(),
# then score every ordered pair of linked entities with the relation model.
for sentence in nltk.sent_tokenize(corpus):
    sentence = strip_punctuation(sentence)
    if not sentence.strip():
        # Nothing is left to annotate (the sentence was only punctuation or
        # whitespace), so skip the network call.
        continue

    entities = wikifier(sentence, threshold=entities_threshold)
    entities_list.extend(
        {'title': el['title'], 'wikiId': el['wikiId'], 'label': el['label']}
        for el in entities
    )

    # Iterate over every ordered pair of distinct entities (head, tail).
    for head, tail in itertools.permutations(entities, 2):
        for head_start, head_end in head['characters']:
            for tail_start, tail_end in tail['characters']:
                # Relationship extraction with OpenNRE. One is added to each
                # end offset before it is passed as the span end.
                prediction = relation_model.infer({
                    'text': sentence,
                    'h': {'pos': [head_start, head_end + 1]},
                    't': {'pos': [tail_start, tail_end + 1]},
                })
                relation_type, score = prediction[0], prediction[1]
                if score > relation_threshold:
                    relations_list.append({
                        'source': head['title'],
                        'target': tail['title'],
                        'type': relation_type,
                    })

kg_data = {'entities': deduplicate_dict(entities_list), 'relations': deduplicate_dict(relations_list)}

In [None]:
len(kg_data['entities'])

292

In [None]:
len(kg_data['relations'])

1308

In [None]:
import pandas as pd

kg = pd.DataFrame(kg_data['relations'])

In [None]:
kg

Unnamed: 0,source,target,type
0,Russian Aircraft Corporation MiG,Soviet Union,operator
1,Atlantic Ocean,Phú Quốc,located on terrain feature
2,Côn Đảo,Phú Quốc,located on terrain feature
3,Phu Bai International Airport,Vietnam,country
4,Vietnamese đồng,Nguyễn dynasty,owned by
...,...,...,...
1303,Atlantic Ocean,Taoism,instance of
1304,Village,Town,located in the administrative territorial entity
1305,China,Mainland China,followed by
1306,BBC News,Central Highlands (Vietnam),has part


In [None]:
kg.to_csv("/content/gdrive/MyDrive/Colab Notebooks/input/TourismVN.csv")