In [1]:
# Connect to Google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
!pip install -U spacy==2.1.0 
!python -m spacy download en
!pip install Cython --install-option="--no-cython-compile"
!pip uninstall -y neuralcoref 
!pip install neuralcoref --no-binary neuralcoref
!python -m spacy download en_core_web_lg

Requirement already up-to-date: spacy==2.1.0 in /usr/local/lib/python3.7/dist-packages (2.1.0)
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')
  cmdoptions.check_install_build_global(options)
Uninstalling neuralcoref-4.0:
  Successfully uninstalled neuralcoref-4.0
Collecting neuralcoref
  Using cached https://files.pythonhosted.org/packages/0c/40/8db3db763077fe80b71859f57731261aeb03cc624635f97a3bcfe55ab37b/neuralcoref-4.0.tar.gz
Skipping wheel build for neuralcoref, due to binaries being disabled for it.
Installing collected packages: neuralcoref
    Running setup.py install for neuralcoref ... [?25l[?25hdone
Successfully installed neuralcoref-4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the model via sp

In [3]:
import re
import spacy
import neuralcoref

In [4]:
# Load SpaCy
nlp = spacy.load('en_core_web_lg')
# Add neural coref to SpaCy's pipe
neuralcoref.add_to_pipe(nlp)

<spacy.lang.en.English at 0x7f179f4a2e90>

In [5]:
import pandas as pd

In [6]:
path = "/content/gdrive/MyDrive/Nguyễn Đình Lâm-Project 2-Knowledge Graph/Covid-19 wiki/input/"

In [9]:
covid19_df = pd.read_csv(path + "raw-covid19-wiki.csv")

In [10]:
covid19_df

Unnamed: 0.1,Unnamed: 0,page,text,link
0,0,COVID-19,\n\n\n\n\n\nCoronavirus disease 2019 (COVID-19...,https://en.wikipedia.org/wiki/COVID-19
1,1,COVID-19 pandemic,"\nThe COVID-19 pandemic, also known as the cor...",https://en.wikipedia.org/wiki/COVID-19 pandemic
2,2,COVID-19 vaccine,\n\n\n\nVaccines (deployment)\n\nA COVID‑19 va...,https://en.wikipedia.org/wiki/COVID-19 vaccine
3,3,Severe acute respiratory syndrome coronavirus 2,\nSevere acute respiratory syndrome coronaviru...,https://en.wikipedia.org/wiki/Severe acute res...
4,4,COVID-19 misinformation,\n\n\n\n\nBy industry\nThe COVID-19 pandemic h...,https://en.wikipedia.org/wiki/COVID-19 misinfo...
5,5,Social distancing,"\nIn public health, social distancing, also ca...",https://en.wikipedia.org/wiki/Social distancing
6,6,Transmission (medicine),"In medicine, public health, and biology, trans...",https://en.wikipedia.org/wiki/Transmission (me...
7,7,Symptoms of COVID-19,"\n\n\n\n\nSymptoms of COVID-19 are variable, r...",https://en.wikipedia.org/wiki/Symptoms of COVI...
8,8,COVID-19 testing,\n\n\n\n\nVaccines (deployment)\n\nCOVID-19 te...,https://en.wikipedia.org/wiki/COVID-19 testing
9,9,Workplace hazard controls for COVID-19,\n\n\n\n\nHazard controls for COVID-19 in work...,https://en.wikipedia.org/wiki/Workplace hazard...


In [11]:
def coref_resolution(text):
    """Function that executes coreference resolution on a given text"""
    doc = nlp(text)
    # fetches tokens with whitespaces from spacy document
    tok_list = list(token.text_with_ws for token in doc)
    for cluster in doc._.coref_clusters:
        # get tokens from representative cluster name
        cluster_main_words = set(cluster.main.text.split(' '))
        for coref in cluster:
            if coref != cluster.main:  # if coreference element is not the representative element of that cluster
                if coref.text != cluster.main.text and bool(set(coref.text.split(' ')).intersection(cluster_main_words)) == False:
                    # if coreference element text and representative element text are not equal and none of the coreference element words are in representative element. This was done to handle nested coreference scenarios
                    tok_list[coref.start] = cluster.main.text + \
                        doc[coref.end-1].whitespace_
                    for i in range(coref.start+1, coref.end):
                        tok_list[i] = ""

    return "".join(tok_list)

In [12]:
def preprocess(text, coref=True):
    # preprocess text
    text = text.replace('\n', ' ')
    text = re.sub(r'\[.*?\]+', ' ', text)
    text = re.sub(r'\n+', ' ', text)  # replace multiple newlines with period
    text = re.sub(r'\[\d+\]', ' ', text)  # remove reference numbers
    if coref:
        text = coref_resolution(text)  # resolve coreference clusters
    return text

In [13]:
covid19_df['text'] = covid19_df['text'].apply(preprocess)

In [14]:
test = covid19_df.loc[0, 'text']

In [15]:
test = nlp(test)

In [16]:
test._.coref_clusters

[      Coronavirus disease 2019 (COVID-19): [      Coronavirus disease 2019 (COVID-19), The disease],
 Symptoms of COVID-19: [Symptoms of COVID-19, Symptoms],
       Coronavirus disease 2019 (COVID-19): [      Coronavirus disease 2019 (COVID-19), the disease],
 airborne particles: [airborne particles, Those particles],
 Wuhan: [Wuhan, Wuhan, Wuhan, Wuhan],
 the WHO: [the WHO, the WHO],
 China: [China, China, China, China, China],
 COVID-19: [COVID-19, COVID-19, COVID-19],
 the people who are infected with the virus: [the people who are infected with the virus, the people who are infected with the virus],
 the virus: [the virus, the virus, the virus, the virus, the virus, the virus, the virus],
 the disease: [the disease, The disease, the disease],
 a delay between the moment a person first becomes infected and the appearance of the first symptoms: [a delay between the moment a person first becomes infected and the appearance of the first symptoms, The median delay for COVID-19],
 COVID

In [17]:
covid19_df.to_csv(path + "covid19-wiki.csv", index=True)

In [18]:
covid19_df

Unnamed: 0.1,Unnamed: 0,page,text,link
0,0,COVID-19,Coronavirus disease 2019 (COVID-19) is a...,https://en.wikipedia.org/wiki/COVID-19
1,1,COVID-19 pandemic,"The COVID-19 pandemic, also known as the coro...",https://en.wikipedia.org/wiki/COVID-19 pandemic
2,2,COVID-19 vaccine,Vaccines (deployment) A COVID‑19 vaccine ...,https://en.wikipedia.org/wiki/COVID-19 vaccine
3,3,Severe acute respiratory syndrome coronavirus 2,Severe acute respiratory syndrome coronavirus...,https://en.wikipedia.org/wiki/Severe acute res...
4,4,COVID-19 misinformation,By industry The COVID-19 pandemic has res...,https://en.wikipedia.org/wiki/COVID-19 misinfo...
5,5,Social distancing,"In public health, social distancing, also cal...",https://en.wikipedia.org/wiki/Social distancing
6,6,Transmission (medicine),"In medicine, public health, and biology, trans...",https://en.wikipedia.org/wiki/Transmission (me...
7,7,Symptoms of COVID-19,"Symptoms of COVID-19 are variable, rangin...",https://en.wikipedia.org/wiki/Symptoms of COVI...
8,8,COVID-19 testing,Vaccines (deployment) COVID-19 testing i...,https://en.wikipedia.org/wiki/COVID-19 testing
9,9,Workplace hazard controls for COVID-19,Hazard controls for COVID-19 in workplace...,https://en.wikipedia.org/wiki/Workplace hazard...
