In [1]:
# Connect to Google drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [2]:
import pandas as pd
import spacy
from spacy import displacy

In [3]:
path_to_project = "/content/gdrive/MyDrive/Nguyễn Đình Lâm-Project 2-Knowledge Graph/Covid-19 wiki"

In [4]:
# Read csv data from Google drive
covid19_df = pd.read_csv(path_to_project + "/input/covid19-wiki.csv")

In [5]:
covid19_df

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,page,text,link
0,0,0,COVID-19,Coronavirus disease 2019 (COVID-19) is a...,https://en.wikipedia.org/wiki/COVID-19
1,1,1,COVID-19 pandemic,"The COVID-19 pandemic, also known as the coro...",https://en.wikipedia.org/wiki/COVID-19 pandemic
2,2,2,COVID-19 vaccine,Vaccines (deployment) A COVID‑19 vaccine ...,https://en.wikipedia.org/wiki/COVID-19 vaccine
3,3,3,Severe acute respiratory syndrome coronavirus 2,Severe acute respiratory syndrome coronavirus...,https://en.wikipedia.org/wiki/Severe acute res...
4,4,4,COVID-19 misinformation,By industry The COVID-19 pandemic has res...,https://en.wikipedia.org/wiki/COVID-19 misinfo...
5,5,5,Social distancing,"In public health, social distancing, also cal...",https://en.wikipedia.org/wiki/Social distancing
6,6,6,Transmission (medicine),"In medicine, public health, and biology, trans...",https://en.wikipedia.org/wiki/Transmission (me...
7,7,7,Symptoms of COVID-19,"Symptoms of COVID-19 are variable, rangin...",https://en.wikipedia.org/wiki/Symptoms of COVI...
8,8,8,COVID-19 testing,Vaccines (deployment) COVID-19 testing i...,https://en.wikipedia.org/wiki/COVID-19 testing
9,9,9,Workplace hazard controls for COVID-19,Hazard controls for COVID-19 in workplace...,https://en.wikipedia.org/wiki/Workplace hazard...


In [6]:
covid19_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Unnamed: 0    12 non-null     int64 
 1   Unnamed: 0.1  12 non-null     int64 
 2   page          12 non-null     object
 3   text          12 non-null     object
 4   link          12 non-null     object
dtypes: int64(2), object(3)
memory usage: 608.0+ bytes


In [7]:
path_to_web_lg = path_to_project + "/model/en_core_web_lg-2.2.5/en_core_web_lg/en_core_web_lg-2.2.5"

In [8]:
# load spacy model
nlp = spacy.load(path_to_web_lg)

# TEST FOR FIRST PAGE

In [9]:
pandemic =  covid19_df.loc[0, 'text']

In [10]:
pandemic

'      Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2 (SARS-CoV-2). The first known case was identified in Wuhan, China in December 2019.  The disease has since spread worldwide, leading to an ongoing pandemic.  Symptoms of COVID-19 are variable, but often include fever,  cough, headache,  fatigue, breathing difficulties, and loss of smell and taste.    Symptoms may begin one to fourteen days after exposure to       Coronavirus disease 2019 (COVID-19). At least a third of people who are infected do not develop noticeable symptoms.  Of those people who develop noticeable symptoms enough to be classed as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% suffer critical symptoms (respiratory failure, shock, or multiorgan dysfunction).  Older people are at a higher risk of developing se

In [11]:
pandemic = nlp(pandemic)
sentences = [sent.string.strip() for sent in pandemic.sents]  # split text into sentences

In [12]:
len(sentences)

499

In [13]:
sentences

['Coronavirus disease 2019 (COVID-19) is a contagious disease caused by severe acute respiratory syndrome coronavirus 2',
 '(SARS-CoV-2).',
 'The first known case was identified in Wuhan, China in December 2019.',
 'The disease has since spread worldwide, leading to an ongoing pandemic.',
 'Symptoms of COVID-19 are variable, but often include fever,  cough, headache,  fatigue, breathing difficulties, and loss of smell and taste.',
 'Symptoms may begin one to fourteen days after exposure to       Coronavirus disease 2019 (COVID-19).',
 'At least a third of people who are infected do not develop noticeable symptoms.',
 'Of those people who develop noticeable symptoms enough to be classed as patients, most (81%) develop mild to moderate symptoms (up to mild pneumonia), while 14% develop severe symptoms (dyspnea, hypoxia, or more than 50% lung involvement on imaging), and 5% suffer critical symptoms (respiratory failure, shock, or multiorgan dysfunction).',
 'Older people are at a higher r

In [14]:
def refine_sents(sentences):
  for i in range(len(sentences)):
    sentences[i] = sentences[i].replace('\xa0', ' ')
    sentences[i] = ' '.join([word for word in sentences[i].split() if word != "" and word != " "])
    if len(sentences[i]) < 20: 
      if sentences[i].endswith("."):
        sentences[i-1] += " " +  sentences[i]
      else:
        sentences[i+1] = sentences[i] + " " + sentences[i+1]
  return sentences

In [15]:
sents = []
sentences = refine_sents(sentences)
for sentence in sentences:
  if len(sentence) > 20: 
    sents.append(sentence)
sentences = sents

In [16]:
spacy.explain("acl")

'clausal modifier of noun (adjectival clause)'

In [17]:
sentence1 = nlp(sentences[0])
displacy.render(sentence1, style="dep" , jupyter=True)

In [18]:
for tok in sentence1:
  print(tok.text, " ==== ", tok.dep_, " ====== ", tok.pos_)



In [19]:
displacy.render(sentence1, style="ent" , jupyter=True)

In [20]:
# modifier + prefix + sub/obj
def extract_triples(sentence):
  triples = []
  ent1 = "" # source entity, can have many dest entities
  cur_rel = ""
  
  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  sentence = nlp(sentence)
  iterable = iter(range(len(sentence)))

  for i in iterable:
    tok = sentence[i]

    if tok.dep_ != 'PUNCT':
      if tok.dep_ == 'compound':
        if prefix == "" and prv_tok_dep.endswith("mod") == False:
          modifier = ""
        prefix = tok.text
        if prv_tok_dep == 'compound':
          prefix = prv_tok_text + " " + prefix

      if tok.dep_.endswith("mod") and tok.pos_ != 'ADV': 
        if prv_tok_dep.endswith("mod"):
          modifier += " " +  tok.text
        else:
          modifier = tok.text

      if tok.pos_ == "AUX":
        cur_rel = tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = "" 

      if tok.pos_ == 'VERB':
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""
        cur_rel = tok.text        
        if tok.text.endswith('ed') == False and tok.text.endswith('ing') == False:
          cur_rel = tok.lemma_
        if i < len(sentence) - 1 and sentence[i+1].dep_ in ('prep', 'agent'):
          cur_rel += " " + sentence[i+1].text
          if i < len(sentence) - 1 : 
            iterable.__next__()
        elif [children for children in tok.children if children.dep_ == 'prep']:
          cur_rel += " " + [children for children in tok.children if children.dep_ == 'prep'][0].text
      
      if tok.dep_.find("subj") == True and tok.pos_ != 'DET':
        ent1 = modifier + " " + prefix + " " + tok.text
        ent1 = ' '.join([word for word in ent1.split() if nlp(word)[0].pos_ != 'PUNCT'])

        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      if tok.dep_.find("obj") == True:
        ent2 = modifier + " " + prefix + " " + tok.text
        if i < len(sentence) - 1 and sentence[i+1].dep_ != 'prep' and sentence[i+1].dep_ != 'PUNCT' and sentence[i+1].text in [children.text for children in tok.children]:
          ent2 += " " + sentence[i+1].text
          if i < len(sentence) -1 : 
            iterable.__next__()
        ent2 = ' '.join([word for word in ent2.split() if nlp(word)[0].pos_ != 'PUNCT'])
        if ent1.strip() and cur_rel.strip() and ent2.strip():
          triple = {
              "source": ent1.strip(),
              "relation": cur_rel.strip(),
              "destination": ent2.strip()
          }
          triples.append(triple)
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""
        
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text

  return triples


In [21]:
triple_list = []

In [22]:
for sentence in sentences:
  triple_list += extract_triples(sentence)

In [23]:
covid19 = pd.DataFrame(triple_list)

In [24]:
covid19

Unnamed: 0,source,relation,destination
0,Coronavirus disease,caused by,severe acute respiratory syndrome coronavirus 2
1,case,identified in,Wuhan
2,case,identified in,December 2019
3,disease,leading to,ongoing pandemic
4,Symptoms,include,fever
...,...,...,...
1592,Public Health England,reported in,year 2021
1593,Public Health England,reported in,date
1594,Public Health England,was,89 % sales
1595,Public Health England,was,Vitamin D


# All data

In [25]:
covid19_sentences = []

In [26]:
for i in range(covid19_df.shape[0]):
  text = nlp(covid19_df.loc[i, 'text'])
  sentences = [sent.string.strip() for sent in text.sents]
  sents = []
  sentences = refine_sents(sentences)
  for sentence in sentences:
    if len(sentence) > 20: 
      sents.append(sentence)
  sentences = sents
  covid19_sentences += sentences

In [27]:
len(covid19_sentences)

3489

In [28]:
covid19_triple = []

In [29]:
for sentence in covid19_sentences:
  covid19_triple += extract_triples(sentence)

In [30]:
len(covid19_triple)

11755

In [31]:
covid19_rdf = pd.DataFrame(covid19_triple)

In [32]:
covid19_rdf

Unnamed: 0,source,relation,destination
0,Coronavirus disease,caused by,severe acute respiratory syndrome coronavirus 2
1,case,identified in,Wuhan
2,case,identified in,December 2019
3,disease,leading to,ongoing pandemic
4,Symptoms,include,fever
...,...,...,...
11750,article,incorporate,text
11751,article,incorporate,free content work
11752,Text,Anticipating,impacts
11753,Text,Anticipating,COVID-19


In [33]:
path_to_save = "/content/gdrive/MyDrive/Nguyễn Đình Lâm-Project 2-Knowledge Graph/Covid-19 wiki/"

In [34]:
covid19_rdf.to_csv(path_to_save + "KG_triples.csv", index=True)

In [35]:
covid19_rdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11755 entries, 0 to 11754
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   source       11755 non-null  object
 1   relation     11755 non-null  object
 2   destination  11755 non-null  object
dtypes: object(3)
memory usage: 275.6+ KB
