In [None]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
# Load the small English spaCy pipeline once; later cells call `nlp(...)` on it.
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 
import networkx as nx
from tqdm import tqdm


In [None]:
# Load the FAQ table.
# NOTE(review): the absolute '/content/...' path is presumably a Colab upload
# location — adjust it when running the notebook elsewhere.
faq = pd.read_csv('/content/faq.csv')
# Bare last expression so the notebook displays (rows, columns).
faq.shape

(90, 3)

In [None]:
# Preview the first rows to check the question / answer / category columns.
faq.head()

Unnamed: 0,question,answer,category
0,What if I need to know my correct CGPA?,CGPA is seen on the grade card. The CGPA count...,grade card
1,I have cleared my backlog but the result sheet...,GPA and CGPA are not mentioned on backlog gra...,grade card
2,"I have cleared my backlog, but the result on E...",Grade card shows first attempt GPA and CGPA o...,grade card
3,What do I do if I have not received my grade c...,Grade cards should be collected from the stude...,grade card
4,My name/Father's Name/Mother's name is incorre...,Name correction on grade cards/PDC needs to b...,Corrections on grade card /PDC


In [None]:
# Parse the question at index 3 and list every token with its dependency label.
doc = nlp(faq['question'][3])
for tok in doc:
  print(f"{tok.text} --- {tok.dep_}")

What --- dobj
do --- aux
I --- nsubj
do --- ROOT
if --- mark
I --- nsubj
have --- aux
not --- neg
received --- advcl
my --- poss
grade --- compound
card --- dobj
? --- punct


In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Fetch the corpora used below; NLTK reports "already up-to-date" when present.
nltk.download('stopwords')
nltk.download('punkt')

t = word_tokenize(faq.question[1])    # all words

# Build the stop-word lookup once. Calling stopwords.words() inside the
# comprehension re-evaluated the whole list for every token.
stop_words = set(stopwords.words())
t_sw = [word for word in t if word not in stop_words]

# Drop punctuation tokens. A membership test is required: the former
# `word != pun` compared a string with the whole list, which is always True,
# so '.' and '?' were never removed.
pun = ['.', ',', '?']
t = [word for word in t_sw if word not in pun]


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Inspect the token list produced by the filtering cell above.
print(t)
# print(t_sw)

['I', 'cleared', 'backlog', 'result', 'sheet', 'show', 'GPA', 'CGPA', '.', 'Why', '?']


In [None]:
# NOTE(review): the recorded output (18) does not match the 11-item list printed
# by the previous cell, so this was presumably run against a different `t` —
# TODO re-run the notebook top to bottom and confirm.
len(t)

18

In [None]:
# NOTE(review): the recorded output is the unfiltered token list with '?'
# already removed, which suggests out-of-order execution — TODO confirm on a
# fresh kernel.
print(t)

['I', 'have', 'cleared', 'my', 'backlog', 'but', 'the', 'result', 'sheet', 'does', 'not', 'show', 'GPA', 'and', 'CGPA', '.', 'Why']


In [None]:
# Remove '?' tokens by rebuilding the list: unlike t.remove('?'), this does not
# raise ValueError when no '?' is left, so the cell can safely be re-run.
t = [word for word in t if word != '?']
len(t)

17

In [None]:
# Keep every token of the question at index 1 whose dependency label is not 'punct'.
st = [tok for tok in nlp(faq.question[1]) if tok.dep_ != 'punct']

In [None]:
# Display the spaCy tokens kept after dropping punctuation.
st

[I,
 have,
 cleared,
 my,
 backlog,
 but,
 the,
 result,
 sheet,
 does,
 not,
 show,
 GPA,
 and,
 CGPA,
 Why]

In [None]:
def get_entities(sent):
  """Extract a rough (subject, object) entity pair from a sentence.

  The sentence is parsed with the module-level spaCy pipeline ``nlp`` and its
  tokens are scanned left to right. Compound tokens and tokens whose
  dependency label ends in ``mod`` are remembered and prepended to the next
  subject or object that follows them.

  Args:
    sent: Sentence text to parse.

  Returns:
    A two-item list ``[ent1, ent2]``. ``ent1`` is built from the last token
    whose dependency label contains ``subj``, ``ent2`` from the last token
    whose label contains ``obj``. Parts are joined with single spaces; an
    entry is the empty string when no such token was found.
  """
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  for tok in nlp(sent):
    ## chunk 2
    # punctuation is ignored entirely: it neither updates state nor counts as
    # the "previous token"
    if tok.dep_ == "punct":
      continue

    # check: token is a compound word or not
    if tok.dep_ == "compound":
      prefix = tok.text
      # if the previous word was also a 'compound' then add the current word to it
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text

    # check: token is a modifier or not
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      # if the previous word was a 'compound' then add the current word to it
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text

    ## chunk 3
    # `in` instead of `find(...) == True`: find() returns an index, so the old
    # comparison only matched when the substring started at position 1.
    if "subj" in tok.dep_:
      # join only the non-empty parts so an empty prefix/modifier does not
      # leave double spaces inside the entity
      ent1 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      prefix = ""
      modifier = ""
      # prv_tok_dep / prv_tok_text are refreshed in chunk 5 below, so
      # resetting them here would have no effect

    ## chunk 4
    if "obj" in tok.dep_:
      ent2 = " ".join(part for part in (modifier, prefix, tok.text) if part)
      # reset so a later object does not inherit this one's prefix/modifier
      prefix = ""
      modifier = ""

    ## chunk 5
    # update variables
    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

**Chunk 1**

This chunk defines a few empty variables. prv_tok_dep and prv_tok_text will hold the dependency tag of the previous word in the sentence and that previous word itself, respectively. prefix and modifier will hold the text that is associated with the subject or the object.

**Chunk 2**

Next, we will loop through the tokens in the sentence. We will first check if the token is a punctuation mark or not. If yes, then we will ignore it and move on to the next token. If the token is a part of a compound word (dependency tag = “compound”), we will keep it in the prefix variable. A compound word is a combination of multiple words linked to form a word with a new meaning (example – “Football Stadium”, “animal lover”).

As and when we come across a subject or an object in the sentence, we will add this prefix to it. We will do the same thing with the modifier words, such as “nice shirt”, “big house”, etc.

**Chunk 3**

Here, if the token is the subject, then it will be captured as the first entity in the ent1 variable. Variables such as prefix, modifier, prv_tok_dep, and prv_tok_text will be reset.

**Chunk 4**

Here, if the token is the object, then it will be captured as the second entity in the ent2 variable. Variables such as prefix, modifier, prv_tok_dep, and prv_tok_text will again be reset.

**Chunk 5**

At the end of each iteration, for every token that is not punctuation, we update the previous token and its dependency tag so that the next token can look back at them.

Let’s test this function on the first ten questions:

In [None]:
# For each of the first ten questions, show the text, the extracted entity
# pair and a separator line.
for question in faq.question.head(10):
  print(question, get_entities(question), '---', sep='\n')

What if I need to know my correct CGPA?
['I', 'correct  CGPA']
---
I have cleared my backlog but the result sheet does not show GPA and CGPA. Why?
['result sheet', 'GPA']
---
I have cleared my backlog, but the result on ERP is not updated .It still shows earlier CGPA.As a result, it is affecting my CGPA of next trimesters also. How do I get it corrected?
['it', 'next  trimesters']
---
What do I do if I have not received my grade card?
['I', 'grade card']
---
My name/Father's Name/Mother's name is incorrect on grade card. How do I get it corrected?
['it', 'grade card']
---
How can I get a new copy of corrected grade cards?
['How  I', 'corrected grade cards']
---
From where should I collect the corrected grade cards/PDC?
['where  I', 'grade cards grade PDC']
---
In how many days will I get the corrected grade cards?
['days  I', 'corrected grade cards']
---
Do I need to submit an Exam form for the end term exam?
['I', 'end term exam']
---
I have an internal backlog, do I need to submit an

In [None]:
# Entity pair for each of the first ten questions, in order.
entity_pairs = [get_entities(question) for question in faq.question.head(10)]


In [None]:
# Display the collected [subject, object] pairs.
entity_pairs

[['I', 'correct  CGPA'],
 ['result sheet', 'GPA'],
 ['it', 'next  trimesters'],
 ['I', 'grade card'],
 ['it', 'grade card'],
 ['How  I', 'corrected grade cards'],
 ['where  I', 'grade cards grade PDC'],
 ['days  I', 'corrected grade cards'],
 ['I', 'end term exam'],
 ['internal  I', 'exam form']]

In [None]:
def get_relation(sent):
  """Return the relation phrase anchored on a ROOT token of the sentence.

  The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
  matched against a pattern made of a ROOT token optionally followed by a
  preposition, an agent and an adjective.

  Args:
    sent: Sentence text to parse.

  Returns:
    The text of the last match reported by the matcher, or an empty string
    when nothing matches (for example for empty input).
  """
  doc = nlp(sent)

  # Matcher class object
  matcher = Matcher(nlp.vocab)

  # define the pattern
  pattern = [{'DEP': 'ROOT'},
             {'DEP': 'prep', 'OP': "?"},
             {'DEP': 'agent', 'OP': "?"},
             {'POS': 'ADJ', 'OP': "?"}]

  # List-of-patterns form: accepted by spaCy >= 2.2.2 and the only form
  # supported by spaCy 3 (the old `add(name, None, pattern)` call fails there).
  matcher.add("matching_1", [pattern])

  matches = matcher(doc)
  if not matches:
    # indexing an empty match list used to raise IndexError
    return ""

  # each match is (match_id, start, end); keep the last one, as before
  _, start, end = matches[-1]
  return doc[start:end].text

In [None]:
# Print the extracted relation for each of the first ten questions.
for question in faq.question.head(10):
  relation = get_relation(question)
  print(relation)

What
Why
get
do
get
get
collect
get
need
need


In [None]:
# Relations for the first ten questions, with a tqdm progress bar.
relations = list(map(get_relation, tqdm(faq.question.head(10))))

100%|██████████| 10/10 [00:00<00:00, 59.79it/s]


In [None]:
# Count how often each extracted relation occurs.
pd.Series(relations).value_counts()

get        4
need       2
What       1
Why        1
do         1
collect    1
dtype: int64