In [1]:
!pip install transformers


Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/81/89/f07e7a884072ad37b1b6b1578637ab36152e0251d74abb950d967a59904e/transformers-4.3.1-py3-none-any.whl (1.8MB)
[K     |████████████████████████████████| 1.8MB 8.5MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/fd/5b/44baae602e0a30bcc53fbdbc60bd940c15e143d252d658dfdefce736ece5/tokenizers-0.10.1-cp36-cp36m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 43.4MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 41.4MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=432e38643a2

In [2]:
# Mount Google Drive at /content/drive; the fine-tuned QA model is later loaded
# from a directory under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


Importing required modules


In [3]:
import itertools
import os
from gensim.summarization.bm25 import BM25
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, QuestionAnsweringPipeline
import json
import spacy

In [4]:
class PassageRetrieval:
    """Rank text passages against a question with BM25 over lemmatized tokens.

    Typical use: construct with a tokenizer pipeline, call ``fit`` once with
    the passages to index, then call ``most_similar`` per question.
    """

    def __init__(self, nlp):
        """Build the lemmatizing tokenizer.

        Args:
            nlp: Callable that takes a string and returns an iterable of
                tokens, each exposing a ``lemma_`` attribute.
        """
        self.tokenize = lambda text: [token.lemma_ for token in nlp(text)]
        self.bm25 = None      # BM25 index, created by fit()
        self.passages = None  # passages in the same order as the BM25 corpus

    def preprocess(self, doc):
        """Split ``doc`` into passages.

        Args:
            doc: A string with passages separated by newlines.

        Returns:
            The non-empty lines of ``doc`` that do not start with ``'='``.
        """
        return [p for p in doc.split('\n') if p and not p.startswith('=')]

    def fit(self, docs):
        """Index ``docs`` for retrieval.

        Args:
            docs: Iterable of passage strings. Each one is indexed verbatim;
                ``preprocess`` is not applied here.
        """
        # Index the argument itself. Previously this method read a
        # module-level variable named `passages`, which silently ignored
        # `docs` and raised NameError whenever that global was not defined.
        passages = list(docs)
        corpus = [self.tokenize(p) for p in passages]
        self.bm25 = BM25(corpus)
        self.passages = passages

    def most_similar(self, question, topn=10):
        """Return the ``topn`` indexed passages with the highest BM25 score.

        Args:
            question: Query string; tokenized the same way as the passages.
            topn: Maximum number of passages to return.

        Returns:
            List of passage strings, best score first. Equal scores are
            ordered by descending passage index.

        Raises:
            RuntimeError: If ``fit`` has not been called yet.
        """
        if self.bm25 is None:
            raise RuntimeError("PassageRetrieval.fit() must be called before most_similar()")
        tokens = self.tokenize(question)
        idf = self.bm25.idf
        average_idf = sum(float(value) for value in idf.values()) / len(idf)
        # NOTE(review): the two-argument get_scores() call matches the gensim
        # release this notebook was run with; other releases may differ — confirm
        # before upgrading gensim.
        scores = self.bm25.get_scores(tokens, average_idf)
        # Sorting (score, index) tuples in reverse keeps the original tie order.
        pairs = sorted(((s, i) for i, s in enumerate(scores)), reverse=True)
        return [self.passages[i] for _, i in pairs[:topn]]

In [5]:
class bert:
  """Question-answering helper built on a transformers QuestionAnsweringPipeline."""

  # Directory the model weights are loaded from when no model_path is given.
  DEFAULT_MODEL_PATH = "/content/drive/My Drive/bert_model/bert_policy_documents"

  def __init__(self, model, model_path=None):
    """Load the tokenizer and model, and wire them into a QA pipeline.

    Args:
      model: Name or path passed to ``AutoTokenizer.from_pretrained``.
      model_path: Name or path passed to
        ``AutoModelForQuestionAnswering.from_pretrained``. Defaults to
        ``DEFAULT_MODEL_PATH``, the location that used to be hard-coded here.
    """
    if model_path is None:
      model_path = self.DEFAULT_MODEL_PATH
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.model = AutoModelForQuestionAnswering.from_pretrained(model_path)
    self.bert = QuestionAnsweringPipeline(model = self.model, tokenizer = self.tokenizer)

  def evaluateAnswer(self, question, sentence):
    """Run the QA pipeline on one question/context pair.

    Args:
      question: The question text.
      sentence: The context text the answer is extracted from.

    Returns:
      Whatever the pipeline returns for this pair; the calling code reads
      its ``"answer"`` entry.
    """
    return self.bert(question = question, context = sentence)

In [6]:
def completeAnswer(answer, para):
  """Return the first passage in ``para`` that contains ``answer``.

  Args:
    answer: Substring to look for.
    para: Iterable of passage strings, searched in order.

  Returns:
    The first matching passage, or None when no passage contains ``answer``.
  """
  return next((passage for passage in para if answer in passage), None)

In [7]:
def getPassages(filename):
    """Read paragraph texts from a graph JSON file.

    Args:
        filename: Path to a JSON file whose top-level object has
            ``["vertices"]["paragraphs"]``, a list of objects that each
            carry a ``"text"`` entry.

    Returns:
        List of the ``"text"`` values, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If the expected keys are missing.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform default,
    # so non-ASCII text loads the same way on every machine.
    with open(filename, encoding="utf-8") as file:
        data = json.load(file)
    return [paragraph["text"] for paragraph in data["vertices"]["paragraphs"]]

In [8]:
# Load the paragraph texts to search over; the path is relative to the
# notebook's working directory.
passages = getPassages("handbook_graph.json")

In [10]:
# spaCy model to load; override with the SPACY_MODEL environment variable.
SPACY_MODEL = os.getenv("SPACY_MODEL", "en_core_web_sm")
# Load the model without the pipeline components listed in `disable`.
nlp = spacy.load(
    SPACY_MODEL,
    disable=["ner", "parser", "textcat"],
)

# Index the loaded passages for retrieval, then load the QA model.
retreivePassage = PassageRetrieval(nlp)
retreivePassage.fit(passages)
bertModel = bert("deepset/bert-base-cased-squad2")

# questions = [
#              "How do I calculate cgpa",
#              "What is the normal load for UG students",
#              "If I fail a course and take it again in the later semester, will my earlier course with F grade be removed from the transcript",
#             " what is the process of registration?",
#             "how many seats are there in cse for admission?",
#              " what is the admission criteria for btech",
#              "I am in 1st year. Can I take overload?",
#              "I am in 2nd year. Can I take overload?",
#              "what happens if I miss the endsem because of a medical reason?",
#              "what happens if I fail a course?",
#              " what happens if I get an F grade in a course?",
#              "How can I calculate sgpa",
#              "What if I pass all my semesters",
#              "What about canteen",
#              "Will I get hostel",
#              "I dont know anything about IIIT",
#              "Who was abraham lincoln",
#              "Can i take 8 credits of online courses in a semester",
#              "how many credits do i need to graduate",
#              "how is my semester graded",
#              "what if I do more than 156 credits in my btech course"
# ]
# Questions sent through the retrieval + QA loop below, in this order.
questions = [
    "can I take up internships during a semester?",
    "what is the i grade",
    "can I replace a core course on getting an F grade?",
    "how can I get the grade given to me in a course changed?",
    "how will my cgpa be computed if I do more than 156 credits?",
    "is there any rule for attendance?",
    "how can I apply for a semester leave?",
    "how can I apply for branch transfer from ece to cse",
    "what is the minimum credit requirement for graduation?",
    "what are the requirements to get an honours degree?",
    "when is the convocation held?",
]
for q in questions:
  # Fetch the top six passages for this question and glue them into one
  # context string (each passage followed by a single space).
  topAnswer = retreivePassage.most_similar(q, topn=6)
  sentence = "".join(passage + " " for passage in topAnswer)
  ans = bertModel.evaluateAnswer(q, sentence)
  # Context size in whitespace-separated words.
  print(len(sentence.split()))
  print("Q:", q)
  # Report the whole retrieved passage that contains the extracted answer.
  print("Ans:", completeAnswer(ans["answer"], topAnswer))
  print("---------------------")

533
Q: can I take up internships during a semester?
Ans: Registering in the summer term is optional. A student may register for up to 6 credits of courses that are offered during the summer term (inclusive of SG or CW credits). Out of to 6 credits of courses that are offered during the summer term (inclusive of SG or CW credits), for students who have completed the 2nd year of to 6 credits of courses that are offered during the summer term (inclusive of SG or CW credits) btech program, may be allowed to take up to 4 credits for IP or IS or UR or BTP. An exception may be made for the students of the graduating batch, who may be allowed to take up to 8 credits. The registration shall be done just before the start of the term. There will not be any late registration in the summer term and a student shall not be allowed to add a course after registration. 
---------------------
423
Q: what is the i grade
Ans: The F and X grades are fail grades and the student shall be required to repeat th