Setting everything up

In [None]:
# mount to google drive
# Later cells read the corpus and the language-identification model from
# paths below 'drive/My Drive/...', so this cell has to run first.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime — confirm before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# necessary pip installs
# NOTE(review): no versions are pinned, so a fresh runtime may resolve to
# releases that differ from the ones this notebook was developed against;
# consider pinning (pkg==x.y.z) once a working set is known.
!pip install Whoosh
!pip install transformers
!pip install sentencepiece
# NOTE(review): the extras install below appears to overlap with the two
# installs above — TODO confirm whether all three lines are needed.
!pip install transformers[sentencepiece]
!pip install fasttext
!pip install pycountry


In [None]:
# import all necessary libraries
import codecs 
from IPython.core.display import display, HTML
from whoosh.index import * 
from whoosh.fields import *
from whoosh import qparser
import glob
import random
from transformers import BertForQuestionAnswering
import torch
from transformers import BertTokenizer
from transformers import pipeline
import fasttext
from pycountry import languages


In [None]:
# setup the data, that shall be used as a knowledge base
path='drive/My Drive/Colab Notebooks/nlp-appl2-project/data/' #set path to passages
corpora = 'full_qa_corpora.txt'
# for poster, use of different sets possible, more data, more computation, always tradeoff
# Optional cap on the number of passages to load. None loads the whole file
# (the previous behaviour); set e.g. 15000 for a faster, smaller knowledge base.
max_passages = None
passages=[]
# The encoding is stated explicitly so the result does not depend on the
# runtime's locale default.
with open(path+corpora, encoding='utf-8') as f:
    for line in f:
        if max_passages is not None and len(passages) >= max_passages:
            break
        # Each line is one passage; the trailing newline is kept as-is.
        passages.append(line)
# Number of passages actually loaded (kept under its original name).
count = len(passages)

Module 1: The Language Identification Module

In [None]:
class FastTextIdentifier(object):
    """Language identification on top of a fastText language-ID model.

    Predicted labels that pycountry cannot resolve as two-letter codes are
    discarded, and the best remaining candidate is returned.
    """

    # fastText labels for which `languages.get(alpha_2=...)` returns None
    # (see the cell that derives this list from the model's label set).
    FORBIDDEN_LANGUAGES = frozenset([
        'als', 'arz', 'ast', 'azb', 'bar', 'bcl', 'bh', 'bpy', 'bxr', 'cbk',
        'ceb', 'ckb', 'diq', 'dsb', 'dty', 'eml', 'frr', 'gom', 'hif', 'hsb',
        'ilo', 'jbo', 'krc', 'lez', 'lmo', 'lrc', 'mai', 'mhr', 'min', 'mrj',
        'mwl', 'myv', 'mzn', 'nah', 'nap', 'nds', 'new', 'pam', 'pfl', 'pms',
        'pnb', 'rue', 'sah', 'scn', 'sco', 'tyv', 'vec', 'vep', 'vls', 'war',
        'wuu', 'xal', 'xmf', 'yue',
    ])

    def __init__(self, file_path, k=5):
        """Load the fastText model at `file_path`; `k` is the number of
        candidates requested in verbose mode."""
        self.identifier = fasttext.load_model(file_path)
        self.k = k

    def predict_language(self, language_sample, verbose=False):
        """Identify the language of `language_sample`.

        Args:
            language_sample: text whose language should be identified.
            verbose: when True, request `self.k` candidates (instead of 2)
                and print every accepted candidate with its score.

        Returns:
            A tuple `(language_code, score, success)`. When every predicted
            label is filtered out, `(None, 0.0, False)` is returned instead
            of raising.
        """
        # fastText's predict() handles one line at a time and is expected to
        # reject text containing a newline, so newlines are flattened first.
        sample = language_sample.replace("\n", " ")
        predictions, pred_score = self.identifier.predict(sample, k=self.k if verbose else 2)

        # Keep each language paired with its own score so that filtering a
        # label cannot shift the scores onto the wrong language.
        candidates = []
        for label, score in zip(predictions, pred_score):
            code = label.split("__")[-1]
            if code not in self.FORBIDDEN_LANGUAGES:
                candidates.append((code, score))

        if not candidates:
            return None, 0.0, False

        if verbose:
            for order, (code, score) in enumerate(candidates, start=1):
                entry = languages.get(alpha_2=code)
                # Fall back to the raw code if pycountry has no entry for it.
                name = entry.name if entry is not None else code
                print("{order}. language_detected: {lang} wit a score of {score}".format(order=order, lang=name, score=score))

        best_code, best_score = candidates[0]
        return best_code, best_score, True


In [None]:
# Work out which fastText labels pycountry cannot resolve as a two-letter
# language code; these are the labels the identifier has to skip.
lang_string = "af als am an ar arz as ast av az azb ba bar bcl be bg bh bn bo bpy br bs bxr ca cbk ce ceb ckb co cs cv cy da de diq dsb dty dv el eml en eo es et eu fa fi fr frr fy ga gd gl gn gom gu gv he hi hif hr hsb ht hu hy ia id ie ilo io is it ja jbo jv ka kk km kn ko krc ku kv kw ky la lb lez li lmo lo lrc lt lv mai mg mhr min mk ml mn mr mrj ms mt mwl my myv mzn nah nap nds ne new nl nn no oc or os pa pam pfl pl pms pnb ps pt qu rm ro ru rue sa sah sc scn sco sd sh si sk sl so sq sr su sv sw ta te tg th tk tl tr tt tyv ug uk ur uz vec vep vi vls vo wa war wuu xal xmf yi yo yue zh"
lang_list = lang_string.split(" ")
# A label is "forbidden" when the alpha_2 lookup comes back empty.
forbidden_lang = [
    label for label in lang_list
    if languages.get(alpha_2=label) is None
]
print(forbidden_lang)


['als', 'arz', 'ast', 'azb', 'bar', 'bcl', 'bh', 'bpy', 'bxr', 'cbk', 'ceb', 'ckb', 'diq', 'dsb', 'dty', 'eml', 'frr', 'gom', 'hif', 'hsb', 'ilo', 'jbo', 'krc', 'lez', 'lmo', 'lrc', 'mai', 'mhr', 'min', 'mrj', 'mwl', 'myv', 'mzn', 'nah', 'nap', 'nds', 'new', 'pam', 'pfl', 'pms', 'pnb', 'rue', 'sah', 'scn', 'sco', 'tyv', 'vec', 'vep', 'vls', 'war', 'wuu', 'xal', 'xmf', 'yue']


In [None]:
# sanity check of language identification model
# Alternative model file; not used below.
alt_model_path = 'drive/My Drive/Colab Notebooks/nlp-appl2-project/models/lid.176.ftz'
model_path = 'drive/My Drive/Colab Notebooks/nlp-appl2-project/models/lid.176.bin' # big model works a lot better
fasttxt = FastTextIdentifier(model_path, k=6)
example1 = "Ea orain funtzionatzen duen."
example2 = "Hoffentlich klappt das jetzt"
example3 = "Je mange la baguette"
# Every example is checked exactly once; the earlier copy-pasted version ran
# example2 twice.
for sample in (example1, example2, example3):
    result_lang, score, success = fasttxt.predict_language(sample, verbose=True)
    print(result_lang, score)

Module 2: The two-way Translation Pipeline

In [None]:
class TwoWayTranslator(object):
    """Translate between a source language and English.

    Translation pipelines are created on first use per language and cached;
    languages for which loading failed are remembered so they are not
    retried.
    """

    # Message returned (together with False) when no model can be used.
    NO_MODEL_MESSAGE = "Translation was not possible, no Language Model exists for this Language"

    def __init__(self):
        self.language_list = []          # languages with both pipelines loaded
        self.unknown_language_list = []  # languages whose pipelines failed to load
        self.pipeline_dict = {}          # source language -> pipeline into English
        self.back_pipeline_dict = {}     # source language -> pipeline out of English


    def get_language_model(self, source_language):
        """Load and cache both translation pipelines for `source_language`.

        Returns:
            True when both directions were loaded, False otherwise (the
            language is then recorded in `unknown_language_list`).
        """
        # prepare parameters
        model = "Helsinki-NLP/opus-mt-{src}-{dst}".format(src = source_language, dst = "en")
        model_back = "Helsinki-NLP/opus-mt-{src}-{dst}".format(src = "en", dst = source_language)
        translation_direction = "translation_{src}_to_{dst}".format(src = source_language, dst = "en")
        translation_direction_back = "translation_{src}_to_{dst}".format(src = "en", dst = source_language)
        # try to get translation pipelines
        try:
            translator = pipeline(translation_direction, model=model, tokenizer=model)
            back_translator = pipeline(translation_direction_back, model=model_back, tokenizer=model_back)
        except Exception:
            # Only ordinary errors mark the language as unsupported;
            # KeyboardInterrupt / SystemExit are allowed to propagate instead
            # of being swallowed by a bare except or a return inside finally.
            self.unknown_language_list.append(source_language)
            return False

        self.pipeline_dict[source_language] = translator
        self.back_pipeline_dict[source_language] = back_translator
        self.language_list.append(source_language)
        return True


    def translate(self, text, source_language, to_english):
        """Translate `text` into English or back into `source_language`.

        Args:
            text: the text to translate.
            source_language: language code used to select the model pair.
            to_english: True translates source -> English, False translates
                English -> source.

        Returns:
            `(translated_text, True)` on success, or
            `(NO_MODEL_MESSAGE, False)` when no model is available.
        """
        if source_language in self.unknown_language_list:
            return self.NO_MODEL_MESSAGE, False
        if source_language not in self.language_list:
            if not self.get_language_model(source_language):
                return self.NO_MODEL_MESSAGE, False

        if to_english:
            translator = self.pipeline_dict[source_language]
        else:
            translator = self.back_pipeline_dict[source_language]
        return translator(text)[0]["translation_text"], True




        

In [None]:
# sanity check for translation pipeline
two_way_translator = TwoWayTranslator()
src = "de"
dst = "fr"
example1 = "Hoffentlich klappt das jetzt"

# source language -> English
output1, _ = two_way_translator.translate(example1, src, to_english=True)
print(output1)
# English -> dst; translate() returns a (text, success) tuple, which is
# printed as-is here. The earlier duplicate of this call, whose result was
# never used, has been removed.
output2 = two_way_translator.translate(output1, dst, to_english=False)
print(output2)
# dst -> English again
output3 = two_way_translator.translate(output2[0], dst, to_english=True)
print(output3)
# NOTE(review): "eus" presumably exercises the no-model failure path, while
# "eu" is the two-letter form of the same request — confirm.
output4 = two_way_translator.translate(output3[0], "eus", to_english=False)
print(output4)
output4 = two_way_translator.translate(output3[0], "eu", to_english=False)
print(output4)


Module 3: The QA System

In [None]:
# class definition of submodels
class IR(object):
    """Passage retrieval over a Whoosh index built from `passages`."""

    def __init__(self, passages):
        """Index every passage under its list position.

        Args:
            passages: sequence of passage strings; a hit's stored id is the
                passage's index in this sequence.
        """
        # Imported here because the notebook's import cell has no explicit
        # `import os`; the name was presumably only reachable through the
        # star imports.
        import os

        schema = Schema(id = ID(stored=True,unique=True),
                        text = TEXT(analyzer=analysis.StemmingAnalyzer())
                        )
        self.passages = passages
        os.makedirs("index", exist_ok=True)
        ix = create_in("index", schema)
        writer = ix.writer()
        for ind, passage_text in enumerate(self.passages):
            writer.add_document(id=str(ind), text=passage_text)
        writer.commit()
        self.ix = ix

    def retrieve_documents(self, query, topk):
        """Return the `topk` best-matching passages for `query`.

        Args:
            query: query string, parsed against the "text" field with OR
                grouping.
            topk: maximum number of hits to return.

        Returns:
            `(texts, scores)`: two parallel lists in the order the search
            returned the hits.
        """
        scores = []
        text = []
        q = qparser.QueryParser("text", self.ix.schema, group=qparser.OrGroup).parse(query)
        # Use the context-managed searcher only (the old code opened a second
        # one that was never closed) and read the hits while it is still open.
        with self.ix.searcher() as searcher:
            for hit in searcher.search(q, limit=topk):
                scores.append(hit.score)
                text.append(self.passages[int(hit['id'])])
        return text, scores

class QA(object):
    """Extractive question answering with a SQuAD-fine-tuned BERT model."""

    MODEL_NAME = 'bert-large-uncased-whole-word-masking-finetuned-squad'

    def __init__(self):
        self.model = BertForQuestionAnswering.from_pretrained(self.MODEL_NAME)
        self.tokenizer = BertTokenizer.from_pretrained(self.MODEL_NAME)

    def answer_question(self, query, passage, max_length=512):
        """Extract the best-scoring answer span for `query` from `passage`.

        Args:
            query: the question text.
            passage: the text the answer is extracted from.
            max_length: upper bound on the encoded sequence length; the
                passage (not the question) is truncated to fit.

        Returns:
            `(answer, score)` where `answer` has the form `Answer: "<text>"`
            and `score` is half the sum of the chosen start and end logits.
            When the passage contributes no tokens, `('Answer: ""', -inf)`
            is returned.
        """
        # Truncate the passage so long passages cannot exceed the model's
        # maximum sequence length.
        input_ids = self.tokenizer.encode(query, passage,
                                          truncation="only_second",
                                          max_length=max_length)
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        sep_index = input_ids.index(self.tokenizer.sep_token_id)
        num_seg_a = sep_index + 1
        num_seg_b = len(input_ids) - num_seg_a
        segment_ids = [0]*num_seg_a + [1]*num_seg_b

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = self.model(torch.tensor([input_ids]),
                                 token_type_ids=torch.tensor([segment_ids]),
                                 return_dict=True)
        start_scores = outputs.start_logits[0]
        end_scores = outputs.end_logits[0]

        # Candidate positions are the passage tokens, excluding the final
        # token (the closing separator), exactly as the former nested loop did.
        last = len(input_ids) - 1
        cand_start = start_scores[num_seg_a:last]
        cand_end = end_scores[num_seg_a:last]
        if cand_start.numel() == 0:
            return 'Answer: ""', float('-inf')

        # get best combination of indices: span_scores[i, j] = start[i] + end[j],
        # restricted to spans whose end is not before their start.
        span_scores = cand_start.unsqueeze(1) + cand_end.unsqueeze(0)
        positions = torch.arange(cand_start.numel())
        valid = positions.unsqueeze(1) <= positions.unsqueeze(0)
        span_scores = span_scores.masked_fill(~valid, float('-inf'))
        max_score = span_scores.max()
        # nonzero() lists matches in row-major order; taking the last one keeps
        # the old loop's tie-breaking (`>=` let later spans win).
        row, col = (span_scores == max_score).nonzero()[-1].tolist()
        answer_start = num_seg_a + row
        answer_end = num_seg_a + col

        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == '##':
                # word-piece continuation: glue onto the previous token
                answer += tokens[i][2:]
            elif tokens[i] == '[SEP]':
                # filter out remaining separators
                continue
            else:
                answer += ' ' + tokens[i]

        return 'Answer: "' + answer + '"', max_score.item()/2

In [None]:
# Wrapper class for IR and QA modules
class QA_pipeline(object):
    """Retrieve passages for a question and extract an answer from the best one."""

    def __init__(self, passages):
        self.ir = IR(passages)
        self.qa = QA()

    def answer_question(self, query, topk_docs=3):
        """Answer `query` from the top-ranked retrieved passage.

        Args:
            query: the question text.
            topk_docs: number of passages requested from the retriever.

        Returns:
            The `(answer, score)` tuple produced by the QA model, or a
            `(message, -inf)` tuple when retrieval returns no passage.
        """
        retrieved, _scores = self.ir.retrieve_documents(query, topk_docs)
        if not retrieved:
            # Nothing matched the query; report it instead of raising IndexError.
            return "No matching passage was found for this question", float('-inf')
        # NOTE(review): only the top-ranked passage is passed to the QA model;
        # the remaining retrieved passages are currently unused.
        return self.qa.answer_question(query, retrieved[0])

In [None]:
#sanity check
# Builds the retrieval index over all loaded passages and loads the QA model,
# then prints the (answer, score) tuple returned for one example question.
test_wrapper = QA_pipeline(passages)
query='Range of incubation periods for the disease in humans'
print(test_wrapper.answer_question(query, 3))

Bringing it all together: The final Pipeline

In [None]:
# Wrapper class chaining language identification, translation and QA
class MT_QA_pipeline(object):
    """Answer questions asked in any supported language.

    The question's language is identified, the question is translated to
    English if needed, answered by the English QA pipeline, and the answer is
    translated back into the question's language.
    """

    # Default location of the language-identification model.
    DEFAULT_LID_MODEL_PATH = 'drive/My Drive/Colab Notebooks/nlp-appl2-project/models/lid.176.bin' # big model works a lot better
    # Returned when the language cannot be identified confidently enough.
    UNIDENTIFIED_MESSAGE = "Language could not be identified for certain, Please try a different language or mean of phrasing it"

    def __init__(self, passages, identification_threshold=0.70, lid_model_path=None):
        """Build all sub-modules.

        Args:
            passages: knowledge-base passages handed to the QA pipeline.
            identification_threshold: minimum identification score required
                to accept the detected language.
            lid_model_path: path to the language-identification model;
                defaults to `DEFAULT_LID_MODEL_PATH`.
        """
        assert passages is not None
        model_path = lid_model_path if lid_model_path is not None else self.DEFAULT_LID_MODEL_PATH
        self.fasttxt = FastTextIdentifier(model_path, k=6)
        self.qa_system = QA_pipeline(passages)
        self.two_way_translator = TwoWayTranslator()
        self.thresh = identification_threshold

    def answer_question(self, query, topk_docs=1, verbose=False):
        """Answer `query` in the language it was asked in.

        Args:
            query: the question, in any language.
            topk_docs: number of passages requested from retrieval.
            verbose: print intermediate results (detected languages, the
                English question and the English answer).

        Returns:
            The answer string, or an explanatory message when the language
            could not be identified or no translation model is available.
        """
        source_language, score, success = self.fasttxt.predict_language(query, verbose=verbose)
        # Honour the identifier's own success flag as well as the threshold.
        if not success or score < self.thresh:
            return self.UNIDENTIFIED_MESSAGE

        needs_translation = source_language != 'en'
        if needs_translation:
            english_question, success = self.two_way_translator.translate(query, source_language, to_english=True)
            if verbose:
                print(english_question)
            if not success:
                # On failure translate() returns the error message as its text.
                return english_question
        else:
            english_question = query

        english_answer, _ = self.qa_system.answer_question(english_question, topk_docs=topk_docs)
        if verbose:
            print(english_answer)
        if not needs_translation:
            return english_answer
        answer, _ = self.two_way_translator.translate(english_answer, source_language, to_english=False)
        return answer

In [None]:
# Build the complete multilingual QA pipeline once; the interactive cells
# below use this instance.
qa_mt_module=MT_QA_pipeline(passages)
# full passages take around 45 min to load



In [None]:
# Interactive loop: read questions from stdin and print the pipeline's
# answer for each one until the user types "stop".
while (question := input()) != "stop":
    print(qa_mt_module.answer_question(question))
# What are common symptoms of Covid19?

In [None]:
# Verbose interactive loop (5 retrieved passages, intermediate output printed).
# The qa_mt_module built earlier is reused: constructing a second
# MT_QA_pipeline here would re-index all passages and reload every model,
# which takes around 45 min on the full corpus.
while True:
    question= input()
    if question == "stop":
      break
    print(qa_mt_module.answer_question(question, 5, True))