<a href="https://colab.research.google.com/github/koyomin9zx/UITQA-Vietnamese-Question-Answering/blob/master/example_predict_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Fetch the project repository; it ships combine.py, which a later cell moves to /content and imports QA from.
!git clone https://github.com/koyomin9zx/UITQA-Vietnamese-Question-Answering.git

In [0]:
!pip install pytorch-transformers
!pip install underthesea 
!pip install unidecode
!pip install whoosh

In [0]:
# Mount Google Drive at /content/drive; the next cell links its "My Drive/data" folder, which holds the model and corpus.
from google.colab import drive
drive.mount('/content/drive')

In [0]:
!ln -s /content/drive/'My Drive'/data
!mv /content/UITQA-Vietnamese-Question-Answering/combine.py /content

In [0]:
# importing libraries 
from underthesea import word_tokenize, sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity 
from scipy.spatial import distance 
from collections import defaultdict, OrderedDict 
from string import punctuation
import unidecode
import re
import pandas as pd 
import numpy as np 
import glob
from combine import QA
import time
import sys
import os
from whoosh.fields import Schema, TEXT, STORED
from whoosh.index import create_in, open_dir
from whoosh.query import *
from whoosh.qparser import QueryParser
from underthesea import pos_tag

In [8]:
# Build the QA model (QA is imported from combine.py) and print how long loading took,
# in wall-clock seconds rounded to two decimals.
start = time.time()
model=QA('/content/data/BERT_Squad_WIki_UIT_pretrain') #path to model
end = time.time()
print("time load model: "+str(round((end - start),2)))

time load model: 28.6


In [0]:
def load_data(path):
    """Read every ``*.txt`` file directly inside ``path``.

    Args:
        path: Directory to scan (not recursive). Only names matching
            ``*.txt`` are read.

    Returns:
        list[str]: The UTF-8 decoded content of each file, ordered by file
        path. An empty list when nothing matches.
    """
    data = []
    # glob gives no ordering guarantee; sorting keeps document positions
    # (and anything derived from them) stable between runs.
    for file in sorted(glob.glob(path + "/*.txt")):
        # The context manager closes each handle as soon as it has been read.
        with open(file, "r", encoding='utf-8') as handle:
            data.append(handle.read())
    return data

def vi_tokenizer(row):
    """Run underthesea's ``word_tokenize`` on ``row`` with ``format="text"``.

    Args:
        row: Text to segment.

    Returns:
        Whatever ``word_tokenize`` returns for ``format="text"``.
    """
    segmented = word_tokenize(row, format="text")
    return segmented

def remove_stopwords(stopwords,text):
    """Drop the whitespace-separated tokens of ``text`` found in ``stopwords``.

    Args:
        stopwords: Container supporting ``in`` (a set is cheapest).
        text: Input string; it is split on any run of whitespace.

    Returns:
        str: The surviving tokens joined by single spaces, in original order.
    """
    kept_tokens = (token for token in text.split() if token not in stopwords)
    return ' '.join(kept_tokens)


def remove_punctuation(row):
    """Replace ASCII punctuation with spaces, squeeze whitespace and lowercase.

    Every character of ``string.punctuation`` except ``_`` becomes a space.
    The underscore is kept, presumably because word-segmented text joins the
    syllables of one word with ``_`` (TODO confirm against the tokenizer).

    Args:
        row: Input string.

    Returns:
        str: Lower-cased text with single spaces between tokens and no
        leading or trailing whitespace.
    """
    chars = punctuation.replace("_", "")
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # read as an escaped "]" and backslashes are never matched.
    pattern = "[{}]".format(re.escape(chars))
    row = re.sub(pattern, " ", row)
    # Collapse every whitespace run to one space; strip() then removes the
    # leading/trailing one.
    row = re.sub(r'\s+', ' ', row)
    return row.strip().lower()

def standardize_data(df,stopwords):
    """Clean each text in ``df`` for TF-IDF: strip punctuation and stopwords.

    For every row, all ``string.punctuation`` characters (including ``_``)
    are replaced by spaces, whitespace runs are collapsed, and tokens found
    in ``stopwords`` are removed. Case is left unchanged.

    Args:
        df: Iterable of strings (callers in this notebook pass lists).
        stopwords: Container of tokens to drop, forwarded to
            ``remove_stopwords``.

    Returns:
        list[str]: One cleaned string per input row, in input order.
    """
    # re.escape is required: unescaped, the "\]" inside string.punctuation is
    # read as an escaped "]" and backslashes are never matched.
    punct_re = re.compile("[{}]".format(re.escape(punctuation)))
    space_re = re.compile(r'\s+')
    hl_cleansed = []
    for row in df:
        row = punct_re.sub(" ", row)
        row = space_re.sub(' ', row).strip()
        hl_cleansed.append(remove_stopwords(stopwords, row))
    return hl_cleansed

def keywords_extraction(sent):
  """Keep the words of ``sent`` whose POS tag is neither ``'P'`` nor ``'CH'``.

  ``sent`` is tagged with underthesea's ``pos_tag``; each result item is read
  as ``(word, tag)``. ``'P'`` and ``'CH'`` are presumably the pronoun and
  punctuation tags of that tagset (TODO confirm).

  Returns:
      str: The kept words joined by single spaces, stripped at both ends.
  """
  kept = [item[0] for item in pos_tag(sent) if item[1] != 'P' and item[1] != 'CH']
  return ' '.join(kept).strip()


def sentences_tokenize(text):
    """Split ``text`` into sentences and normalise each one.

    Each sentence from ``sent_tokenize`` is word-segmented with
    ``word_tokenize(..., format='text')``, passed through
    ``remove_punctuation`` and lower-cased. Stopwords are not removed.

    Returns:
        list[str]: One normalised string per sentence, in document order.
    """
    normalised = []
    for sentence in sent_tokenize(text):
        segmented = word_tokenize(sentence, format='text')
        normalised.append(remove_punctuation(segmented).lower())
    return normalised

## Converting 3D array of array into 1D array 
def arr_convert_1d(arr):
    """Flatten a three-level nested array into one dimension.

    ``arr`` is converted with ``np.array`` and then concatenated along axis 0
    twice, each pass removing the outermost level of nesting.

    Returns:
        numpy.ndarray: The flattened values in their original order.
    """
    flattened = np.array(arr)
    for _ in range(2):
        flattened = np.concatenate(flattened, axis=0)
    return flattened
  
## Cosine Similarity 
def cosine(trans):
    """Return ``[cosine_similarity(trans[0], trans[1])]``.

    Only the first two rows of ``trans`` are compared; the similarity result
    is wrapped in a one-element list.
    """
    return [cosine_similarity(trans[0], trans[1])]

def tfidf(str1, str2,tf_idf_vetor,stopwords):
    """Score the similarity of two texts using an already fitted vectorizer.

    Both texts are cleaned with ``standardize_data``, transformed together by
    ``tf_idf_vetor.transform`` and compared with ``cosine``.

    Args:
        str1, str2: Raw texts to compare.
        tf_idf_vetor: Object exposing ``transform(list_of_str)``.
        stopwords: Stopword container forwarded to ``standardize_data``.

    Returns:
        The first element of the flattened ``cosine`` result, i.e. the
        similarity of ``str1`` and ``str2``.
    """
    cleaned_pair = [standardize_data([text], stopwords)[0] for text in (str1, str2)]
    vectors = tf_idf_vetor.transform(cleaned_pair)
    return arr_convert_1d(cosine(vectors))[0]

def relevance_ranking(query,data,vect,stopwords):
  """Rank the passages in ``data`` by TF-IDF similarity to ``query``.

  The query is cleaned with ``standardize_data`` and printed, then scored
  against every passage with ``tfidf``. Passages scoring exactly 0.0 are
  left out.

  Returns:
      OrderedDict: ``score -> passage``, highest score first.

  NOTE(review): results are keyed by score, so passages with an identical
  score overwrite each other and only the last one is kept.
  """
  query = standardize_data([query], stopwords)[0]
  print('Query: ',query,'\n')
  scored = {}
  for passage in data:
    similarity = tfidf(query, passage, vect, stopwords)
    if similarity == 0.0:
      continue
    scored[similarity] = passage
  return OrderedDict(sorted(scored.items(), reverse=True))

def whoosh_add_document(data):
  """Index the passages in ``data`` with Whoosh under the local ``index`` directory.

  The schema has a stored-only ``doc_id`` and a ``content`` TEXT field that is
  also stored. Passage ``k`` (0-based) is written with id ``doc_<k+1>``.

  Args:
      data: Sequence of passage strings.

  Returns:
      The index object obtained from ``open_dir("index")`` after the commit.
  """
  index_dir = "index"
  schema = Schema(doc_id=STORED, content=TEXT(stored=True))
  if not os.path.exists(index_dir):
    os.mkdir(index_dir)
  # create_in sets up the index files; the handle actually used is re-opened from disk.
  create_in(index_dir, schema)
  ix = open_dir(index_dir)
  writer = ix.writer()
  for position, passage in enumerate(data, start=1):
    writer.add_document(doc_id="doc_" + str(position), content=passage)
  writer.commit()
  return ix

In [0]:
# Raw passages: one string per .txt file of the corpus directory.
data=load_data('/content/data/data_QA/data')
# Read the stopword list with an explicit encoding (the words are Vietnamese) and
# close the file right after reading.
with open('/content/data/data_QA/stopwords/stopwords.txt', encoding='utf-8') as stopwords_file:
    # Entries are separated by single spaces; the final piece is dropped.
    # NOTE(review): presumably the file ends with a trailing separator - confirm,
    # otherwise the last stopword is lost.
    stopwords = set(stopwords_file.read().split(' ')[:-1])


In [259]:
# Fit the TF-IDF vocabulary on the cleaned passages; only the fitted vectorizer is kept.
data_standard=standardize_data(data,stopwords)
# Settings: n-grams of length 1 to 3, at most 8000 features, max_df=0.8, sublinear term frequency.
vect = TfidfVectorizer(min_df=1, max_df=0.8,max_features=8000,sublinear_tf=True,ngram_range=(1,3)) 
vect.fit(data_standard)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=8000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=True,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [281]:
query="Quang Trung sinh năm bao nhiêu "
# Reduce the question to its keywords before retrieval.
query_=keywords_extraction(query)

# Rank all passages against the keywords. The function defined in this notebook
# is relevance_ranking; calling any other name fails on a fresh kernel.
rs=relevance_ranking(query_,data,vect,stopwords)
i=0
# The keyword set does not change inside the loop, so build it once.
query_terms=set(query_.split())

for score,doc in rs.items():
  num_overlap=len(query_terms & set(doc.split()))
  # Cheap filter first: passages sharing at most one keyword are never
  # reported, so the expensive model call is skipped for them.
  if num_overlap<=1:
    continue
  answer = model.predict(doc,query)
  if answer['confidence']>0.1:
    print('\nQuestion: ',query)
    print('\nOverlap key word: ',num_overlap)
    print('\nAnswer: ',answer['answer'])
    print('\nIR Sccore: ',score)
    print('\nBert Score: ',answer['confidence'])
    print('\nContent: ',doc[:100],'...')
    print('\n==========================================\n\n\n')
    i+=1
    # Stop after five reported answers.
    if i==5:
      break

Query:  Quang Trung sinh năm 


Question:  Quang Trung sinh năm bao nhiêu 

Overlap key word:  3

Answer:  1783

IR Sccore:  0.10716875074371207

Bert Score:  0.9360089057989387

Content:  Nguyễn Quang Toản (sinh Qúi Mão 1783- mất Nhâm Tuất 1802)

Cảnh Thịnh Hoàng đế (Thời gian ở ngôi 179 ...





Question:  Quang Trung sinh năm bao nhiêu 

Overlap key word:  2

Answer:  (…- Tân Mão)

IR Sccore:  0.0382599755851343

Bert Score:  0.9219120462611116

Content:  Triệu Quang Phục (…- Tân Mão)

Danh tướng nhà Tiền Lý, con Thái phó Triệu Túc, quê ở Châu Biên, phủ  ...





Question:  Quang Trung sinh năm bao nhiêu 

Overlap key word:  2

Answer:  1911

IR Sccore:  0.022181279576233608

Bert Score:  0.9179584638692404

Content:  Võ Nguyên Giáp (sinh 1911)

Võ Nguyên Giáp (sinh 1911), nhà hoạt động nổi tiếng của Đảng Cộng sản và ...





Question:  Quang Trung sinh năm bao nhiêu 

Overlap key word:  3

Answer:  1890

IR Sccore:  0.018385125506151172

Bert Score:  0.9882463846256808

Content:  H