# Question Answering based on IR


# Requirement
 - Windows or Unix/Linux environment
 - Python 2.7
 - nltk
 - StanfordNERTagger : 
   - Download : http://nlp.stanford.edu/software/CRF-NER.shtml#Download
   - Install JDK version >= 8
   - Setting environment variables for Windows :
     - Environment variable for CLASSPATH : path\to\stanford-ner\stanford-ner.jar
     - Environment variable for STANFORD_MODELS : path\to\stanford-ner\classifiers
   - Setting environment variables for Unix : open a terminal, run `gedit ~/.bashrc`, then go to the end of the file and add :
     - export CLASSPATH=path/to/stanford-ner/stanford-ner.jar
     - export STANFORD_MODELS=path/to/stanford-ner/classifiers

 - sklearn
 - googleapiclient : pip install --upgrade google-api-python-client
 - BeautifulSoup
 - plotly

# Import library & Setting Parameter

In [None]:
import pickle
import nltk
from nltk import ne_chunk, pos_tag, word_tokenize
from nltk.tree import Tree
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import *
from googleapiclient.discovery import build
from bs4 import BeautifulSoup
import requests
import timeit
from collections import Counter
from nltk.tag import StanfordNERTagger
import platform
# Separator used when splitting the text resources into lines:
# '\n' on Windows, '\r\n' everywhere else.
# NOTE(review): presumably the data files are stored with CRLF line endings
# and Windows text mode already translates them - confirm.
OS = platform.system()
newline_character = '\n' if OS == 'Windows' else '\r\n'

# Google Custom Search credentials used by the "Get Relevant Document" step.
# NOTE(review): the API key and search-engine ID are hard-coded in the source;
# consider loading them from the environment instead of committing them.
Seach_api_key = "AIzaSyCYZt6vYMXhTn3dykAtVi6KrkQ1b30rd0c"    # Replace this key if the "Get Relevant Document" step raises an error
Custom_Search_Engine_ID = "005336700654283051786:1mzldt1husk"
num_pages = 30       # Number of search results to retrieve, fetched 10 per request: 10, 20, 30, 40, ...

# The CLASSPATH and STANFORD_MODELS environment variables must be set before running this code
ST3class = StanfordNERTagger('english.all.3class.distsim.crf.ser.gz')
ST7class = StanfordNERTagger('english.muc.7class.distsim.crf.ser.gz')

# Load the trained AnswerType detection model
 - If this code raises an error, please run ATD_Tranning.ipynb again in the AnswerTypeDetection directory

In [None]:
from ATD_Function import getAnswerTypeIndex,clean_str,getPOSTag,feature_extraction,token

# Load the answer-type labels and the trained answer-type detection artifacts.
answer_types = getAnswerTypeIndex('./AnswerTypeDetection/AnswerType.txt')

# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load .pkl files produced by the project's own training notebook.
# Each file is opened in a `with` block so the handle is closed after loading.
with open('./AnswerTypeDetection/ATD_Model.pkl','rb') as model_file:
    ATD_Model = pickle.load(model_file)
with open('./AnswerTypeDetection/Unigram_Vocabulary.pkl','rb') as vocab_file:
    Unigram_Vocabulary = pickle.load(vocab_file)
with open('./AnswerTypeDetection/UniPOS_List.pkl','rb') as pos_file:
    UniPOS_List = pickle.load(pos_file)

# Vectorizers restricted to the vocabularies saved at training time.
CV_Unigram = CountVectorizer(vocabulary = Unigram_Vocabulary,ngram_range = (1,1))
CV_UniPOS = CountVectorizer(vocabulary = UniPOS_List,ngram_range = (1,1),lowercase = False,tokenizer = token)

# One stopword per line; split with the platform-dependent separator.
with open('./Data/Stopwords.txt') as f:
    stopwords_list = f.read().split(newline_character)


# Load the entity lists and define the entity tagger

In [None]:
def stem(sent,stemmer,decode = True):
    """Lower-case *sent* and stem each of its space-separated words.

    Args:
        sent: the text to stem.
        stemmer: any object exposing ``stem(word)``.
        decode: when True, the lower-cased text is decoded from UTF-8 and
            stripped of surrounding whitespace before being split.

    Returns:
        The stemmed words joined by single spaces, or None when the text
        cannot be processed (e.g. it is not a string, fails to decode, or
        the stemmer raises).
    """
    try:
        sent = sent.lower()
        if decode:
            sent = sent.decode('utf-8').strip()
        return ' '.join([stemmer.stem(w) for w in sent.split(' ')])
    except Exception:
        # Best effort: bad input yields None. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        return None

def EntityTagger(sent,list_entity,stemmer):
    """Return the noun-phrase chunks of *sent* whose stemmed text is in *list_entity*.

    The sentence is lower-cased, tokenized and POS-tagged, then chunked with
    the grammar ``NP: {<DT>? <JJ>* <NN>*}``. Each NP chunk is stemmed with
    ``stem(..., decode=False)`` and kept when that stemmed form appears in
    *list_entity*.

    Args:
        sent: the sentence to tag.
        list_entity: collection of stemmed entity names to match against.
        stemmer: stemmer object forwarded to ``stem``.

    Returns:
        A list with the (lower-cased, unstemmed) text of every matching chunk.
    """
    tagged = pos_tag(word_tokenize(sent.lower()))
    chunker = nltk.RegexpParser("NP: {<DT>? <JJ>* <NN>*}")
    NPs = []
    for node in chunker.parse(tagged):
        # Only NP subtrees are candidates; plain (word, tag) leaves are skipped.
        if not (isinstance(node, Tree) and node.label() == "NP"):
            continue
        text = " ".join([word for word, pos in node.leaves()])
        stem_text = stem(text,stemmer,decode = False)
        # stem() returns None on failure, and list_entity may itself contain
        # None for entries that failed to stem; never treat that as a match.
        if stem_text is not None and stem_text in list_entity:
            NPs.append(text)
    return NPs

# Entity types backed by a word list in Entity/<TYPE>.txt.
Entities = ['ANIMAL','BODY','COLOR','CURRENCY','DISMED','FOOD','LANG','LETTER','PLANT','SPORT','VEHICLES']
List_Enity = {}               # entity type -> list of stemmed entity names
stemmer = PorterStemmer()
for e in Entities:
    # `with` closes the file even if reading fails.
    with open('Entity/' + e + '.txt','r') as f:
        enti = f.read().split(newline_character)
    # Drop the UTF-8 byte-order mark if the file starts with one.
    enti[0] = enti[0].replace('\xef\xbb\xbf','')
    # stem() yields None for lines it cannot process.
    enti = [stem(a,stemmer) for a in enti]
    List_Enity[e] = enti

# Define Class

In [None]:
class Passage:
    """A candidate sentence together with the features collected for it.

    Attributes set from the constructor arguments:
        sent       -- the sentence text (``string``)
        rank       -- rank of the document the sentence came from
        num_key    -- number of question keywords found in the sentence
        kw_similar -- the value passed as ``similar``

    Attributes initialised empty and filled in by later steps:
        ner_tag       -- named-entity strings found in the sentence
        num_ner       -- number of entities matching the answer type
        len_long_seq  -- length of the longest exact sequence of question keywords
        ngram_overlap -- n-gram overlap with the question
    """

    def __init__(self,string,rank,num_key,similar):
        # Values supplied by the caller.
        self.sent = string
        self.rank = rank
        self.num_key = num_key
        self.kw_similar = similar
        # Placeholders; every instance gets its own fresh list.
        self.ner_tag = []
        self.num_ner = 0
        self.len_long_seq = 0
        self.ngram_overlap = 0

### That is everything this system requires. Time to run experiments:

The QA system is able to answer all of the question types below :
 - PERSON
 - LOCATION
 - ORGANIZATION
 - MONEY
 - PERCENT
 - DATE
 - TIME
 - ANIMAL
 - BODY
 - COLOR
 - CURRENCY
 - DISEASE & MEDICINE
 - FOOD
 - LANGUAGE
 - LETTER
 - PLANT
 - SPORT
 - VEHICLES

# Input question

In [None]:
# The natural-language question to answer.
query = "What is the largest animal in the world ?"                      # Change this to ask another question
# Lower-cased copy of the question used by the steps below.
question = query.lower()

# AnswerType Detection

In [None]:
# Feature extraction: the cleaned question and the two CountVectorizers
# (unigram and POS) are handed to feature_extraction.
num_question = feature_extraction([clean_str(question)],CV_Unigram,CV_UniPOS)
# Predict the answer-type index for the single question, then look up its label.
AnswerTypeIdx = ATD_Model.predict(num_question)[0]
AnswerType = answer_types[AnswerTypeIdx]
print query
print 'AnswerType : ' + AnswerType

# Keywords Selection

 - Remove every word that appears in the English stopwords list

In [None]:
def keywords_selection(question, stopwords_list):
    """Return the tokens of *question* that are not stopwords.

    The question is normalised with ``clean_str`` and tokenized with
    ``nltk.word_tokenize``; tokens found in *stopwords_list* are dropped and
    the remaining ones are returned in their original order.
    """
    tokens = nltk.word_tokenize(clean_str(question))
    return [tok for tok in tokens if tok not in stopwords_list]

# Keywords of the current question: its tokens minus the stopwords.
question_keywords = keywords_selection(question,stopwords_list)
print query
# Show the selected keywords separated by ' - '.
print 'Keywords : ' + ' - '.join(question_keywords)

# Get Relevant Document
 - Use the Google Custom Search API to get the relevant documents

In [None]:
service = build("customsearch", "v1",
            developerKey=Seach_api_key)

pages_content = []
for i in range(0, int((float(num_pages)/10))):
    if (i == 0):
        res = service.cse().list(q=question,cx = Custom_Search_Engine_ID).execute()
    else:
        res = service.cse().list(q=question,cx = Custom_Search_Engine_ID,num=10,start = i*10).execute()
    pages_content += res[u'items']

document_urls = []
document_titles = []
for page in pages_content:
    if 'fileFormat' in page:
        print 'Skip ' +  page[u'link']
        continue
    document_urls.append(page[u'link'])
    document_titles.append(page[u'title'])
    
for i in range(0,len(document_urls)):
    print document_titles[i]
    print document_urls[i]

# Passage Retrieval
 - Get all sentences from all documents

In [None]:
def get_num_keyword(sent,keywords,stemmer):
    """Count how many of *keywords* occur (after stemming) in *sent*.

    The sentence is stemmed with ``stem(..., decode=False)`` and each keyword
    with ``stemmer.stem``. A keyword counts when its stem is a substring of
    the stemmed sentence, so it may also match inside a longer word.

    Returns:
        The number of matching keywords, or 0 when the sentence cannot be
        stemmed (``stem`` returned None).
    """
    stem_sent = stem(sent,stemmer,decode = False)
    if stem_sent is None:
        # `x in None` would raise TypeError; an unstemmable sentence has no keywords.
        return 0
    return sum(1 for kw in keywords if stemmer.stem(kw) in stem_sent)

#### Get all candidate passages from all documents

In [None]:
# Get all candidate passages from all documents
passages = []
combine_all_sent = ""          #Combine all passages to speed up NER tagger
total_start = timeit.default_timer()
for i in range(0,len(document_urls)):
    start = timeit.default_timer()
    try:
        html = requests.get(document_urls[i], timeout = 5)
    except:
        print 'Cannot read ' + document_urls[i]
        continue
    stop = timeit.default_timer()
    tree = BeautifulSoup(html.text,'lxml')
    print 'Analyzing ' + document_urls[i] + ' : ' + str(round(stop - start,2)) + 's'
    # Remove invisible elements
    for invisible_elem in tree.find_all(['script', 'style']):
        invisible_elem.extract()
    sents = nltk.sent_tokenize(tree.get_text())
    for sent in sents:
        for sub_sent in sent.split('\n'):
            sub_sent = sub_sent.strip()
            if (len(sub_sent) > 0 and len(sub_sent) < 1000):
                num_keyword = get_num_keyword(sub_sent,question_keywords,stemmer)
                if (num_keyword > 0):
                    passages.append(Passage(sub_sent,i,num_keyword,0))
                    combine_all_sent += sub_sent + " Endofsent "
                
total_end = timeit.default_timer()
print 'Time elapse : ' + str(round(total_end - total_start,2)) + 's'

#### Tagging named entities for each passage

In [None]:
start = timeit.default_timer()

from nltk.tag import StanfordNERTagger

if (AnswerTypeIdx >= 7):
    for p in passages:
        p.ner_tag = EntityTagger(p.sent,List_Enity[AnswerType],stemmer)
else:
    if (AnswerType == "PERSON" or AnswerType == "LOCATION" or AnswerType == "ORGANIZATION"):
        classified_text = ST3class.tag(word_tokenize(combine_all_sent))
    else:
        classified_text = ST7class.tag(word_tokenize(combine_all_sent))
    i = 0
    words = []
    for t in classified_text:
        if (t[0] == 'Endofsent'):
            i += 1
            continue
        if (t[1] == AnswerType):
            words.append(t[0])
        else:
            if (len(words) > 0 ):
                passages[i].ner_tag.append(' '.join(words))
                words = []
                
stop = timeit.default_timer()
print 'Time elapse : ' + str(stop - start)

#### Eliminate passages that have no entity matching the answer type

In [None]:
def entity_filter(tags,question):
    """Tell whether *tags* holds at least one entity not already in *question*.

    An entity counts as "in the question" when its lower-cased text is a
    substring of *question*. An empty *tags* list yields False.
    """
    return any(tag.lower() not in question for tag in tags)
    
print 'Total number of passages : ' + str(len(passages))
# Keep only passages with at least one tagged entity that is not part of the question.
passages = [p for p in passages if entity_filter(p.ner_tag,question)]
print 'After Filtering : ' + str(len(passages))

#### Filter Passages by number of keyword

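 - Find the maximum number of question keywords contained in a single passage (MAX)
 - Keep the passages that contain at least MAX question keywords; MAX is lowered step by step (down to 1) until at least 20 passages remain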

In [None]:
print 'Total passages : ' +  str(len(passages))
max_keyword = 0
min_num_passages = 20
for p in passages:
    if p.num_key > max_keyword:
        max_keyword = p.num_key
    
while (True):
    num_candidate_passages = 0
    for p in passages:
        if p.num_key >= max_keyword:
            num_candidate_passages += 1
    if (num_candidate_passages >= min_num_passages or max_keyword == 1):
        break
    else:
        max_keyword -=1 
print 'Max number of question keyword : ' + str(max_keyword)
passages = [p for p in passages if p.num_key >= max_keyword]
print 'After filtering : ' +  str(len(passages)) + '\n'
for i in range(0,min(10,len(passages))):
    print str(i) + ' - ' + passages[i].sent + '\n'

# Answer Extracting

 - Correct answer is the entity with highest frequency

In [None]:
def contain(s1,s2,stemmer):
    """Tell whether every word of *s1* also appears among the words of *s2*.

    Each argument is lower-cased and passed as a whole through
    ``stemmer.stem`` before being split on whitespace. An *s1* without any
    word is contained in everything.
    """
    needles = stemmer.stem(s1.lower()).split()
    haystack = stemmer.stem(s2.lower()).split()
    return all(word in haystack for word in needles)

def get_most_common(answers):
    """Count candidate answers, merge overlapping ones and rank them.

    *answers* is counted with ``Counter(...).most_common()``, giving
    ``(answer, count)`` pairs ordered from most to least frequent. Each
    surviving pair then absorbs every later pair whose words contain, or are
    contained in, its own words (as decided by ``contain``); the absorbed
    pair's count is added and its slot is overwritten with -1.

    Note: the module-level ``stemmer`` is used for the ``contain`` calls.

    Returns:
        A list of ``(answer, count)`` tuples sorted by count, highest first.
    """
    answers = Counter(answers).most_common()
    for i in range(0,len(answers)):
        # Slots already merged into an earlier answer hold -1; skip them.
        if type(answers[i]) != tuple:
            continue
        # Longer spelling of answer i, taken from the first later answer that
        # includes all of its words and was seen more than once.
        full_answer = ''
        for j in range(i + 1,len(answers)):
            if type(answers[j]) != tuple:
                continue
            if contain(answers[i][0],answers[j][0],stemmer) and full_answer =='' and answers[j][1] > 1:
                full_answer = answers[j][0]
            # Overlap in either direction: fold j's count into i and retire j.
            if contain(answers[i][0],answers[j][0],stemmer) or contain(answers[j][0],answers[i][0],stemmer):
                answers[i] = (answers[i][0],answers[i][1] + answers[j][1])
                answers[j] = -1
        # Swap in the longer spelling only after the scan, so the comparisons
        # above keep using the original text of answer i.
        if full_answer != '':
            answers[i] = (full_answer,answers[i][1])
            
    # Drop the retired (-1) slots.
    answers = [a for a in answers if type(a)== tuple] 
    # Ascending sort followed by reverse: highest count first, and answers
    # with equal counts end up in the reverse of their previous order.
    answers = (sorted(answers, key=lambda tup: tup[1]))
    answers.reverse()
    return answers

In [None]:
candidates_answer = []
for p in passages:
    candidates_answer += p.ner_tag
candidates_answer = [a for a in candidates_answer if a.lower() not in question]
final_answers = get_most_common(candidates_answer)
final_answers = final_answers[0 : min(7,len(final_answers))]
print '\nThe final answer is : ' + final_answers[0][0]
names = [a[0] for a in final_answers]
names.reverse()
freqs = [a[1] for a in final_answers]
freqs.reverse()
import plotly as py
import plotly.graph_objs as go
py.offline.init_notebook_mode()
data = [go.Bar(
            x=freqs,
            y=names,
            orientation = 'h'
)]
layout = go.Layout(margin=dict(l=150,r=10,t=10,b=80))
fig = go.Figure(data=data, layout=layout)
py.offline.iplot(fig)