# Testing Code for the Answer Model

## Imports

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import spacy
# from transformers import BertTokenizer

## Getting and cleaning the data

In [2]:
Answer_File = '../Data/Question_Answer_Dataset_v1.2/S10/question_answer_pairs.txt'

In [3]:
# Read the whole tab-separated question/answer file and keep it as a list of
# raw lines; the first line is later used as the header row. The file is
# decoded as ISO-8859-1. split('\n') is used rather than splitlines() so that
# only real newlines separate records.
with open(Answer_File, mode='r', encoding="ISO-8859-1") as qa_file:
    Questions = qa_file.read().split('\n')

In [4]:
Data = {key:[] for key in Questions[0].split('\t')}

In [5]:
keys = list(Data.keys())

In [6]:
# Fill the column lists row by row. Lines that do not split into more than one
# field (for example the empty string left after the final newline) are skipped.
n_columns = len(keys)
for line in Questions[1:]:
    fields = line.split('\t')
    if len(fields) <= 1:
        continue
    # Normalise ragged rows so every column list stays the same length:
    # missing trailing fields become '' and surplus fields are dropped.
    # Without this, a short row leaves columns of unequal length (which makes
    # pd.DataFrame(Data) fail) and a long row raises IndexError on keys[i].
    fields = (fields + [''] * n_columns)[:n_columns]
    for key, value in zip(keys, fields):
        Data[key].append(value)

In [7]:
df = pd.DataFrame(Data)

In [8]:
df.head()

Unnamed: 0,ArticleTitle,Question,Answer,DifficultyFromQuestioner,DifficultyFromAnswerer,ArticleFile
0,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,Alessandro Volta was not a professor of chemis...,easy,easy,data/set4/a10
1,Alessandro_Volta,Was Alessandro Volta a professor of chemistry?,No,easy,hard,data/set4/a10
2,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Alessandro Volta did invent the remotely opera...,easy,easy,data/set4/a10
3,Alessandro_Volta,Did Alessandro Volta invent the remotely opera...,Yes,easy,easy,data/set4/a10
4,Alessandro_Volta,Was Alessandro Volta taught in public schools?,Volta was taught in public schools.,easy,easy,data/set4/a10


## Parsing a text file input

In [9]:
nlp = spacy.load('en_core_web_sm')

In [10]:
def is_token_allowed(token):
    '''
        Decide whether a token should be kept.

        A token is rejected when it is falsy, when its string form is
        empty or whitespace-only, or when it is flagged as a stop word
        or as punctuation. Everything else is accepted.

        Returns True for an accepted token, False otherwise.
    '''
    # Guard clauses, checked in the same order as the original condition.
    if not token:
        return False
    if not str(token).strip():
        return False
    return not (token.is_stop or token.is_punct)

def preprocess_token(token):
    '''
        Return the token's lemma with surrounding whitespace removed and
        converted to lowercase.
    '''
    lemma = token.lemma_
    return lemma.strip().lower()

We can now apply this preprocessing to every input text and use the resulting tokens as inputs to our model.

In [11]:
def Get_Tokens(file_path):
    '''
        Read one article from the S10 dataset folder and run its full
        text through the module-level spaCy pipeline `nlp`.

        file_path: article path relative to the S10 folder, without the
                   '.txt' extension.

        Returns the object produced by `nlp(text)` for the article text.
    '''
    article_path = '../Data/Question_Answer_Dataset_v1.2/' + 'S10/' + file_path + '.txt'
    # NOTE(review): no encoding is passed here, unlike the ISO-8859-1 used for
    # the question file, so the platform default applies -- confirm that is
    # the right choice for the article files.
    with open(article_path, 'r') as article_file:
        article_text = article_file.read()
    return nlp(article_text)

In [12]:
def Get_Token_Sentences(file_path):
    '''
        Read one article from the S10 dataset folder, parse it with the
        module-level spaCy pipeline `nlp`, and return its truthy tokens
        as a list.

        file_path: article path relative to the S10 folder, without the
                   '.txt' extension.

        NOTE(review): despite the name, this returns individual tokens,
        not sentences, and applies no stop-word or punctuation filter --
        confirm whether is_token_allowed was meant to be used here.
    '''
    article_path = '../Data/Question_Answer_Dataset_v1.2/' + 'S10/' + file_path + '.txt'
    with open(article_path, 'r') as article_file:
        article_text = article_file.read()
    parsed_article = nlp(article_text)
    # filter(None, ...) keeps exactly the items whose truth value is True.
    return list(filter(None, parsed_article))

In [13]:
from collections import defaultdict

# Parse every article once and cache the result under its title, so the many
# questions that share an article reuse the same parsed object.
# NOTE: the container stays a defaultdict(list), so looking up a title that
# was never loaded yields an empty list instead of raising KeyError.
Articles = defaultdict(list)

# Walk the two columns in lockstep instead of indexing df['ArticleTitle'][i]:
# that avoids depending on the frame having a 0..n-1 index.
for title, path in zip(df['ArticleTitle'], df['ArticleFile']):
    # Test membership on the dict itself; building list(Articles.keys()) for
    # every row made this loop needlessly quadratic.
    if title not in Articles:
        Articles[title] = Get_Tokens(path)

# Finding Similar Sentences

In [14]:
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# only once per kernel instead of once per call.
_SIMILARITY_MODEL_CACHE = {}


def _get_similarity_model(model_name='en_core_web_md'):
    '''Load the named spaCy model on first use and reuse it afterwards.'''
    if model_name not in _SIMILARITY_MODEL_CACHE:
        pipeline = spacy.load(model_name)
        # Same pipeline set-up as before. Guarded so the component is never
        # registered twice on the same pipeline object.
        if 'sentencizer' not in pipeline.pipe_names:
            pipeline.add_pipe('sentencizer')
        _SIMILARITY_MODEL_CACHE[model_name] = pipeline
    return _SIMILARITY_MODEL_CACHE[model_name]


def _without_stop_words(nlp, doc):
    '''Re-parse `doc` after dropping every token flagged as a stop word.'''
    return nlp(' '.join([str(t) for t in doc if not t.is_stop]))


def find_similar_sentences(raw_text,question):
    '''
    Find the sentence of an article that is most similar to a question.

    Parameters
    ----------
    raw_text : object exposing `.sents`
        The parsed article; the notebook passes the spaCy objects stored in
        `Articles`. Each sentence's `.text` is stripped before comparison.
    question : str
        The question text; it is parsed with the 'en_core_web_md' model.

    Returns
    -------
    tuple
        (numpy array holding the text of the selected sentence,
         dict mapping that sentence's position in the list of stripped
         sentences to its similarity score).
        Both are empty when the article has no non-empty sentence.

    Stop words are removed from both the sentence and the question before
    `similarity` is computed. On ties the earliest sentence wins.
    '''
    nlp = _get_similarity_model()

    sentences = [sent.text.strip() for sent in raw_text.sents]

    # The question is the same for every sentence, so prepare it once.
    question_no_stop_words = _without_stop_words(nlp, nlp(question))

    # Track the winner by its position in `sentences`. The previous version
    # took np.argmax over a list that skipped empty sentences and then used
    # that position to index the unfiltered list, which selected the wrong
    # sentence whenever an empty sentence came before the best match.
    best_index = None
    best_sim = None
    for i, sentence in enumerate(sentences):
        if sentence == '':
            continue
        sentence_no_stop_words = _without_stop_words(nlp, nlp(sentence))
        sim = sentence_no_stop_words.similarity(question_no_stop_words)
        # Strict '>' keeps the first of several equal scores.
        if best_sim is None or sim > best_sim:
            best_index, best_sim = i, sim

    highly_similar_sentences = {}
    if best_index is not None:
        highly_similar_sentences[best_index] = best_sim

    # dtype=str keeps the result a string array even when nothing was selected.
    selected = np.array(sentences, dtype=str)[list(highly_similar_sentences.keys())]
    return selected, highly_similar_sentences



In [15]:
len(df)

1458

In [16]:
from collections import defaultdict
def get_similar_sentences(df,Articles,length=10):
    '''
    Run find_similar_sentences for the first `length` rows of `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'ArticleTitle' and 'Question'.
    Articles : mapping
        Maps an article title to the parsed article handed to
        find_similar_sentences.
    length : int, default 10
        Number of leading rows to process; capped at len(df).

    Returns
    -------
    collections.defaultdict
        Three parallel lists under the keys 'Questions',
        'Similar_sentence' and 'Similarity_Score'.
    '''
    sol = defaultdict(list)
    # Cap the row count so asking for more rows than the frame holds
    # processes all of them instead of raising IndexError from iloc.
    n_rows = min(length, len(df))
    for i in range(n_rows):
        print(i)
        # Fetch the row once rather than once per field.
        row = df.iloc[i]
        question = row['Question']
        similar_sentence, scores = find_similar_sentences(Articles[row['ArticleTitle']], question)
        sol['Questions'].append(question)
        sol['Similar_sentence'].append(similar_sentence)
        sol['Similarity_Score'].append(scores)
    return sol

In [17]:
sol = get_similar_sentences(df,Articles)

0


  sim = sentence_no_stop_words.similarity(question_no_stop_words)


1
2
3
4
5
6
7
8
9


In [18]:
df_results = pd.DataFrame(sol)

In [19]:
similar_sentence, scores = find_similar_sentences(Articles[df.iloc[4]['ArticleTitle']],df.iloc[4]['Question'])

  sim = sentence_no_stop_words.similarity(question_no_stop_words)


In [20]:
similar_sentence

array(['Volta was born in Como, Italy and was taught in the public schools there.'],
      dtype='<U384')

In [43]:
scores

{2: 0.8862109755892149, 3: 0.6920588262695283, 4: 0.4340390059526703}

In [19]:
df_results.iloc[4]['Similar_sentence']

array(['Volta was born in Como, Italy and was taught in the public schools there.'],
      dtype='<U384')

In [23]:
df_results.iloc[4]['Questions']

'Was Alessandro Volta taught in public schools?'

In [21]:
df_results.iloc[2]['Similar_sentence']

array(['*  (\n\nAn additional invention pioneered by Volta, was the remotely operated pistol.'],
      dtype='<U384')

In [22]:
df_results

Unnamed: 0,Questions,Similar_sentence,Similarity_Score
0,Was Alessandro Volta a professor of chemistry?,[In 1774 he became a professor of physics at t...,{3: 0.7189053719218323}
1,Was Alessandro Volta a professor of chemistry?,[In 1774 he became a professor of physics at t...,{3: 0.7189053719218323}
2,Did Alessandro Volta invent the remotely opera...,[* (\n\nAn additional invention pioneered by ...,{26: 0.6861985771989005}
3,Did Alessandro Volta invent the remotely opera...,[* (\n\nAn additional invention pioneered by ...,{26: 0.6861985771989005}
4,Was Alessandro Volta taught in public schools?,"[Volta was born in Como, Italy and was taught ...",{2: 0.8862109755892149}
5,Was Alessandro Volta taught in public schools?,"[Volta was born in Como, Italy and was taught ...",{2: 0.8862109755892149}
6,Who did Alessandro Volta marry?,"[In announcing his discovery of the pile, Volt...",{25: 0.6575958942488963}
7,Who did Alessandro Volta marry?,"[In announcing his discovery of the pile, Volt...",{25: 0.6575958942488963}
8,What did Alessandro Volta invent in 1800?,[A Pioneer in Electrochemistry\n* Count Aless...,{63: 0.7280840376233481}
9,What did Alessandro Volta invent in 1800?,[A Pioneer in Electrochemistry\n* Count Aless...,{63: 0.7280840376233481}


In [19]:
#!pip install --upgrade tensorflow_hub

import tensorflow_hub as hub

In [20]:
from transformers import BertForQuestionAnswering, AutoTokenizer

modelname = 'deepset/bert-base-cased-squad2'

model = BertForQuestionAnswering.from_pretrained(modelname)
tokenizer = AutoTokenizer.from_pretrained(modelname)

In [25]:
from transformers import pipeline
nlp_bert = pipeline('question-answering', model=model, tokenizer=tokenizer)

In [28]:
context = "The Intergovernmental Panel on Climate Change (IPCC) is a scientific intergovernmental body under the auspices of the United Nations, set up at the request of member governments. It was first established in 1988 by two United Nations organizations, the World Meteorological Organization (WMO) and the United Nations Environment Programme (UNEP), and later endorsed by the United Nations General Assembly through Resolution 43/53. Membership of the IPCC is open to all members of the WMO and UNEP. The IPCC produces reports that support the United Nations Framework Convention on Climate Change (UNFCCC), which is the main international treaty on climate change. The ultimate objective of the UNFCCC is to \"stabilize greenhouse gas concentrations in the atmosphere at a level that would prevent dangerous anthropogenic [i.e., human-induced] interference with the climate system\". IPCC reports cover \"the scientific, technical and socio-economic information relevant to understanding the scientific basis of risk of human-induced climate change, its potential impacts and options for adaptation and mitigation.\""

nlp_bert({
    'question': 'What organization is the IPCC a part of?',
    'context': context
})

{'score': 0.48815828561782837,
 'start': 118,
 'end': 132,
 'answer': 'United Nations'}

In [39]:
# Rebind the global pipeline to the medium English model and add the
# sentencizer component to it.
nlp = spacy.load('en_core_web_md')
nlp.add_pipe('sentencizer')

# Baseline: hand BERT the whole parsed article (converted to plain text) as
# the context for question 2.
question = df.iloc[2]['Question']
context = Articles[df.iloc[2]['ArticleTitle']]

nlp_bert({'question': str(question), 'context': str(context)})

{'score': 0.05283139646053314, 'start': 3368, 'end': 3369, 'answer': '*'}

In [40]:
df.iloc[2]['Question']

'Did Alessandro Volta invent the remotely operated pistol?'

In [44]:
# Rebinds the global pipeline; kept because the cells further down call `nlp`.
nlp = spacy.load('en_core_web_md')

# Break the text into sentences
nlp.add_pipe('sentencizer') # updated

context = Articles[df.iloc[2]['ArticleTitle']]
question = df.iloc[2]['Question']

# Compute the best-matching sentence in this cell. It used to be read from a
# variable that is only created by a cell further down the notebook, so a
# fresh top-to-bottom run failed here with NameError.
similar_sentences, scores = find_similar_sentences(context, question)

# Ask BERT the same question, but with only that one sentence as context.
nlp_bert({'question':str(question),'context':similar_sentences[0]})

{'score': 0.004538378212600946,
 'start': 0,
 'end': 48,
 'answer': '*  (\n\nAn additional invention pioneered by Volta'}

In [41]:
similar_sentences, scores = find_similar_sentences(Articles[df.iloc[2]['ArticleTitle']],df.iloc[2]['Question'])

  sim = sentence_no_stop_words.similarity(question_no_stop_words)


In [42]:
similar_sentences[0]

'*  (\n\nAn additional invention pioneered by Volta, was the remotely operated pistol.'

In [18]:
df.iloc[0]['Answer']

'Alessandro Volta was not a professor of chemistry.'

In [20]:
text = nlp('professor of physics')

In [21]:
question = nlp('professor of chemistry')

In [22]:
text.similarity(question)

  text.similarity(question)


0.9140061561511593

In [24]:
question.similarity(nlp('proffesor of chemistry'))

1.0