## 1. Read the data

In [1]:
import pandas as pd
import spacy

df = pd.read_csv("news_dataset.csv", encoding="ISO-8859-1")

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       1000 non-null   int64 
 1   author   994 non-null    object
 2   date     1000 non-null   object
 3   year     1000 non-null   object
 4   month    1000 non-null   object
 5   topic    1000 non-null   object
 6   article  1000 non-null   object
dtypes: int64(1), object(6)
memory usage: 54.8+ KB


## 2. Data preprocessing

In [2]:
df_clean = df.dropna(subset=['author'])

df_clean.info()

<class 'pandas.core.frame.DataFrame'>
Index: 994 entries, 0 to 999
Data columns (total 7 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       994 non-null    int64 
 1   author   994 non-null    object
 2   date     994 non-null    object
 3   year     994 non-null    object
 4   month    994 non-null    object
 5   topic    994 non-null    object
 6   article  994 non-null    object
dtypes: int64(1), object(6)
memory usage: 62.1+ KB


## 3. Information Retrieval-based Question Answering (IR-QA) components

### 3.1.1 **Article retrieval** to find the top 10 relevant article

`The code is provided as a demonstration; it will different when implemented in the actual application.`

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

tfidf_configs = {
    'lowercase': True,
    'analyzer': 'word',
    'stop_words': 'english',
    'binary': True,
    'max_df': 0.9,
    'max_features': 10_000
}

retriever_configs = {
    'n_neighbors': 10,  # Retrieve top 10 nearest neighbors
    'metric': 'cosine'
}

embedding = TfidfVectorizer(**tfidf_configs)
retriever = NearestNeighbors(**retriever_configs)



def article_retrieval(df, question):
    # Fit the TF-IDF vectorizer and the NearestNeighbors model
    X = embedding.fit_transform(df['article'])
    retriever.fit(X, df['id'].values.reshape(-1, 1))

    # Transform the question to find its nearest neighbors
    X_question = embedding.transform([question])

    # Find the indices and distances of the 10 nearest neighbors
    distances, indices = retriever.kneighbors(X_question, return_distance=True)

    # Retrieve the articles, IDs, and distances for the top 10 nearest neighbors
    top_articles = df.iloc[indices[0]]
    for i, (index, row) in enumerate(top_articles.iterrows()):
        distance = distances[0][i]
        print(f"ID: {row['id']}| Article: {row['article'][:50]}...| Similarity: {1-distance:.4f}")
    return top_articles

### 3.1.2 Example of `article_retrieval` function

In [4]:
question = "Who is the vice chairman of Samsung"

top_articles = article_retrieval(df_clean,question)

ID: 17574| Article: SEOUL, South Korea  ?   A special prosecutor inves...| Similarity: 0.1257
ID: 17764| Article: SEOUL, South Korea  ?   A South Korean court on Th...| Similarity: 0.1235
ID: 18375| Article: The main reason for the gender gaps at work  ?   w...| Similarity: 0.0920
ID: 17734| Article: Good morning.  Here?s what you need to know: ? Pre...| Similarity: 0.0819
ID: 18222| Article: MODERN homes today are getting   light bulbs, ther...| Similarity: 0.0757
ID: 17929| Article: LONDON  ?   Bernie Ecclestone?s longstanding leade...| Similarity: 0.0700
ID: 17487| Article: SEATTLE  ?   For a technology to crack the mainstr...| Similarity: 0.0629
ID: 17851| Article: SAN FRANCISCO  ?   By mastering some tough compute...| Similarity: 0.0610
ID: 18410| Article: For anyone wondering how former President Barack O...| Similarity: 0.0592
ID: 17657| Article: ESCONDIDO, Calif.  ?   In Southern California in t...| Similarity: 0.0546


### 3.2.1 **Coreference Resolution** for top 10 relevant article

`The code is provided as a demonstration; it will different when implemented in the actual application.`

(Otmazgin, Cattan & Goldberg, 2022). 

In [6]:
from fastcoref import spacy_component
from fastcoref import LingMessCoref
import spacy

model = LingMessCoref()
def Coreference_Resolution(text):
    nlp = spacy.load("en_core_web_sm")
    nlp.add_pipe("fastcoref")
    doc = nlp(text)
    # nlp.add_pipe("fastcoref", config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'})
    doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
    return doc._.resolved_text


03/29/2024 11:21:10 - INFO - 	 missing_keys: []
03/29/2024 11:21:10 - INFO - 	 unexpected_keys: []
03/29/2024 11:21:10 - INFO - 	 mismatched_keys: []
03/29/2024 11:21:10 - INFO - 	 error_msgs: []
03/29/2024 11:21:10 - INFO - 	 Model Parameters: 590.0M, Transformer: 434.6M, Coref head: 155.4M


### 3.2.2 Example of `Coreference_Resolution` function

In [7]:
from spacy import displacy

sample = "The de facto leader, Jay Y. Lee, the vice chairman of Samsung, will be questioned on Thursday, according to the special prosecutor?s office, which recommended that he also be investigated on suspicion of perjury."
resolved_text = Coreference_Resolution(sample)

nlp = spacy.load("en_core_web_sm")

doc = nlp(sample)
sample_spans = list(doc.sents)
print("\nOriginal text:")
displacy.render(sample_spans, style="ent")

doc = nlp(resolved_text)
resolved_spans = list(doc.sents)
print("Resolved text:")
displacy.render(resolved_spans, style="ent")

03/29/2024 11:21:16 - INFO - 	 missing_keys: []
03/29/2024 11:21:16 - INFO - 	 unexpected_keys: []
03/29/2024 11:21:16 - INFO - 	 mismatched_keys: []
03/29/2024 11:21:16 - INFO - 	 error_msgs: []
03/29/2024 11:21:16 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/29/2024 11:21:16 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 95.83 examples/s]
03/29/2024 11:21:16 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 13.22it/s]
03/29/2024 11:21:16 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 249.94 examples/s]
03/29/2024 11:21:17 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 33.33it/s]



Original text:


Resolved text:


### 3.3.1 **Text matching** to find the most relevant sentence

`The code is provided as a demonstration; it will different when implemented in the actual application.`

In [8]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk

def text_matching(top_articles,question):
    sentences = nltk.sent_tokenize(" ".join(top_articles))
    vectorizer = TfidfVectorizer(stop_words='english')
    combined_text = [question] + sentences
    tfidf_matrix = vectorizer.fit_transform(combined_text)
    cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

    # Find the index of the highest similarity score
    most_similar_sentence_index = cosine_similarities.argmax()

    # Return the most relevant sentence and its similarity score
    return sentences[most_similar_sentence_index], cosine_similarities[0, most_similar_sentence_index]

### 3.3.2 Example of `text_matching` function

In [9]:
resolved_article = [Coreference_Resolution(i) for i in top_articles['article']]

03/29/2024 11:21:32 - INFO - 	 missing_keys: []
03/29/2024 11:21:32 - INFO - 	 unexpected_keys: []
03/29/2024 11:21:32 - INFO - 	 mismatched_keys: []
03/29/2024 11:21:32 - INFO - 	 error_msgs: []
03/29/2024 11:21:32 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/29/2024 11:21:32 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 49.98 examples/s]
03/29/2024 11:21:32 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.04it/s]
03/29/2024 11:21:32 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 52.65 examples/s]
03/29/2024 11:21:32 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.41it/s]
03/29/2024 11:21:33 - INFO - 	 missing_keys: []
03/29/2024 11:21:33 - INFO - 	 unexpected_keys: []
03/29/2024 11:21:33 - INFO - 	 mismatched_keys: []
03/29/2024 11:21:33 - INFO - 	 error_msgs: []
03/29/2024 11:21:33 - INFO - 	 Model 

In [10]:
sentence, similarity = text_matching(resolved_article,question)

print(f"The most related sentence: {sentence}\nWith {similarity:.4f} similarity")

The most related sentence: Jay Y. Lee, the Samsung vice chairman, has denied accusations that Jay Y. Lee, the Samsung vice chairman, paid more than $36 million in bribes to organizations linked to Choi   the unofficial adviser at the center of the corruption scandal that engulfed President Park  .
With 0.4675 similarity


### 3.4.1 **Type-based Answer Extraction** by name entity

`The code is provided as a demonstration; it will different when implemented in the actual application.`

In [12]:
def Answer_Extraction(sentence,similarity,question):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(sentence)
    sample_doc = {i.text:i.label_ for i in doc.ents}
    question = question.lower()
    if similarity > 0.35:
            try:
                if question.startswith('who'):
                    who_list = [k for k,v in sample_doc.items() if v=='PERSON']
                    return who_list[0] if len (who_list)>0 else 'NO ANSWER'
                elif question.startswith('where'):
                    where_list = [k for k,v in sample_doc.items() if v=='LOC' or  v=='GPE']
                    return where_list[0] if len (where_list) >0 else 'NO ANSWER'
                elif question.startswith('when'):
                    when_list = [k for k,v in sample_doc.items() if v=='DATE']
                    return when_list[0] if len (when_list) >0 else 'NO ANSWER'
                elif question.startswith('what'):
                    what_list = [k for k,v in sample_doc.items() if v=='ORG']
                    return what_list[0] if len (what_list) >0 else 'NO ANSWER'
                elif question.startswith('how many'):
                    how_many_list = [k for k,v in sample_doc.items() if v=='MONEY' or  v=='DATE']
                    return how_many_list[0] if len (how_many_list) >0 else 'NO ANSWER'
            except:
                return 'NO ANSWER'
    else:
        return "NO ANSWER"

### 3.4.2 Example of `Answer_Extraction` function

In [13]:
Answer_Extraction(sentence,similarity,question)

'Jay Y. Lee'

### 3.5.1 Bert for Question Answering

`The code is provided as a demonstration; it will different when implemented in the actual application.`

In [14]:
import torch
from transformers import BertForQuestionAnswering, BertTokenizer

def bert(sentence,similarity,question):
    if similarity > 0.35:
        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        
        input_ids = tokenizer.encode(question, sentence, max_length=512, truncation=True)

        tokens = tokenizer.convert_ids_to_tokens(input_ids)
        sep_index = input_ids.index(tokenizer.sep_token_id)
        num_seg_a = sep_index + 1
        num_seg_b = len(input_ids) - num_seg_a
        segment_ids = [0]*num_seg_a + [1]*num_seg_b

        model_scores = model.forward(torch.tensor([input_ids]),token_type_ids=torch.tensor([segment_ids]))

        start_scores = model_scores.start_logits
        end_scores = model_scores.end_logits

        answer_start = torch.argmax(start_scores)
        answer_end = torch.argmax(end_scores)

        return ' '.join(tokens[answer_start:answer_end+1])
    
    else:
        return "NO ANSWER"

### 3.5.2 Example of `bert` function

In [15]:
bert(sentence,similarity,question)

'jay y . lee'

## 4. The IR-QA system and model selection

### 4.1 The IR-QA system for medel selection

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from fastcoref import spacy_component
from fastcoref import LingMessCoref
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import torch
from transformers import BertForQuestionAnswering, BertTokenizer


class Information_Retrieval_System_model_selection:
    def __init__(self,model, df, question):
        self.df = df
        self.question = question
        self.model = model

        self.top_articles = self.article_retrieval(df, question)
        self.resolved_articles = [self.Coreference_Resolution(article) for article in self.top_articles['article']]
        self.most_relevant_sentence, self.similarity = self.text_matching(self.resolved_articles, question)

    
    def article_retrieval(self,df, question):
        tfidf_configs = {
            'lowercase': True,
            'analyzer': 'word',
            'stop_words': 'english',
            'binary': True,
            'max_df': 0.9,
            'max_features': 10_000
        }

        retriever_configs = {
            'n_neighbors': 10, 
            'metric': 'cosine'
        }

        embedding = TfidfVectorizer(**tfidf_configs)
        retriever = NearestNeighbors(**retriever_configs)

        X = embedding.fit_transform(df['article'])
        retriever.fit(X, df['id'].values.reshape(-1, 1))

        X_question = embedding.transform([question])

        distances, indices = retriever.kneighbors(X_question, return_distance=True)

        return df.iloc[indices[0]]
    
    def Coreference_Resolution(self,text):
        nlp = spacy.load(self.model)
        nlp.add_pipe("fastcoref")
        doc = nlp(text)
        # nlp.add_pipe("fastcoref", config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'})
        doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
        return doc._.resolved_text
    
    def text_matching(self,top_articles,question):
        sentences = nltk.sent_tokenize(" ".join(top_articles))
        vectorizer = TfidfVectorizer(stop_words='english')
        combined_text = [question] + sentences
        tfidf_matrix = vectorizer.fit_transform(combined_text)
        cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

        # Find the index of the highest similarity score
        most_similar_sentence_index = cosine_similarities.argmax()

        # Return the most relevant sentence and its similarity score
        return sentences[most_similar_sentence_index], cosine_similarities[0, most_similar_sentence_index]
    
    def Answer_Extraction(self,sentence,similarity,question):
        nlp = spacy.load(self.model)
        doc = nlp(sentence)
        sample_doc = {i.text:i.label_ for i in doc.ents}
        question = question.lower()
        if similarity > 0.35:
                try:
                    if question.startswith('who'):
                        who_list = [k for k,v in sample_doc.items() if v=='PERSON']
                        return who_list[0] if len (who_list)>0 else 'NO ANSWER'
                    elif question.startswith('where'):
                        where_list = [k for k,v in sample_doc.items() if v=='LOC' or  v=='GPE']
                        return where_list[0] if len (where_list) >0 else 'NO ANSWER'
                    elif question.startswith('when'):
                        when_list = [k for k,v in sample_doc.items() if v=='DATE']
                        return when_list[0] if len (when_list) >0 else 'NO ANSWER'
                    elif question.startswith('what'):
                        what_list = [k for k,v in sample_doc.items() if v=='ORG']
                        return what_list[0] if len (what_list) >0 else 'NO ANSWER'
                    elif question.startswith('how many'):
                        how_many_list = [k for k,v in sample_doc.items() if v=='MONEY' or  v=='DATE']
                        return how_many_list[0] if len (how_many_list) >0 else 'NO ANSWER'
                except:
                    return 'NO ANSWER'
        else:
            return "NO ANSWER"

    def Rule_base(self):
        return self.Answer_Extraction(self.most_relevant_sentence, self.similarity, self.question)

    
    def Bert(self):
        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        if self.similarity > 0.35:
            input_ids = tokenizer.encode(self.question, self.most_relevant_sentence, max_length=512, truncation=True)

            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_index = input_ids.index(tokenizer.sep_token_id)
            num_seg_a = sep_index + 1
            num_seg_b = len(input_ids) - num_seg_a
            segment_ids = [0]*num_seg_a + [1]*num_seg_b

            model_scores = model.forward(torch.tensor([input_ids]),token_type_ids=torch.tensor([segment_ids]))

            start_scores = model_scores.start_logits
            end_scores = model_scores.end_logits

            answer_start = torch.argmax(start_scores)
            answer_end = torch.argmax(end_scores)

            return ' '.join(tokens[answer_start:answer_end+1])
        else:
            return "NO ANSWER"




### 4.2 The 20 self-defined questions

In [38]:
self_define_question_df = pd.read_csv("Test_question.csv", encoding="ISO-8859-1")

self_define_question_df.head()

Unnamed: 0,id,question,answer
0,17574,Who is the vice chairman of Samsung?,Jay Y. Lee
1,17344,What was Chen Zhongshu the head of?',Panzhihua Land and Resources Bureau
2,17579,What unexpected product contains added sugar?,NO ANSWER
3,17598,What does the Trump administration frequently ...,NO ANSWER
4,17620,What date did Steve Harvey meet Donald Trump?,NO ANSWER


### 4.3 Evaluation metric

F1 score, Exact Match and output timing was used to evaluate the model. Since the usual F1 from sklearn is designed for calculate numeric data, therefore it is not applicable in the NLP task. Referring to the Evaluation Scripts v2.0 from SQuAD GitHub, it provides the calculation of F1 score and Exact Match. In this assignment will use it as an evaluation metrics (The Stanford NLP Group 2024).

In [30]:
import collections
import re
import string

def normalize_answer(s):
  """Lower text and remove punctuation, articles and extra whitespace."""
  def remove_articles(text):
    regex = re.compile(r'\b(a|an|the)\b', re.UNICODE)
    return re.sub(regex, ' ', text)
  def white_space_fix(text):
    return ' '.join(text.split())
  def remove_punc(text):
    exclude = set(string.punctuation)
    return ''.join(ch for ch in text if ch not in exclude)
  def lower(text):
    return text.lower()
  return white_space_fix(remove_articles(remove_punc(lower(s))))

def get_tokens(s):
  if not s: return []
  return normalize_answer(s).split()

def compute_exact(a_gold, a_pred):
  return int(normalize_answer(a_gold) == normalize_answer(a_pred))

def compute_f1(a_gold, a_pred):
  gold_toks = get_tokens(a_gold)
  pred_toks = get_tokens(a_pred)
  common = collections.Counter(gold_toks) & collections.Counter(pred_toks)
  num_same = sum(common.values())
  if len(gold_toks) == 0 or len(pred_toks) == 0:
    # If either is no-answer, then F1 is 1 if they agree, 0 otherwise
    return int(gold_toks == pred_toks)
  if num_same == 0:
    return 0
  precision = 1.0 * num_same / len(pred_toks)
  recall = 1.0 * num_same / len(gold_toks)
  f1 = (2 * precision * recall) / (precision + recall)
  return f1

### 4.4 Model selection with validation

In [36]:
import time
import pandas as pd
from sklearn.model_selection import KFold

# Initialize KFold
kf = KFold(n_splits=5)

# Evaluation function
def evaluate_model(model_name, data):
    fold_f1_scores_rule = []
    fold_exact_matches_rule = []
    fold_times_rule = []

    fold_f1_scores_bert = []
    fold_exact_matches_bert = []
    fold_times_bert = []

    for _, test_index in kf.split(data):
        test_data = data.iloc[test_index]
        
        f1_scores_rule = []
        exact_matches_rule = []
        times_rule = []

        f1_scores_bert = []
        exact_matches_bert = []
        times_bert = []

        for _, row in test_data.iterrows():
            question = row['question']
            gold_answer = row['answer']

            # Initialize the system for each question
            system = Information_Retrieval_System_model_selection(model_name, df_clean, question)

            # Rule-based prediction
            start_time_rule = time.time()
            predicted_answer_rule = system.Rule_base()
            times_rule.append(time.time() - start_time_rule)
            
            f1_scores_rule.append(compute_f1(gold_answer, predicted_answer_rule))
            exact_matches_rule.append(compute_exact(gold_answer, predicted_answer_rule))

            # BERT-based prediction
            start_time_bert = time.time()
            predicted_answer_bert = system.Bert()
            times_bert.append(time.time() - start_time_bert)

            f1_scores_bert.append(compute_f1(gold_answer, predicted_answer_bert))
            exact_matches_bert.append(compute_exact(gold_answer, predicted_answer_bert))

        # Calculate and store the averages for this fold
        fold_f1_scores_rule.append(sum(f1_scores_rule) / len(f1_scores_rule))
        fold_exact_matches_rule.append(sum(exact_matches_rule) / len(exact_matches_rule))
        fold_times_rule.append(sum(times_rule) / len(times_rule))

        fold_f1_scores_bert.append(sum(f1_scores_bert) / len(f1_scores_bert))
        fold_exact_matches_bert.append(sum(exact_matches_bert) / len(exact_matches_bert))
        fold_times_bert.append(sum(times_bert) / len(times_bert))

    # Calculate overall average scores across all folds
    overall_avg_f1_rule = sum(fold_f1_scores_rule) / len(fold_f1_scores_rule)
    overall_avg_exact_match_rule = sum(fold_exact_matches_rule) / len(fold_exact_matches_rule)
    overall_avg_time_rule = sum(fold_times_rule) / len(fold_times_rule)

    overall_avg_f1_bert = sum(fold_f1_scores_bert) / len(fold_f1_scores_bert)
    overall_avg_exact_match_bert = sum(fold_exact_matches_bert) / len(fold_exact_matches_bert)
    overall_avg_time_bert = sum(fold_times_bert) / len(fold_times_bert)

    return (overall_avg_f1_rule, overall_avg_exact_match_rule, overall_avg_time_rule,
            overall_avg_f1_bert, overall_avg_exact_match_bert, overall_avg_time_bert)



In [37]:
# Main evaluation loop
model_list = ["en_core_web_sm", "en_core_web_trf", "en_core_web_md", "en_core_web_lg"]
model_results = {}

for model_name in model_list:
    results = evaluate_model(model_name, self_define_question_df)
    model_results[model_name] = {
        'Rule-based': {'F1': results[0],'Exact Match': results[1],'Time': results[2]},
        'BERT': {'F1': results[3],'Exact Match': results[4],'Time': results[5]}
    }

# Display the results
for model, scores in model_results.items():
    print(f"Model: {model}")
    for method, metrics in scores.items():
        print(f"  {method}: F1: {metrics['F1']}, Exact Match: {metrics['Exact Match']}, Time: {metrics['Time']} sec")



03/30/2024 12:04:05 - INFO - 	 missing_keys: []
03/30/2024 12:04:05 - INFO - 	 unexpected_keys: []
03/30/2024 12:04:05 - INFO - 	 mismatched_keys: []
03/30/2024 12:04:05 - INFO - 	 error_msgs: []
03/30/2024 12:04:05 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/30/2024 12:04:05 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 45.49 examples/s]
03/30/2024 12:04:05 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.50it/s]
03/30/2024 12:04:05 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 45.44 examples/s]
03/30/2024 12:04:05 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.71it/s]
03/30/2024 12:04:07 - INFO - 	 missing_keys: []
03/30/2024 12:04:07 - INFO - 	 unexpected_keys: []
03/30/2024 12:04:07 - INFO - 	 mismatched_keys: []
03/30/2024 12:04:07 - INFO - 	 error_msgs: []
03/30/2024 12:04:07 - INFO - 	 Model 

Model: en_core_web_sm
  Rule-based: F1: 0.6833333333333333, Exact Match: 0.65, Time: 0.31165207624435426 sec
  BERT: F1: 0.6365079365079365, Exact Match: 0.5, Time: 1.7874438524246217 sec
Model: en_core_web_trf
  Rule-based: F1: 0.7833333333333333, Exact Match: 0.75, Time: 1.4920156717300415 sec
  BERT: F1: 0.6365079365079365, Exact Match: 0.5, Time: 1.759422981739044 sec
Model: en_core_web_md
  Rule-based: F1: 0.7833333333333333, Exact Match: 0.75, Time: 0.7362376689910889 sec
  BERT: F1: 0.6365079365079365, Exact Match: 0.5, Time: 1.8217221975326539 sec
Model: en_core_web_lg
  Rule-based: F1: 0.7333333333333333, Exact Match: 0.7, Time: 0.9263777494430542 sec
  BERT: F1: 0.6365079365079365, Exact Match: 0.5, Time: 1.8769330739974976 sec


| Model            | Method     | F1                 | Exact Match | Time (sec)          |
|------------------|------------|--------------------|-------------|---------------------|
| en_core_web_sm   | Rule-based | 0.6833333333333333 | 0.65        | 0.31165207624435426 |
| en_core_web_sm   | BERT       | 0.6365079365079365 | 0.5         | 1.7874438524246217  |
| en_core_web_trf  | Rule-based | 0.7833333333333333 | 0.75        | 1.4920156717300415  |
| en_core_web_trf  | BERT       | 0.6365079365079365 | 0.5         | 1.759422981739044   |
| **en_core_web_md**   | **Rule-based** | **0.7833333333333333** | **0.75**        | **0.7362376689910889**  |
| **en_core_web_md**   | **BERT**       | **0.6365079365079365** | **0.5**         | **1.8217221975326539**  |
| en_core_web_lg   | Rule-based | 0.7333333333333333 | 0.7         | 0.9263777494430542  |
| en_core_web_lg   | BERT       | 0.6365079365079365 | 0.5         | 1.8769330739974976  |


### 4.5.1 The IR-QA system with "en_core_web_md" model

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from fastcoref import spacy_component
from fastcoref import LingMessCoref
import spacy
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import nltk
import torch
from transformers import BertForQuestionAnswering, BertTokenizer


class Information_Retrieval_System:
    def __init__(self, df, question):
        self.df = df
        self.question = question

        self.top_articles = self.article_retrieval(df, question)
        self.resolved_articles = [self.Coreference_Resolution(article) for article in self.top_articles['article']]
        self.most_relevant_sentence, self.similarity = self.text_matching(self.resolved_articles, question)

    
    def article_retrieval(self,df, question):
        tfidf_configs = {
            'lowercase': True,
            'analyzer': 'word',
            'stop_words': 'english',
            'binary': True,
            'max_df': 0.9,
            'max_features': 10_000
        }

        retriever_configs = {
            'n_neighbors': 10, 
            'metric': 'cosine'
        }

        embedding = TfidfVectorizer(**tfidf_configs)
        retriever = NearestNeighbors(**retriever_configs)

        X = embedding.fit_transform(df['article'])
        retriever.fit(X, df['id'].values.reshape(-1, 1))

        X_question = embedding.transform([question])

        distances, indices = retriever.kneighbors(X_question, return_distance=True)

        return df.iloc[indices[0]]
    
    def Coreference_Resolution(self,text):
        nlp = spacy.load("en_core_web_md")
        nlp.add_pipe("fastcoref")
        doc = nlp(text)
        # nlp.add_pipe("fastcoref", config={'model_architecture': 'LingMessCoref', 'model_path': 'biu-nlp/lingmess-coref', 'device': 'cpu'})
        doc = nlp(text, component_cfg={"fastcoref": {'resolve_text': True}})
        return doc._.resolved_text
    
    def text_matching(self,top_articles,question):
        sentences = nltk.sent_tokenize(" ".join(top_articles))
        vectorizer = TfidfVectorizer(stop_words='english')
        combined_text = [question] + sentences
        tfidf_matrix = vectorizer.fit_transform(combined_text)
        cosine_similarities = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])

        # Find the index of the highest similarity score
        most_similar_sentence_index = cosine_similarities.argmax()

        # Return the most relevant sentence and its similarity score
        return sentences[most_similar_sentence_index], cosine_similarities[0, most_similar_sentence_index]
    
    def Answer_Extraction(self,sentence,similarity,question):
        nlp = spacy.load("en_core_web_md")
        doc = nlp(sentence)
        sample_doc = {i.text:i.label_ for i in doc.ents}
        question = question.lower()
        if similarity > 0.35:
                try:
                    if question.startswith('who'):
                        who_list = [k for k,v in sample_doc.items() if v=='PERSON']
                        return who_list[0] if len (who_list)>0 else 'NO ANSWER'
                    elif question.startswith('where'):
                        where_list = [k for k,v in sample_doc.items() if v=='LOC' or  v=='GPE']
                        return where_list[0] if len (where_list) >0 else 'NO ANSWER'
                    elif question.startswith('when'):
                        when_list = [k for k,v in sample_doc.items() if v=='DATE']
                        return when_list[0] if len (when_list) >0 else 'NO ANSWER'
                    elif question.startswith('what'):
                        what_list = [k for k,v in sample_doc.items() if v=='ORG']
                        return what_list[0] if len (what_list) >0 else 'NO ANSWER'
                    elif question.startswith('how many'):
                        how_many_list = [k for k,v in sample_doc.items() if v=='MONEY' or  v=='DATE']
                        return how_many_list[0] if len (how_many_list) >0 else 'NO ANSWER'
                except:
                    return 'NO ANSWER'
        else:
            return "NO ANSWER"

    def Rule_base(self):
        return self.Answer_Extraction(self.most_relevant_sentence, self.similarity, self.question)

    
    def Bert(self):
        model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
        tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

        if self.similarity > 0.35:
            input_ids = tokenizer.encode(self.question, self.most_relevant_sentence, max_length=512, truncation=True)

            tokens = tokenizer.convert_ids_to_tokens(input_ids)
            sep_index = input_ids.index(tokenizer.sep_token_id)
            num_seg_a = sep_index + 1
            num_seg_b = len(input_ids) - num_seg_a
            segment_ids = [0]*num_seg_a + [1]*num_seg_b

            model_scores = model.forward(torch.tensor([input_ids]),token_type_ids=torch.tensor([segment_ids]))

            start_scores = model_scores.start_logits
            end_scores = model_scores.end_logits

            answer_start = torch.argmax(start_scores)
            answer_end = torch.argmax(end_scores)

            return ' '.join(tokens[answer_start:answer_end+1])
        else:
            return "NO ANSWER"


### 4.5.2 Demonstration of `Information_Retrieval_System`

In [41]:
question = 'Who is the vice chairman of Samsung?'


ir = Information_Retrieval_System(df_clean,question)

03/30/2024 14:04:45 - INFO - 	 missing_keys: []
03/30/2024 14:04:45 - INFO - 	 unexpected_keys: []
03/30/2024 14:04:45 - INFO - 	 mismatched_keys: []
03/30/2024 14:04:45 - INFO - 	 error_msgs: []
03/30/2024 14:04:45 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/30/2024 14:04:45 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 37.04 examples/s]
03/30/2024 14:04:45 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.37it/s]
03/30/2024 14:04:45 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 38.41 examples/s]
03/30/2024 14:04:46 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00,  6.51it/s]
03/30/2024 14:04:47 - INFO - 	 missing_keys: []
03/30/2024 14:04:47 - INFO - 	 unexpected_keys: []
03/30/2024 14:04:47 - INFO - 	 mismatched_keys: []
03/30/2024 14:04:47 - INFO - 	 error_msgs: []
03/30/2024 14:04:47 - INFO - 	 Model 

In [42]:
print(ir.Rule_base())
print(ir.Bert())

Jay Y. Lee
jay y . lee


## 5. Testing

### 5.1 The evaluation function for IR-QA system

In [44]:
import pandas as pd
import time

def evaluate_information_retrieval_system(df, questions, answers):
    rule_based_exact_matches = []
    rule_based_f1_scores = []
    bert_exact_matches = []
    bert_f1_scores = []
    
    rule_based_timings = []
    bert_timings = []

    for question, true_answer in zip(questions, answers):
        # Initialize the Information Retrieval System for each question
        ir_system = Information_Retrieval_System(df, question)
        
        # Rule-based answer retrieval and timing
        start_time = time.time()
        rule_based_answer = ir_system.Rule_base()
        end_time = time.time()
        rule_based_timings.append(end_time - start_time)
        
        # Compute Exact Match and F1 score for Rule-based method
        rule_based_exact_matches.append(compute_exact(true_answer, rule_based_answer))
        rule_based_f1_scores.append(compute_f1(true_answer, rule_based_answer))
        
        # BERT answer retrieval and timing
        start_time = time.time()
        bert_answer = ir_system.Bert()
        end_time = time.time()
        bert_timings.append(end_time - start_time)
        
        # Compute Exact Match and F1 score for BERT method
        bert_exact_matches.append(compute_exact(true_answer, bert_answer))
        bert_f1_scores.append(compute_f1(true_answer, bert_answer))
    
    # Calculate and return the average Exact Match, F1 scores, and timings for both methods
    avg_rule_based_exact_match = sum(rule_based_exact_matches) / len(rule_based_exact_matches)
    avg_rule_based_f1 = sum(rule_based_f1_scores) / len(rule_based_f1_scores)
    avg_bert_exact_match = sum(bert_exact_matches) / len(bert_exact_matches)
    avg_bert_f1 = sum(bert_f1_scores) / len(bert_f1_scores)
    avg_rule_based_time = sum(rule_based_timings) / len(rule_based_timings)
    avg_bert_time = sum(bert_timings) / len(bert_timings)
    
    return {
        "Rule-based": {"Exact Match": avg_rule_based_exact_match, "F1": avg_rule_based_f1, "Avg Time (s)": avg_rule_based_time},
        "BERT": {"Exact Match": avg_bert_exact_match, "F1": avg_bert_f1, "Avg Time (s)": avg_bert_time}
    }


### 5.2 SQuAD Dev Set for Testing

The Stanford Question Answering Dataset (SQuAD) Dev v2.0, a benchmark in reading comprehension, was employed to test the system's proficiency in handling real-world data. It required converting the dataset from JSON to a more manageable data frame format, enabling a structured testing environment where each article's first question and answer were extracted to assess the system's effectiveness in understanding and responding to varied queries (The Stanford NLP Group 2024).

In [45]:
test_data = pd.read_json("dev-v2.0.json")

dataset = []

for i in range(len(test_data["data"])):
    for e in range(len(test_data["data"][i]["paragraphs"])):
        data_dict = {}
        data_dict["id"] = test_data["data"][i]["paragraphs"][e]["qas"][0]["id"]
        data_dict["article"] = test_data["data"][i]["paragraphs"][e]["context"]
        data_dict["question"] = test_data["data"][i]["paragraphs"][e]["qas"][0]["question"]
        data_dict["answer"] = test_data["data"][i]["paragraphs"][e]["qas"][0]["answers"][0]["text"]
        dataset.append(data_dict)

test_df = pd.DataFrame(dataset)[0:500]

test_df.head()

Unnamed: 0,id,article,question,answer
0,56ddde6b9a695914005b9628,The Normans (Norman: Nourmands; French: Norman...,In what country is Normandy located?,France
1,56dddf4066d3e219004dad5f,"The Norman dynasty had a major political, cult...",Who was the duke in the battle of Hastings?,William the Conqueror
2,56dde0379a695914005b9636,"The English name ""Normans"" comes from the Fren...",What is the original meaning of the word Norman?,Viking
3,56dde0ba66d3e219004dad75,"In the course of the 10th century, the initial...",When was the Duchy of Normandy founded?,911
4,56dde1d966d3e219004dad8d,"Before Rollo's arrival, its populations did no...",Who upon arriving gave the original viking set...,Rollo


In [None]:
index = 254
squad_df = test_df[index-20:index]
squad_question = squad_df["question"]
squad_answer = squad_df["answer"]

evaluate_information_retrieval_system(test_df,squad_question, squad_answer)

03/29/2024 13:50:20 - INFO - 	 missing_keys: []
03/29/2024 13:50:20 - INFO - 	 unexpected_keys: []
03/29/2024 13:50:20 - INFO - 	 mismatched_keys: []
03/29/2024 13:50:20 - INFO - 	 error_msgs: []
03/29/2024 13:50:20 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/29/2024 13:50:20 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 166.69 examples/s]
03/29/2024 13:50:20 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 26.25it/s]
03/29/2024 13:50:20 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 166.47 examples/s]
03/29/2024 13:50:20 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 27.02it/s]
03/29/2024 13:50:21 - INFO - 	 missing_keys: []
03/29/2024 13:50:21 - INFO - 	 unexpected_keys: []
03/29/2024 13:50:21 - INFO - 	 mismatched_keys: []
03/29/2024 13:50:21 - INFO - 	 error_msgs: []
03/29/2024 13:50:21 - INFO - 	 Mode

{'Rule-based': {'Exact Match': 0.15,
  'F1': 0.15,
  'Avg Time (s)': 1.5228856801986694},
 'BERT': {'Exact Match': 0.25,
  'F1': 0.2977777777777778,
  'Avg Time (s)': 1.981279981136322}}

|                | Exact Match | F1                | Avg Time (s)      |
|----------------|-------------|-------------------|-------------------|
| Rule-based     | 0.15        | 0.15              | 1.5228856801986694|
| BERT           | 0.25        | 0.2977777777777778| 1.981279981136322 |


## B. References

Ke, Y.H., Jin, L., Elangovan, K., Abdullah, H.R., Liu, N., Sia, A.T.H., Soh, C.R., Tung, J.Y.M., Ong, J.C.L. & Ting, D.S.W. 2024, 'Development and Testing of Retrieval Augmented Generation in Large Language Models -- A Case Study Report', ArXiv, pp. 1-22

The Stanford Natural Language Processing Group. 2024, Dev Set v2.0, SQuAD2.0 The Stanford Question Answering Dataset, viewed 29 March 2024, <https://rajpurkar.github.io/SQuAD-explorer/>.

The Stanford Natural Language Processing Group. 2024, Evaluation Script 2.0, SQuAD2.0 The Stanford Question Answering Dataset, viewed 29 March 2024, <https://rajpurkar.github.io/SQuAD-explorer/>.

spaCy. 2024, Trained Pipelines English, viewed 29 March 2024, <https://spacy.io/models/en> 

Otmazgin, S., Cattan, A. & Goldberg, Y. 2022, 'F-coref: Fast, Accurate and Easy to Use Coreference Resolution', Proceedings of the AACL


## C. Appendix

### The threshold for "NO ANSWER" question

In [35]:
def similarity_threshold(df, questions):
    similarity_list = []

    for i in questions:
        ir_system = Information_Retrieval_System(df, i)
        similarity_list.append(ir_system.similarity)
    
    similarity_avg = sum(similarity_list) / len(similarity_list)

    return similarity_avg


In [36]:
no_answer_df = self_define_question_df[self_define_question_df["Answer"]=="NO ANSWER"]
no_answer_question = no_answer_df["Question"]

similarity_threshold(df_clean,no_answer_question)

03/29/2024 12:01:33 - INFO - 	 missing_keys: []
03/29/2024 12:01:33 - INFO - 	 unexpected_keys: []
03/29/2024 12:01:33 - INFO - 	 mismatched_keys: []
03/29/2024 12:01:33 - INFO - 	 error_msgs: []
03/29/2024 12:01:33 - INFO - 	 Model Parameters: 90.5M, Transformer: 82.1M, Coref head: 8.4M
03/29/2024 12:01:33 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 90.88 examples/s]
03/29/2024 12:01:33 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 11.88it/s]
03/29/2024 12:01:33 - INFO - 	 Tokenize 1 inputs...
Map: 100%|██████████| 1/1 [00:00<00:00, 95.67 examples/s]
03/29/2024 12:01:33 - INFO - 	 ***** Running Inference on 1 texts *****
Inference: 100%|██████████| 1/1 [00:00<00:00, 11.94it/s]
03/29/2024 12:01:34 - INFO - 	 missing_keys: []
03/29/2024 12:01:34 - INFO - 	 unexpected_keys: []
03/29/2024 12:01:34 - INFO - 	 mismatched_keys: []
03/29/2024 12:01:34 - INFO - 	 error_msgs: []
03/29/2024 12:01:34 - INFO - 	 Model 

0.31066734131806223