In [239]:
import ast  # Import the ast module (safe parsing of Python literals stored as text)
import random  # Import the random module

import pandas as pd  # Import the pandas library and alias it as 'pd'
import spacy  # Import the spacy library
from sklearn.model_selection import train_test_split  # Import the train_test_split function from sklearn
from sklearn.metrics import accuracy_score, classification_report  # Import metrics from sklearn
from sklearn.ensemble import RandomForestClassifier  # Import the RandomForestClassifier from sklearn

In [240]:
# Load the parametric-answers dataset directly from the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/ShreyAgarwal11/NLP_Project/main/Data/parametric_answers.csv'
test_df = pd.read_csv(DATA_URL).drop(columns='Unnamed: 0')  # 'Unnamed: 0' is presumably a saved index column; it is not used below
row_count = len(test_df)
test_df['id'] = range(1, row_count + 1)  # 1-based sequential identifier, one per row

In [241]:
nlp = spacy.load('en_core_web_sm')  # Load spaCy's small English pipeline; its named-entity recognizer labels the answers

# Entity labels considered plausible for each kind of question.
TIME_LABELS = ['TIME', 'DATE']
WHERE_LABELS = ['GPE', 'EVENT', 'NORP', 'LOC', 'FAC']
WHICH_LABELS = ['PRODUCT', 'LANGUAGE', 'LOC', 'ORG', 'EVENT', 'NORP', 'GPE', 'WORK_OF_ART']
WHAT_LABELS = ['PRODUCT', 'LANGUAGE', 'ORG', 'WORK_OF_ART']
QUANTITY_LABELS = ['QUANTITY', 'CARDINAL', 'MONEY', 'PERCENT']


def _question_token_set(raw_tokens):
  """Return the question's tokens as a set of whole words.

  `question_tokens` is read from the CSV as the *text* of a Python list
  (e.g. "['who', 'is', ...]"). Testing `'how' in <that string>` is a substring
  test, so it also fires on 'show', and 'who' fires on 'whose' / 'whole'.
  Parsing the text back into tokens makes every check a whole-word check.

  Anything that cannot be interpreted as a token list yields an empty set,
  which makes the row fall into the 'DEFAULT' bucket.
  """
  parsed = raw_tokens
  if isinstance(raw_tokens, str):
    try:
      parsed = ast.literal_eval(raw_tokens)
    except (ValueError, SyntaxError):
      parsed = raw_tokens.split()  # not a list literal: fall back to whitespace-separated words
  if isinstance(parsed, (list, tuple, set)):
    return {str(token) for token in parsed}
  return set()


def _expected_labels(tokens):
  """Return the entity labels an answer is expected to have, given the question's tokens.

  The question words are checked in a fixed priority order (who, where, which,
  what, when, how); the first one present decides. A fresh list is returned on
  every call so rows never share a mutable list.
  """
  if 'who' in tokens:
    return ['PERSON']
  if 'where' in tokens:
    return list(WHERE_LABELS)
  if 'which' in tokens:
    return list(WHICH_LABELS)
  if 'what' in tokens:
    # "what time ..." asks for a time/date; any other "what" asks for a thing.
    return list(TIME_LABELS) if 'time' in tokens else list(WHAT_LABELS)
  if 'when' in tokens:
    return list(TIME_LABELS)
  if 'how' in tokens and ('many' in tokens or 'much' in tokens):
    return list(QUANTITY_LABELS)
  # No recognised question word. This also covers a bare "how", which previously
  # ended up with an empty label list instead of the default bucket.
  return ['DEFAULT']


answer_type = {}  # answer text -> label of the first named entity spaCy finds in it ('NaN' if none)
answer_type_n2 = {}  # answer text -> entity labels that match the question type (or the expected labels if none match)
# Both dictionaries are keyed by the answer text, so when several rows share the
# same answer the last such row determines the stored value.
for question_tokens, answer in zip(test_df['question_tokens'], test_df['answer']):
  expected_lb = _expected_labels(_question_token_set(question_tokens))
  lb = [ent.label_ for ent in nlp(answer).ents]  # labels of every entity found in the answer, in order
  answer_type[answer] = lb[0] if lb else 'NaN'
  new_lb2 = [tag for tag in lb if tag in expected_lb]  # keep only the labels the question type allows
  answer_type_n2[answer] = new_lb2 if new_lb2 else expected_lb

In [242]:
# Basic counterfactual: for each row, draw a random *different* answer whose first
# named-entity label equals that of the row's own answer ('NaN' when there is none).
COUNTERFACTUAL_SEED = 42  # fixed seed so re-running the cell reproduces the same column
counterfactual_rng = random.Random(COUNTERFACTUAL_SEED)  # private generator; the global `random` state is left alone

# Group the distinct answers by label once, instead of rescanning the whole
# dictionary for every row. Insertion order is preserved within each group.
answers_by_label = {}
for candidate, label in answer_type.items():
  answers_by_label.setdefault(label, []).append(candidate)

counterfactuals = []
for answer in test_df['answer']:
  same_label = [candidate for candidate in answers_by_label[answer_type[answer]] if candidate != answer]
  # Only draw when at least one other answer shares the label.
  counterfactuals.append(counterfactual_rng.choice(same_label) if same_label else 'NaN')
test_df['counterfactual'] = counterfactuals

In [243]:
# Enhanced counterfactual: for each row, draw a random *different* answer that shares
# at least one question-type-filtered entity label with the row's own answer.
ENHANCED_SEED = 43  # fixed seed so re-running the cell reproduces the same column
enhanced_rng = random.Random(ENHANCED_SEED)  # private generator; the global `random` state is left alone
all_answers = test_df['answer'].tolist()

enhanced_counterfactuals = []
for answer in all_answers:
  wanted_labels = set(answer_type_n2[answer])
  # Each candidate is listed once, however many labels it shares, so the draw is
  # uniform over candidates rather than weighted by the number of shared labels.
  pool = [
    candidate
    for candidate, labels in answer_type_n2.items()
    if candidate != answer and not wanted_labels.isdisjoint(labels)
  ]
  if not pool:
    # Nothing shares a label: fall back to any other answer in the dataset, but
    # never the row's own answer (which would not be a counterfactual at all).
    pool = [candidate for candidate in all_answers if candidate != answer]
  # 'NaN' mirrors the sentinel used by the basic 'counterfactual' column.
  enhanced_counterfactuals.append(enhanced_rng.choice(pool) if pool else 'NaN')
test_df['counterfactual_enhanced'] = enhanced_counterfactuals

In [244]:
# Lookup table: row 'id' -> that row's 'predicted_parametric_answers'.
parametric = {
  test_df.at[row, 'id']: test_df.at[row, 'predicted_parametric_answers']
  for row in range(len(test_df))
}

In [245]:
# Cut the DataFrame positionally into its first 60% and the remaining 40%.
FIRST_PART_FRACTION = 0.6
split_index = int(FIRST_PART_FRACTION * len(test_df))  # number of rows in the first part (rounded down)
test_df60, test_df40 = test_df.iloc[:split_index], test_df.iloc[split_index:]

Initialize empty dictionaries to store probabilities and answers for the first 60% of the DataFrame

In [258]:
# First 60% of the rows: the two candidates are [answer, counterfactual_enhanced],
# and the first candidate gets a randomly drawn probability between 0.9 and 1.
PROBABILITY_SEED_60 = 44  # fixed seed so re-running the cell reproduces the same probabilities
rng_60 = random.Random(PROBABILITY_SEED_60)  # private generator; the global `random` state is left alone

probabilities_60 = {}  # id -> [probability of first candidate, probability of second candidate]
answers_60 = {}  # id -> [first candidate, second candidate]
# Iterate over the columns directly so the loop does not depend on the index labels.
for row_id, answer, enhanced in zip(test_df60['id'], test_df60['answer'], test_df60['counterfactual_enhanced']):
  probability = rng_60.uniform(0.9, 1)
  probabilities_60[row_id] = [probability, 1 - probability]  # the two probabilities sum to 1
  answers_60[row_id] = [answer, enhanced]

Initialize empty dictionaries to store probabilities and answers for the remaining 40% of the DataFrame

In [247]:
# Remaining 40% of the rows: the two candidates are [counterfactual_enhanced, counterfactual],
# and the first candidate gets a randomly drawn probability between 0.7 and 1.
PROBABILITY_SEED_40 = 45  # fixed seed so re-running the cell reproduces the same probabilities
rng_40 = random.Random(PROBABILITY_SEED_40)  # private generator; the global `random` state is left alone

probabilities_40 = {}  # id -> [probability of first candidate, probability of second candidate]
answers_40 = {}  # id -> [first candidate, second candidate]
# Iterate over the columns directly: no need to reconstruct the index labels of the second slice by hand.
for row_id, enhanced, basic in zip(test_df40['id'], test_df40['counterfactual_enhanced'], test_df40['counterfactual']):
  probability = rng_40.uniform(0.7, 1)
  probabilities_40[row_id] = [probability, 1 - probability]  # the two probabilities sum to 1
  answers_40[row_id] = [enhanced, basic]

In [248]:
semantic_60 = {}  # id -> [similarity(parametric answer, first candidate), similarity(parametric answer, second candidate)]
# NOTE(review): `nlp` is the 'en_core_web_sm' pipeline; spaCy's documentation says the small
# pipelines ship without word vectors, so `similarity` is only a coarse signal here — confirm
# whether a vectors-enabled pipeline (e.g. 'en_core_web_md') should be used instead.
for key, (first_candidate, second_candidate) in answers_60.items():
  # str() is needed because a missing prediction is read as NaN; note it is then compared as the text 'nan'.
  para = nlp(str(parametric[key]))
  # Computed once per row (the statement used to be repeated, doubling the spaCy work).
  semantic_60[key] = [para.similarity(nlp(first_candidate)), para.similarity(nlp(second_candidate))]


In [249]:
semantic_40 = {}  # id -> [similarity(parametric answer, first candidate), similarity(parametric answer, second candidate)]
# NOTE(review): `nlp` is the 'en_core_web_sm' pipeline; spaCy's documentation says the small
# pipelines ship without word vectors, so `similarity` is only a coarse signal here — confirm
# whether a vectors-enabled pipeline (e.g. 'en_core_web_md') should be used instead.
for key, (first_candidate, second_candidate) in answers_40.items():
  # str() is needed because a missing prediction is read as NaN; note it is then compared as the text 'nan'.
  para = nlp(str(parametric[key]))
  # Computed once per row (the statement used to be repeated, doubling the spaCy work).
  semantic_40[key] = [para.similarity(nlp(first_candidate)), para.similarity(nlp(second_candidate))]


Initialize empty dictionaries to store final contextual answers and scores

In [250]:
final_contextual_answer = {}  # id -> the candidate answer with the higher weighted score
final_score = {}  # id -> that winning weighted score (probability * semantic similarity)

# The same selection rule applies to both partitions, so walk them with one loop:
# first the 60% partition, then the 40% partition.
partitions = (
  (probabilities_60, semantic_60, answers_60),
  (probabilities_40, semantic_40, answers_40),
)
for partition_probabilities, partition_semantics, partition_answers in partitions:
  for key in partition_semantics:
    first_probability, second_probability = partition_probabilities[key]
    first_similarity, second_similarity = partition_semantics[key]
    first_weighted = first_probability * first_similarity
    second_weighted = second_probability * second_similarity
    # Ties go to the first candidate.
    winner = 0 if first_weighted >= second_weighted else 1
    final_contextual_answer[key] = partition_answers[key][winner]
    final_score[key] = (first_weighted, second_weighted)[winner]

In [251]:
# Attach the per-id results to the DataFrame as two new columns, looked up through 'id'.
for column_name, per_id_values in (
  ('Contextual Answer', final_contextual_answer),
  ('Final Score', final_score),
):
  test_df[column_name] = test_df['id'].map(per_id_values)

In [252]:
test_df # Display the final DataFrame: as the cell's last expression, Jupyter renders it as the cell output

Unnamed: 0,question_text,question_tokens,predicted_parametric_answers,answer,id,counterfactual,counterfactual_enhanced,Contextual Answer,Final Score
0,a collection of fluid in the tissues is called,"['a', 'collection', 'of', 'fluid', 'in', 'the'...",water retention,water retention ( also known as fluid retenti...,1,Blair Waldorf,Cause of Action,water retention ( also known as fluid retenti...,0.413642
1,a country that begins with the letter d,"['a', 'country', 'that', 'begins', 'with', 'th...",denmark,Denmark,2,Zazu,The City of Light ( La Ville Lumière ),Denmark,0.235434
2,a day in the life of marlon bundo audiobook cast,"['a', 'day', 'in', 'the', 'life', 'of', 'marlo...",,Jim Parsons,3,releases hyaluronidase and acrosin,about the band members Gerard Way and Mikey W...,Jim Parsons,0.245064
3,a means for two processes to exchange informat...,"['a', 'means', 'for', 'two', 'processes', 'to'...",inter process communication or interprocess co...,inter-process communication or interprocess c...,4,Speaker of the House,An ecological pyramid ( also trophic pyramid ...,inter-process communication or interprocess c...,0.630751
4,a monomer of nucleic acids is called what,"['a', 'monomer', 'of', 'nucleic', 'acids', 'is...",nucleotides,nucleotides,5,warden,physical training,nucleotides,0.375228
...,...,...,...,...,...,...,...,...,...
2025,why is the british parliament known as the mot...,"['why', 'is', 'the', 'british', 'parliament', ...",because of the adoption of the westminster mod...,because of the adoption of the Westminster mo...,2026,"Rockbridge County , Virginia",promote fair competition,promote fair competition,0.095744
2026,why w and y are called semi vowels,"['why', 'w', 'and', 'y', 'are', 'called', 'sem...",semivowel or glide,phonetically similar to a vowel sound but fun...,2027,Moon Child,Baily Bridge,Baily Bridge,0.021766
2027,why was virginia capital moved from williamsbu...,"['why', 'was', 'virginia', 'capital', 'moved',...",richmond,Governor Thomas Jefferson urged it that the c...,2028,Sri Lanka Matha,Observation,Observation,0.200033
2028,write the name of the fuels commonly used for ...,"['write', 'the', 'name', 'of', 'the', 'fuels',...",conventional diesel,Kerosene,2029,Loretta Devine,mechanical,mechanical,0.217146
