In [1]:
import json
import re

import numpy as np
import pandas as pd
from pandas import json_normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the SQuAD 2.0 training set. JSON text is UTF-8 by specification, so the
# encoding is stated explicitly rather than left to the platform default
# (a legacy code page on Windows), which can mis-decode non-ASCII characters.
with open(r"D:\02 Personal Files\Thesis Related\Main Thesis Project\main\thesis\datasets\squad2.0\train-v2.0.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# Already-labelled questions; passed further down to
# classify_using_cosine_similarity as the reference samples.
df_samples = pd.read_csv(r"D:\02 Personal Files\Thesis Related\Main Thesis Project\main\thesis\model_training\output.csv")

# Check the first few rows to confirm it loaded correctly
print(df_samples.head())

   Unnamed: 0    title                                            context  \
0           0  Beyoncé  Beyoncé Giselle Knowles-Carter (/BiːˈJɒnseɪ/ B...   
1           1  Beyoncé  Beyoncé Giselle Knowles-Carter (/BiːˈJɒnseɪ/ B...   
2           2  Beyoncé  Beyoncé Giselle Knowles-Carter (/BiːˈJɒnseɪ/ B...   
3           3  Beyoncé  Beyoncé Giselle Knowles-Carter (/BiːˈJɒnseɪ/ B...   
4           4  Beyoncé  Beyoncé Giselle Knowles-Carter (/BiːˈJɒnseɪ/ B...   

                                            question                  answers  \
0           When Did Beyonce Start Becoming Popular?    ['In The Late 1990S']   
1  What Areas Did Beyonce Compete In When She Was...  ['Singing And Dancing']   
2  When Did Beyonce Leave Destiny'S Child And Bec...                 ['2003']   
3      In What City And State Did Beyonce  Grow Up?        ['Houston, Texas']   
4         In Which Decade Did Beyonce Become Famous?           ['Late 1990S']   

  question_type  
0       Literal  
1       Litera

In [6]:
def prepare_data(data):
    """Flatten a SQuAD-style dataset into one record per question.

    Args:
        data: Mapping with a "data" list of articles. Each article has a
            "title" and a "paragraphs" list; each paragraph has a "context"
            and a "qas" list; each qa has a "question" and an "answers" list
            whose items carry a "text" field.

    Returns:
        list[dict]: One dict per question with the keys "title", "context",
        "question" and "answers" (a list of answer strings). Every text value
        is passed through ``str.title()``; underscores in the article title
        are replaced by spaces first. Questions with no answers are kept and
        get an empty "answers" list.

    Raises:
        KeyError: If one of the fields listed above is missing.
    """
    articles = []

    for article in data["data"]:
        # Computed once per article / paragraph and shared by all their questions.
        title = article["title"].replace('_', ' ').title()

        for paragraph in article["paragraphs"]:
            context = paragraph["context"].title()

            for qa in paragraph["qas"]:
                # The qa "id" is deliberately not read: it was never used, and
                # binding it to a local named `id` shadowed the builtin.
                articles.append({
                    "title": title,
                    "context": context,
                    "question": qa["question"].title(),
                    "answers": [answer["text"].title() for answer in qa["answers"]],
                })

    return articles


In [None]:
# NOTE(review): this rebinds `data`, the name the SQuAD JSON was loaded into and
# that `prepare_data(data)` reads further down. Run top-to-bottom, that call
# would receive this object (a DataFrame by read_json's default) instead of the
# SQuAD dict — consider a distinct variable name. TODO confirm intended usage.
data = pd.read_json(r"D:\02 Personal Files\Thesis Related\Main Thesis Project\main\thesis\generated_questions")

In [None]:
def learneddata(data):
    """Convert generated-question items into flat records.

    Each item must provide a "passage" string and a
    "questions-choices-answer" mapping with "question", "choices" and
    "answer" entries.

    Returns a list with one dict per item holding:
      * "title" and "context" - both the title-cased passage text,
      * "question" - the title-cased question,
      * "answers" - the "choices" value, passed through unchanged,
      * "answer" - the title-cased answer.
    """
    def _to_record(entry):
        passage_text = entry["passage"].title()
        qca = entry["questions-choices-answer"]
        return {
            "title": passage_text,
            "context": passage_text,
            "question": qca["question"].title(),
            "answers": qca["choices"],
            "answer": qca["answer"].title(),
        }

    return [_to_record(entry) for entry in data]


In [7]:
# Flatten the loaded dataset into one row per question, then keep only the
# rows whose "answers" list is non-empty.
rawdata = prepare_data(data)
df = pd.DataFrame(rawdata).loc[lambda frame: frame['answers'].apply(bool)]



In [6]:
# Display the column labels of the filtered frame.
df.columns

Index(['title', 'context', 'question', 'answers'], dtype='object')

In [9]:
# Keyword stems per question type, checked in this order; the first type with
# a hit wins. Each pattern is anchored at a word start (\b) so a stem still
# matches its inflections ("suggests", "whose") but no longer matches in the
# middle of an unrelated word ("show" is not "how", "somewhat" is not "what").
_QUESTION_TYPE_PATTERNS = (
    ("Literal", re.compile(r"\b(?:what|who|when|where)")),
    ("Inferential", re.compile(r"\b(?:why|how|infer|suggest)")),
    ("Evaluative", re.compile(r"\b(?:evaluate|opinion|discuss)")),
)


def classify_question(question):
    """Assign a coarse question type from keyword stems.

    The question is lower-cased and tested against the stem groups in the
    order Literal -> Inferential -> Evaluative, so a question containing both
    "why" and "what" is labelled "Literal".

    Args:
        question: Question text.

    Returns:
        str: "Literal", "Inferential", "Evaluative", or "Unknown" when no
        word in the question starts with any of the stems.

    Note:
        Matching is on word prefixes, so a word that merely begins with a
        stem (e.g. "whole") still counts as a hit.
    """
    text = question.lower()
    for label, pattern in _QUESTION_TYPE_PATTERNS:
        if pattern.search(text):
            return label
    return "Unknown"

In [10]:
# Label every question with the keyword-based classifier.
df['question_type'] = df['question'].map(classify_question)

In [11]:
# Keep only the rows that received a concrete label.
df_cleaned = df.loc[df['question_type'].ne("Unknown")]

In [12]:
def classify_using_cosine_similarity(question, df_cleaned, threshold=0.5):
    """Label a question with the type of its most similar reference question.

    The question and all reference questions are embedded together with a
    freshly fitted ``TfidfVectorizer``. Each category is scored by the highest
    cosine similarity between the question and any of that category's
    reference questions, and the best-scoring category is returned if its
    score reaches ``threshold``.

    Args:
        question: Question text to classify.
        df_cleaned: DataFrame with a 'question' column (reference texts) and a
            'question_type' column (their labels).
        threshold: Minimum best similarity required to accept a label.

    Returns:
        The best-matching value from 'question_type', or "Unknown" when the
        best similarity is below ``threshold`` or there are no reference
        questions at all.

    Note:
        The vectorizer is refit on the full reference set on every call, so
        the cost of one call grows with the size of ``df_cleaned``.
    """
    # Group reference questions by label. The dict's insertion order defines
    # both the layout of the similarity vector and tie-breaking further down.
    sample_questions = {
        category: df_cleaned.loc[df_cleaned['question_type'] == category, 'question'].tolist()
        for category in df_cleaned['question_type'].unique()
    }

    all_samples = [text for questions in sample_questions.values() for text in questions]
    if not all_samples:
        # Nothing to compare against.
        return "Unknown"

    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform([question] + all_samples)

    # Row 0 is the query; the remaining rows follow the order of all_samples.
    similarities = cosine_similarity(vectors[0], vectors[1:]).flatten()

    # Walk the similarity vector with a running offset so each category is
    # scored on its own rows. (Using the category's position in the dict as
    # the slice start made every category after the first read rows that
    # belong to earlier categories.)
    category_similarities = {}
    start = 0
    for category, questions in sample_questions.items():
        stop = start + len(questions)
        if questions:  # a label that matched no rows has nothing to score
            category_similarities[category] = similarities[start:stop].max()
        start = stop

    best_category, best_similarity = max(category_similarities.items(), key=lambda item: item[1])

    if best_similarity >= threshold:
        return best_category
    return "Unknown"

In [16]:
# Rows the keyword classifier could not label.
df_unknown = df.loc[df['question_type'].eq("Unknown")]

In [17]:
# Second pass: give each still-"Unknown" question the label of its most similar
# reference question from df_samples. The assignment aligns on the index, so
# each result lands on the row it was computed from.
if df_unknown.size > 0:
    df.loc[df['question_type'].eq("Unknown"), 'question_type'] = df_unknown['question'].map(
        lambda text: classify_using_cosine_similarity(text, df_samples, threshold=0.5)
    )

In [19]:
# Recompute the labelled subset after the similarity-based pass.
df_cleaned = df.loc[df['question_type'].ne("Unknown")]

In [20]:
# Persist the labelled rows. The index is written as well (to_csv default), so
# it comes back as an unnamed first column when the file is re-read with
# read_csv.
# NOTE(review): the load cell near the top reads an "output.csv" into
# df_samples — confirm whether that is this same file; if so, a fresh run
# depends on the output of a previous run.
df_cleaned.to_csv("output.csv")