## Specs Detail

In [None]:
import os
import psutil
import torch

# CPU Model
cpu_info = !lscpu | grep "Model name"
print(f"CPU: {cpu_info[0]}")

# Number of CPU Cores
# physical_cores = psutil.cpu_count(logical=False)
# logical_cores = psutil.cpu_count(logical=True)
# print(f"CPU Cores: {physical_cores} physical, {logical_cores} logical")

# Total RAM
ram = psutil.virtual_memory()
print(f"Total RAM: {ram.total / 1e9:.2f} GB")

# Disk Space
disk = psutil.disk_usage('/')
print(f"Disk Space: {disk.total / 1e9:.2f} GB")

# GPU Details
gpu_name = !nvidia-smi --query-gpu=name --format=csv,noheader
if torch.cuda.is_available():
    device_id = torch.cuda.current_device()  # Get current CUDA device index
    print(f"CUDA Device ID: {device_id}")
    print(f"CUDA Available: {torch.cuda.is_available()}")
    print(f"CUDA Version: {torch.version.cuda}")
else:
    print("No GPU found.")

print(pyth)

In [None]:
!python --version

## Drive connect

Mount Google Drive, then load the datasets and run a basic analysis.


In [None]:
from google.colab import drive

drive.mount('/content/drive')


In [None]:
%cd '/content/drive/MyDrive/datasets'


In [None]:
%ls

## Dataset #1

In [None]:

import pandas as pd
import numpy as np

d1_path = "/content/drive/MyDrive/datasets/advs7235-sup-0001-suppmat.csv"

d1 = pd.read_csv(d1_path)
d1.head()

In [None]:
d1.describe()

In [None]:
d1.isnull().sum()

In [None]:
pip install requests pandas tools


In [None]:
!pip install pymupdf
!pip install --upgrade pymupdf


In [None]:
pip install pdf2image pytesseract


In [None]:
import fitz
print(dir(fitz))

In [None]:
print(len(d1))
print(len(d1.columns))
print(d1.columns)

Fetch text for each DOI: try Unpaywall first for an open-access full-text PDF, and fall back to the Crossref API for the abstract.

In [None]:
import pandas as pd
import requests
from time import sleep
import fitz  # PyMuPDF
import re
from bs4 import BeautifulSoup

import pytesseract
from pdf2image import convert_from_path

# Regexes applied in list order by clean_text(); every match is replaced with
# a single space.
patterns = [
    r'https?://\S+',    # Remove URLs
    r'\bdoi\b[\w\./:]+', # Remove DOI patterns (case-sensitive: lower-case "doi" only)
    r'\b(Copyright|Ltd|Company|All Rights Reserved|Trademark)\b.*', # Remove copyright, Ltd, etc. together with the rest of that line
    r'[^\w\s]', # Remove every character that is neither a word character nor whitespace
    r"\s+", # Collapse whitespace runs (newlines included)
]


def crop_text(text):
    """Crop a paper's text to its body, dropping front and back matter.

    The body starts at the first whole-word "Introduction" and ends just
    before the LAST "References" / "Acknowledgements" / "Conflicts of
    Interest" that follows it. Using the last marker keeps an in-body mention
    such as "see References" from truncating the text early. Matching is
    case-sensitive.

    Args:
        text: Full document text.

    Returns:
        The cropped text, or ``text`` unchanged when "Introduction" is missing
        or no end marker appears after it.
    """
    start_match = re.search(r'\bIntroduction\b', text)
    if not start_match:
        return text

    end_pattern = re.compile(r'\b(?:References|Acknowledgements|Conflicts of Interest)\b')
    last_end = None
    # Only look after the start keyword so an earlier marker cannot produce an
    # empty (end < start) slice.
    for last_end in end_pattern.finditer(text, start_match.end()):
        pass
    if last_end is None:
        return text

    return text[start_match.start():last_end.start()]




def clean_text(text):
    """Clean raw PDF or abstract text into a single normalized string.

    Steps, in order:
      1. Strip HTML/XML tags (Crossref abstracts contain markup).
      2. Crop to the body with crop_text().
      3. Drop page-number lines and lines shorter than 5 characters.
      4. Replace every match of each regex in ``patterns`` with a space.

    Step 3 runs before step 4 because the whitespace pattern in ``patterns``
    collapses all newlines; filtering afterwards would only ever see one line.

    Args:
        text: Raw text, or None/empty.

    Returns:
        The cleaned text; "" for empty input.
    """
    if not text:
        return ""

    # Remove HTML tags (for Crossref abstract)
    text = BeautifulSoup(text, "html.parser").get_text()
    text = crop_text(text)

    # Remove page numbers, headers/footers (common artifacts) while the text
    # still has its line structure.
    kept_lines = []
    for line in text.splitlines():
        line = line.strip()
        if len(line) < 5 or re.match(r"^Page\s*\d+$", line):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # Apply every cleanup regex (URLs, DOIs, copyright lines, symbols, whitespace)
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)

    return text.strip()


def get_open_access_url(doi, timeout=30):
    """Look up an open-access location for a DOI via the Unpaywall API.

    Args:
        doi: DOI string, inserted into the request path as-is.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The PDF URL of the best open-access location, else its landing-page
        URL, else None (no open-access copy, HTTP error, or invalid JSON).
    """
    # NOTE(review): the contact e-mail is hard-coded; consider moving it to a
    # configuration cell or environment variable.
    url = f"https://api.unpaywall.org/v2/{doi}?email=sharon.nemekhbayar1009@gmail.com"
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort lookup: report the failure instead of hiding it.
        print(f"Unpaywall lookup failed for {doi}: {exc}")
        return None

    if not isinstance(data, dict):
        return None
    # "best_oa_location" is present but null for closed-access papers, so a
    # .get() default alone does not protect the chained lookup.
    best_location = data.get("best_oa_location") or {}
    return best_location.get("url_for_pdf") or best_location.get("url")

def get_crossref_abstract(doi, timeout=30):
    """Fetch the abstract registered for a DOI from the Crossref works API.

    Args:
        doi: DOI string, inserted into the request path as-is.
        timeout: Seconds to wait for the HTTP response before giving up.

    Returns:
        The abstract as returned by Crossref (it may contain markup), or None
        when the record has no abstract or the request fails.
    """
    url = f"https://api.crossref.org/works/{doi}"
    try:
        res = requests.get(url, timeout=timeout)
        res.raise_for_status()
        data = res.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort lookup: report the failure instead of hiding it.
        print(f"Crossref lookup failed for {doi}: {exc}")
        return None

    if not isinstance(data, dict):
        return None
    message = data.get("message") or {}
    return message.get("abstract") if isinstance(message, dict) else None


def extract_text_from_pdf_url(url, timeout=60):
    """Download a PDF and return its text, using OCR for scanned documents.

    The embedded text layer is read with PyMuPDF. When the PDF has pages but
    none of them yields text, the pages are rendered to images and run through
    Tesseract.

    Args:
        url: Location of the PDF.
        timeout: Seconds to wait for the download before giving up.

    Returns:
        The extracted text (possibly empty), or None when the download does
        not return HTTP 200 or any step raises.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            return None

        with fitz.open(stream=response.content, filetype="pdf") as doc:
            page_texts = [page.get_text() for page in doc]
        text = "".join(page_texts)

        if page_texts and not text.strip():
            # No text layer on any page: treat as a scanned PDF and OCR it.
            # The PDF only exists in memory, so render it from the downloaded
            # bytes; convert_from_path expects a local file path, not a URL.
            from pdf2image import convert_from_bytes

            images = convert_from_bytes(response.content)
            text = "".join(pytesseract.image_to_string(image) for image in images)
        return text
    except Exception as e:
        # Deliberately broad: one bad PDF must not stop the whole crawl.
        print("PDF extraction failed:", e)
    return None



df = d1.copy()

# Resume support: reuse documents collected by an earlier run of this cell,
# otherwise start from an empty list (so the cell also works on a fresh kernel).
docs = globals().get("docs", [])
# DOIs are compared case-insensitively, so the keys are stored lower-cased.
processed_dois = {
    str(entry["metadata"].get("DOI", "")).strip().lower() for entry in docs
}

full = 0
abstract_count = 0  # not named `abs`, which would shadow the builtin

for i, row in df.iterrows():
    doi = str(row["DOI"]).strip()
    doi_key = doi.lower()
    if not doi or doi_key == "nan" or doi_key in processed_dois:
        continue

    # Unpaywall for full text
    text = None
    full_text_url = get_open_access_url(doi)
    if full_text_url:
        text = extract_text_from_pdf_url(full_text_url)

    if text:
        full += 1
        print(f"[{i}] Full text extracted and cleaned.")
    else:
        # Fall back to the Crossref abstract
        text = get_crossref_abstract(doi)
        if text:
            abstract_count += 1
            print(f"[{i}] Abstract fetched and cleaned.")
        else:
            print(f"[{i}] Nothing found for DOI: {doi}")

    if text:
        docs.append({"text": clean_text(text), "metadata": row.to_dict()})
        # Only successful DOIs are marked, so failures are retried on re-run.
        processed_dois.add(doi_key)

    # Rate-limit every DOI that hit the network, including full-text successes.
    sleep(1)

print(f"✅ Full texts added: {full}")
print(f"📄 Abstracts added: {abstract_count}")
print(f"📚 Total new documents added: {full + abstract_count}")


In [None]:
len(docs)


In [None]:
processed_dois = {entry["metadata"]["DOI"].strip().lower() for entry in docs if "DOI" in entry["metadata"]}
processed_dois

## Clean the fetched texts

In [None]:
import json

# Read the saved documents; the context manager closes the file handle, and
# the explicit encoding matches the UTF-8 used when the file is written.
with open("documents.json", encoding="utf-8") as f:
    documents = json.load(f)



In [None]:
import re
from bs4 import BeautifulSoup
import unicodedata

def normalize_unicode_ligatures(text):
    """Return ``text`` in Unicode NFKD form.

    Compatibility decomposition splits ligatures into their letters (e.g.
    "ﬁ" becomes "fi"); it also separates accented letters into a base
    character plus combining mark.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    return decomposed

# Regexes applied in list order by clean_text(); every match is replaced with
# a single space. They are used without regex flags, so matching is
# case-sensitive and `^` anchors only at the very start of the text.
# Order matters: the whitespace pattern in the middle of the list removes all
# newlines, so the entries after it see single-line text.
patterns = [
    r'\bdoi\b[\w\./:]+',  # DOI patterns
    r'\(\s*(et al\.,?|[A-Z][a-z]+[^)]*\d{4})\s*\)',  # Citations like (Smith et al., 2021)
    # Caption lead-in such as "Figure 2: shows" at the start of the text
    r'^\s*Figure\s*\d+[a-zA-Z]?[.:]?\s*(shows|illustrates|depicts|demonstrates|explains|describes|represents)\s*[:]*\s*',
    r'\[\s*\d{1,3}(?:[\s,–-]*\d{1,3})*\s*\]',  # citation references like [24–29,46,53,54]
    r'\[.*?\.(avi|mp4|pdf|docx|zip|pptx|xls|xlsx|txt)\]',  # file references
    r'\b([A-Za-z\s]+\.?\s?\d{4},\s?\d+,\s?[\d\-–,]+(?:\.\s?\d+)?)\b',  # journal references
    r'https?://\S+',  # Remove full URLs
    r'\b(?:www|https?)\.[\w\.-]+\.\w+\b',  # domain names
    r':\/\/\s*:\/{2,}',  # malformed URL
    r'/journal/\s*\d+\s*of\s*\d+',  # journal path patterns
    r'\s+',  # Collapse whitespace
    # NOTE(review): the next pattern has no word boundary and every part after
    # "Fig" is optional, so it also matches "Fig" inside longer words — confirm
    # this is intended.
    r'\(?(Fig(?:ure)?\s*\d*[a-zA-Z]?)\)?',        # (Fig 3.a), (Figure a), Figure b
    r'\(?(appendix\s*\d+(?:\.\d+)?)\)?',          # (appendix 1.2), appendix 2.1
    r'Figure\s+[a-zA-Z]',                         # Figure a, Figure b
    r'\(Fig\s+[a-zA-Z]\)',                        # (Fig a), (Fig B)
]



# Back-matter section names. clean_text() deletes everything from the first
# case-sensitive occurrence of any of these keywords to the end of the text.
# NOTE(review): common words such as "Funding", "Ethics", "Consent" or
# "Reference" can also appear in body text and would truncate it there.
keywords = [
    "Acknowledgement", "Acknowledgements", "ACKNOWLEDGMENTS", "Acknowledgment", "Acknowledgments", "ACKNOWLEDGEMENTS",
    "Conflict of Interest", "Conflicts of Interest",
    "Supporting Information", "Funding", "Author Contributions",
    "Ethics", "Abbreviations", "Data Availability",
    "ORCID", "Consent", "Supplementary Material", "References", "Reference"
]

# Crop full text to useful section only
def crop_text(text):
    """Crop a paper's text to its body, dropping front and back matter.

    The body starts at the first whole-word "Introduction" (or at the start
    of the text when absent) and ends just before the LAST back-matter marker
    that follows it (or at the end of the text when none is found). Matching
    is case-insensitive.

    Searching the reversed string with a forward pattern never matches and
    would give an index into the wrong string, so the last match is taken
    from a forward scan instead.

    Args:
        text: Full document text.

    Returns:
        The cropped slice of ``text``.
    """
    start = re.search(r'\bIntroduction\b', text, re.IGNORECASE)
    start_idx = start.start() if start else 0

    end_pattern = re.compile(
        r'\b(References|Acknowledgements|Conflicts of Interest|Supporting Information|supplementary)\b',
        re.IGNORECASE,
    )
    end_idx = len(text)
    # Keep overwriting so end_idx ends up at the last marker after the start.
    for marker in end_pattern.finditer(text, start_idx):
        end_idx = marker.start()

    return text[start_idx:end_idx]

# Main cleaner
def clean_text(text):
    """Clean raw full text or an abstract into one normalized string.

    Steps, in order:
      1. Strip HTML/XML markup (e.g. in Crossref abstracts).
      2. Crop to the core content with crop_text().
      3. Re-join words hyphenated across a line break ("den-\\nsity").
      4. Drop page-number lines and lines shorter than 5 characters.
      5. Replace every match of each regex in ``patterns`` with a space.
      6. Cut the text at the first occurrence of each entry in ``keywords``.
      7. Remove remaining "hyphen + whitespace" artifacts.
      8. Apply Unicode normalization and collapse whitespace.

    Steps 3 and 4 need the original line breaks, so they run before step 5,
    whose whitespace pattern removes every newline. The Unicode normalization
    is applied to the text that is actually returned.

    Args:
        text: Raw text, or None/empty.

    Returns:
        The cleaned text; "" for empty input.
    """
    if not text:
        return ""

    # Remove HTML if present (e.g., in Crossref abstracts)
    text = BeautifulSoup(text, "html.parser").get_text()

    # Crop to core content
    text = crop_text(text)

    # Fix hyphenated line breaks (e.g., "den-\nsity" → "density")
    text = re.sub(r'(\w+)-\s*\n\s*(\w+)', r'\1\2', text)

    # Line-by-line cleanup while the line structure still exists
    kept_lines = []
    for line in text.splitlines():
        line = line.strip()
        if len(line) < 5:
            continue
        if re.match(r'^Page\s*\d+(\sof\s*\d+)?$', line, re.IGNORECASE):
            continue
        kept_lines.append(line)
    text = "\n".join(kept_lines)

    # Apply all regex patterns
    for pattern in patterns:
        text = re.sub(pattern, ' ', text)

    # Drop back matter: everything from a section keyword to the end
    for k in keywords:
        p = rf"{k}[:\[\]\-]?\s*[\s\S]*?$"
        text = re.sub(p, "", text)

    # Remove hyphenation artifacts (e.g., "bio- chemical" => "biochemical")
    text = re.sub(r'-\s+', '', text)

    text = normalize_unicode_ligatures(text)

    # Final normalization
    return re.sub(r'\s+', ' ', text).strip()


In [None]:
# Replace each document's text with its cleaned version, in place.
# NOTE(review): `nine` is not defined in the visible part of this notebook —
# confirm which cell creates it so this survives Restart & Run All.
for n in nine:
      n['text'] = clean_text(n['text'])


In [None]:
import json

with open("documents.json", "w", encoding="utf-8") as f:
    json.dump(nine, f, ensure_ascii=False, indent=2)


In [None]:
# Metadata fields kept for each document
essential_keys = [
    "DOI", "Article Title", "Authors",
    "Source Title", "Publication Year", "Volume", "Start Page",
    "End Page", "Document Type", "Author Keywords"
]

cleaned_docs_eight = []

for doc in docs:
    cleaned_text = clean_text(doc["text"])
    # Keep only documents with at least 50 characters of cleaned text
    if len(cleaned_text) >= 50:
        source_metadata = doc["metadata"]
        cleaned_metadata = {}
        for key in essential_keys:
            # Copy a field only when it is present and not missing (NaN/None)
            if key in source_metadata and pd.notna(source_metadata[key]):
                cleaned_metadata[key] = source_metadata.get(key)

        cleaned_docs_eight.append({"text": cleaned_text, "metadata": cleaned_metadata})


`cleaned_docs_five` was judged the best of all cleaning variants, so it was saved as `docs.json`.

In [None]:
documents[3]

In [None]:
'Conclusion' in documents[3]['text']

In [None]:
import json

# ensure_ascii=False writes raw non-ASCII characters, so the file encoding is
# fixed to UTF-8 instead of relying on the platform default.
# NOTE(review): this saves `documents`, not `cleaned_docs_eight` — confirm
# that is the intended variable for this file name.
with open("docs_cleaned_eight.json", "w", encoding="utf-8") as f:
    json.dump(documents, f, ensure_ascii=False, indent=2)

In [None]:
len(documents)

In [None]:
# Regexes for residue that cleaning should have removed (label -> pattern).
checks = {
    "dangling_references": r'\[\s*\d+([–,-]\s*\d+)*(,\s*\d+)*\s*\]',
    "broken_words": r'\b\w{1,2}\s+\w{1,2}\b',
    "file_links": r'\.(pdf|avi|mp4|zip|docx|pptx|xlsx|txt)',
    "journals_left": r'/journal/|https?://',
    "supporting_info": r'Supporting Information',
    # Optional "e" covers both "Acknowledgments" and "Acknowledgements"
    "acknowledgments": r'Acknowledge?ments?:',
    "conflicts_of_interest": r'Conflicts of Interest',
    "weird_punctuation": r'[^\w\s\.,:;()\-–—]',
    "page_numbers": r'\b\d+\s+of\s+\d+\b',
}
for doc_index, doc in enumerate(documents):
    text = doc["text"]
    for label, pattern in checks.items():
        # group(0) is the whole match; re.findall would return only the
        # contents of the capture groups for patterns that have them.
        matches = [m.group(0) for m in re.finditer(pattern, text, re.IGNORECASE)]
        if matches:
            print(f"[doc {doc_index}] {label} found: {matches[:3]}")


## Analyzing evaluation

#### biorag_eval

In [None]:
import pandas as pd

rag_df = pd.read_csv('rag_eval/rag_final.csv')
rag_df.head(3)

In [None]:
is_correct = rag_df['ground_truth'] == rag_df['predicted']
is_correct.sum()

In [None]:
rag_df['is_correct'] = is_correct
rag_df.groupby('category')['is_correct'].value_counts()

#### biollm_eval

In [None]:
import pandas as pd

bio = pd.read_csv('rag_eval/bio.csv')
is_correct_bio = bio['ground_truth'] == bio['answer']
is_correct_bio.sum()

In [None]:
bio['is_correct'] = is_correct_bio
bio.groupby('category')['is_correct'].value_counts()

#### basemodel_eval

In [None]:
base = pd.read_csv('rag_eval/base.csv')
is_correct_base = base['ground_truth'] == base['answer']


base['is_correct'] = is_correct_base
base.groupby('category')['is_correct'].value_counts()

In [None]:
is_correct_base.sum()

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt


rag_df['model'] = "BioLLM+RAG"
bio['model'] = "BioLLM"
base['model'] = "Base model"


In [None]:
all_df = pd.concat([rag_df, bio, base], ignore_index=True)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Calculate accuracy per category and model
acc_df = (
    all_df.groupby(['model', 'category'])['is_correct']
    .mean()
    .reset_index()
    .rename(columns={"is_correct": "accuracy"})
)

# Fixed, alphabetical category order on the x axis
category_order = sorted(acc_df['category'].unique())

# Define custom color palette
palette = {
    "Base model": "#4c72b0",
    "BioLLM": "#55a868",
    "BioLLM+RAG": "#c44e52"
}

# Create the plot. acc_df holds one pre-aggregated value per bar, so there
# are no error bars to style (the deprecated `errwidth` argument is dropped).
fig, ax = plt.subplots(figsize=(12, 7))
barplot = sns.barplot(
    data=acc_df,
    x="category",
    y="accuracy",
    hue="model",
    order=category_order,
    palette=palette,
    edgecolor="black",
    dodge=True,
    width=0.7,
    ax=ax,
)

# Label each bar with its accuracy. Going through the bar containers labels
# only real bars, not any other rectangle attached to the axes.
for container in ax.containers:
    ax.bar_label(container, fmt="%.2f", padding=5, fontsize=10, color="black")

ax.set_title("Model Accuracy by Category", fontsize=16, fontweight='bold')
ax.set_xlabel("Question Category", fontsize=13)
ax.set_ylabel("Accuracy", fontsize=13)
ax.tick_params(axis='both', labelsize=11)
ax.set_ylim(0, 1.05)
ax.grid(True, axis='y', linestyle='--', alpha=0.5)
ax.legend(title="Model", title_fontsize=12, fontsize=11)
fig.tight_layout()
plt.show()


## Dataset #2

In [None]:
d2_path = "/content/drive/MyDrive/datasets/advs7235-sup-0002-suppmat.csv"

d2 = pd.read_csv(d2_path, index_col=False)
d2.head()

In [None]:
d2.tail(150)

In [None]:
d2['question'][0]

In [None]:
d2['answer'][0]

In [None]:
print(len(d2))
print(len(d2.columns))
print(d2.columns)

In [None]:
d2.columns


We don't need columns ['Unnamed: 0.1', 'Unnamed: 0'], so drop them.

In [None]:
d2 = d2.drop(columns=['Unnamed: 0.1', 'Unnamed: 0'])

In [None]:
d2.isnull().sum()

Let's drop the rows with null

In [None]:
# Drop rows without a question or an answer. The explicit copy gives later
# cells an independent frame to add columns to (avoids SettingWithCopyWarning).
d2_cleaned = d2.dropna(subset=['question', 'answer']).copy()

# Verify the rows were dropped
print(d2_cleaned.isnull().sum())


print(len(d2_cleaned))
print(len(d2_cleaned.columns))
print(d2_cleaned.columns)

In [None]:
# Tokenize questions into word tuples (split on whitespace). Tuples are used
# because DataFrame.duplicated hashes the values, and a column of lists
# raises "TypeError: unhashable type: 'list'".
d2_cleaned['tokenized_question'] = d2_cleaned['question'].apply(lambda x: tuple(x.split()))

# Find questions that share the same tokenized form
duplicate_questions = d2_cleaned[d2_cleaned.duplicated(subset=['tokenized_question'], keep=False)]

# Display the duplicate questions
print(duplicate_questions[['question', 'tokenized_question']])


In [None]:
len(duplicate_questions)

In [None]:
# Tokenize the questions into word arrays and convert to tuples for grouping
d2_cleaned['tokenized_question'] = d2_cleaned['question'].apply(lambda x: tuple(x.split()))

# Group by the tokenized_question column (now a tuple)
grouped = d2_cleaned.groupby('tokenized_question')
k=0
# Iterate over groups with more than one entry and print the duplicate pairs
for tokens, group in grouped:
    if len(group) > 1:
        k += len(group)
        print("Matching tokenized question:", tokens)
        print(group[['question']])
        print("-----"*5)
print(k)

In [None]:
d2_filtered = d2_cleaned.drop_duplicates(subset=['tokenized_question']).reset_index(drop=True)

duplicates = d2_filtered[d2_filtered.duplicated(subset=['tokenized_question'], keep=False)]
len(duplicates)

In [None]:
print(d2_filtered.shape)
d2_filtered.head()

In [None]:
d2_filtered.dropna(subset=['question', 'answer'], inplace=True)
d2_filtered.shape

In [None]:
qa_df = d2_filtered.copy()

qa_df['question_length'] = qa_df['question'].astype(str).apply(lambda x: len(x.split()))
qa_df['answer_length'] = qa_df['answer'].apply(lambda x: len(x.split()))

qa_df.head()

In [None]:
avg_question_length = np.mean(qa_df['question_length'])
avg_answer_length = np.mean(qa_df['answer_length'])

print(f"Average Question Length: {avg_question_length:.2f} words")
print(f"Average Answer Length: {avg_answer_length:.2f} words")

In [None]:
qa_df['question'] = qa_df['question'].str.lower().str.strip()
qa_df['answer'] = qa_df['answer'].str.lower().str.strip()

qa_df.head()

In [None]:
qa_df = qa_df.drop(columns=['tokenized_question', 'question_length', 'answer_length'])
qa_df.head()

In [None]:
import unicodedata
import re

def normalize_text(text):
    """Normalize a string: Unicode NFKC, lower-case, trimmed, single-spaced.

    Non-string values (numbers, NaN, None) are returned unchanged so the
    function can be applied cell-by-cell to a whole DataFrame.

    Args:
        text: Value to normalize.

    Returns:
        The normalized string, or ``text`` itself when it is not a string.
    """
    if not isinstance(text, str):
        return text
    text = unicodedata.normalize("NFKC", text)  # Normalize Unicode
    text = text.lower().strip()  # Lower-case and trim the ends
    return re.sub(r"\s+", " ", text)  # Collapse whitespace runs to one space


In [None]:
print(normalize_text(qa_df['question'][0]))
print(normalize_text(qa_df['answer'][0]))

qa_df = qa_df.applymap(normalize_text)
qa_df.head()

### Save as JSONL for LLM Fine-Tuning

In [None]:
import json

# Save as JSONL (each line is a JSON object)
# Write one JSON object per line (JSONL)
with open("qa_dataset.jsonl", "w") as f:
    for _, row in qa_df.iterrows():
        record = {"question": row["question"], "answer": row["answer"]}
        f.write(json.dumps(record))
        f.write("\n")  # newline separates the JSON objects


### Compute TF-IDF scores

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# Fit a single TF-IDF model on the de-duplicated questions (the vectorizer
# was previously constructed twice; one instance is enough).
corpus = d2_filtered['question'].tolist()

vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix = vectorizer.fit_transform(corpus)


# Vocabulary terms, in the column order of the TF-IDF matrix
feature_names = vectorizer.get_feature_names_out()
# TF-IDF weights of the first question, one row per vocabulary term
first_question_tfidf = pd.DataFrame(tfidf_matrix[0].T.todense(),
                                    index=feature_names,
                                    columns=["TF-IDF"])
print(first_question_tfidf.sort_values("TF-IDF", ascending=False))


In [None]:
import json
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet

# Download the NLTK resources used below. Both the older and the newer
# resource names are requested: the NLTK releases that ship
# 'averaged_perceptron_tagger_eng' load the tokenizer from 'punkt_tab',
# while older releases use 'punkt' / 'averaged_perceptron_tagger'.
import nltk
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
):
    nltk.download(resource)

# POS tag mapping function
def get_wordnet_pos(tag):
    """Map a POS tag from ``nltk.pos_tag`` to a WordNet POS constant.

    Tags starting with J, V, N or R map to adjective, verb, noun and adverb;
    every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN  # default to noun

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

word_counter = Counter()

with open("qa_dataset.jsonl", "r") as f:
    for line in f:
        data = json.loads(line)
        question = data["question"].lower()
        tokens = word_tokenize(question)
        tagged = pos_tag(tokens)

        lemmatized = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in tagged if word.isalpha() and word not in stop_words
        ]

        word_counter.update(lemmatized)

print(word_counter.most_common(20))


In [None]:
import pandas as pd
qa_word_counter_df = pd.DataFrame(word_counter.most_common(20), columns=['Word', 'Frequency'])
qa_word_counter_df

In [None]:
from wordcloud import WordCloud

# Create a dictionary with words as keys and their TF-IDF scores as values
words_tfidf = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).A1))

# Create the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(words_tfidf)

# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


In [None]:
# # Convert the TF-IDF matrix to a dense format and create a DataFrame
# tfidf_df = pd.DataFrame(tfidf_matrix.todense(), columns=feature_names)

# # Now, you can get a DataFrame of every token's TF-IDF score
# # For example, to see the tokens and their corresponding TF-IDF values for the entire corpus:
# tokens_tfidf = tfidf_df.stack().reset_index(name='TF-IDF')
# tokens_tfidf.columns = ['Document', 'Token', 'TF-IDF']
# tokens_tfidf.to_csv('tokens_tfidf.csv', index=False)

In [None]:

from os import path
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

## Dataset #3

In [None]:
import pandas as pd
import numpy as np

d3_path = "/content/drive/MyDrive/datasets/advs7235-sup-0003-suppmat.csv"

d3 = pd.read_csv(d3_path)
d3.head()

In [None]:
print(d3.shape)
print(d3.columns)

In [None]:
d3.describe()

In [None]:
d3 = d3.drop(columns=['Citation'])
d3.head(2)

In [None]:
d3_duplicates = d3[d3.duplicated(subset=['Question'], keep=False)]
d3_duplicates.count()

In [None]:
d3.shape

In [None]:
mc_df = d3.copy()

mc_df['Question_length'] = mc_df['Question'].astype(str).apply(lambda x: len(x.split()))

mc_df.head()

In [None]:
question_avg_length = np.mean(mc_df['Question_length'])

print(f"Average Question Length: {question_avg_length:.2f} words")

In [None]:
import matplotlib.pyplot as plt


category_counts = mc_df['Category'].value_counts().astype(int)

# Get the labels corresponding to the counts
labels = category_counts.index.tolist()  # Use the index for labels

# Plot the pie chart
plt.pie(category_counts.values, labels=labels)  # Pass values and labels separately
plt.show()

In [None]:
mc_df.dropna(subset=['Question', 'Answer'], inplace=True)
mc_df = mc_df.drop(columns='Question_length')
mc_df.head()

### Normalize text

In [None]:
# Preview the normalization on the first remaining row. Positional access is
# used because dropna() above keeps the original index, so label 0 may be gone.
print(normalize_text(mc_df['Question'].iloc[0]))
print(normalize_text(mc_df['Answer'].iloc[0]))

# Normalize every cell of the frame
mc_df = mc_df.applymap(normalize_text)
mc_df.head()

In [None]:
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


vectorizer = TfidfVectorizer(stop_words='english')
corpus = mc_df['Question'].tolist()
tfidf_matrix = vectorizer.fit_transform(corpus)


feature_names = vectorizer.get_feature_names_out()
first_question_tfidf = pd.DataFrame(tfidf_matrix[0].T.todense(),
                                    index=feature_names,
                                    columns=["TF-IDF"])
print(first_question_tfidf.sort_values("TF-IDF", ascending=False))




In [None]:


# Create a dictionary with words as keys and their TF-IDF scores as values
words_tfidf = dict(zip(vectorizer.get_feature_names_out(), tfidf_matrix.sum(axis=0).A1))

# Create the word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(words_tfidf)

# Display the word cloud
plt.figure(figsize=(10, 6))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
# 'Question_length' was already dropped in an earlier cell; errors='ignore'
# makes this a no-op instead of raising KeyError.
mc_df = mc_df.drop(columns='Question_length', errors='ignore')
mc_df.head()

In [None]:
import json

# Save as JSONL (each line is a JSON object)
with open("mc_dataset.jsonl", "w") as f:
    for _, row in mc_df.iterrows():
        json.dump({"question": row["Question"], "answer": row["Answer"], "category": row["Category"]}, f)
        f.write("\n")  # Add newline to separate each JSON object


In [None]:
import json
from collections import Counter
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet

# Download the NLTK resources used below. Both the older and the newer
# resource names are requested so this cell matches the earlier download cell:
# newer NLTK releases look up 'punkt_tab' and
# 'averaged_perceptron_tagger_eng', older ones 'punkt' and
# 'averaged_perceptron_tagger'.
import nltk
for resource in (
    'punkt',
    'punkt_tab',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
    'wordnet',
    'stopwords',
):
    nltk.download(resource)

# POS tag mapping function
def get_wordnet_pos(tag):
    """Translate a POS tag from ``nltk.pos_tag`` into a WordNet POS constant.

    Only the first character of the tag is used: J -> adjective, V -> verb,
    N -> noun, R -> adverb. Anything else defaults to noun.
    """
    first_letter_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return first_letter_to_pos.get(tag[:1], wordnet.NOUN)  # default to noun

lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stop_words.add('a')
stop_words.add('b')
stop_words.add('c')
stop_words.add('d')

mc_word_counter = Counter()

with open("mc_dataset.jsonl", "r") as f:
    for line in f:
        data = json.loads(line)
        question = data["question"].lower()
        tokens = word_tokenize(question)
        tagged = pos_tag(tokens)

        lemmatized = [
            lemmatizer.lemmatize(word, get_wordnet_pos(tag))
            for word, tag in tagged if word.isalpha() and word not in stop_words
        ]

        mc_word_counter.update(lemmatized)

print(mc_word_counter.most_common(20))


In [None]:
mc_word_counter_df = pd.DataFrame(mc_word_counter.most_common(20), columns=['Word', 'Frequency'])
mc_word_counter_df