In [1]:
from datasets import load_dataset
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from gensim.models import Word2Vec
import numpy as np
import torch
import torchhd
from sklearn.metrics.pairwise import cosine_similarity
#from keras.preprocessing.text import Tokenizer # kills kernel



  from .autonotebook import tqdm as notebook_tqdm


## Load and Merge QA Datasets

In [2]:
# SciQ: keep only the question text of the train split, tagged with its domain.
dsci = load_dataset("allenai/sciq")
print(dsci)
dsci_qs = [
    {'question': row['question'], 'domain': 'Science'}
    for row in dsci['train']
]

DatasetDict({
    train: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 11679
    })
    validation: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
    test: Dataset({
        features: ['question', 'distractor3', 'distractor1', 'distractor2', 'correct_answer', 'support'],
        num_rows: 1000
    })
})


In [3]:
# MedMCQA: keep only the question text of the train split, tagged with its domain.
dmed = load_dataset("openlifescienceai/medmcqa")
print(dmed)
dmed_qs = [
    {'question': row['question'], 'domain': 'Medical'}
    for row in dmed['train']
]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 182822
    })
    test: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 6150
    })
    validation: Dataset({
        features: ['id', 'question', 'opa', 'opb', 'opc', 'opd', 'cop', 'choice_type', 'exp', 'subject_name', 'topic_name'],
        num_rows: 4183
    })
})


In [4]:
# CyberMetric: the question is taken from the 'input' field, cut at the first '?'.
# NOTE(review): the cut also removes the '?' itself, so these questions end
# without a question mark — confirm this does not give the classifier a
# trivial cue compared with the other domains.
dcyber = load_dataset("khangmacon/cybermetric-10000")
print(dcyber)
dcyber_qs = [
    {'question': row['input'].split('?')[0], 'domain': 'Cyber'}  # maybe add 'class': '3'
    for row in dcyber['train']
]

DatasetDict({
    train: Dataset({
        features: ['system', 'instruction', 'input', 'output', 'info'],
        num_rows: 9189
    })
    validation: Dataset({
        features: ['system', 'instruction', 'input', 'output', 'info'],
        num_rows: 1022
    })
})


In [5]:
# FinanceBench is loaded for inspection only: it has just 150 rows, which would
# mean a huge class imbalance and not enough data to learn from.
dfin = load_dataset("PatronusAI/financebench")
dfin

DatasetDict({
    train: Dataset({
        features: ['financebench_id', 'company', 'doc_name', 'question_type', 'question_reasoning', 'domain_question_num', 'question', 'answer', 'justification', 'dataset_subset_label', 'evidence', 'gics_sector', 'doc_type', 'doc_period', 'doc_link'],
        num_rows: 150
    })
})

In [6]:
# Pool the Science, Medical and Cyber question records into a single list.
d_all = [*dsci_qs, *dmed_qs, *dcyber_qs]
len(d_all)

203690

In [7]:
# One row per question, with columns 'question' and 'domain'.
q_dom = pd.DataFrame(data=d_all)
q_dom

Unnamed: 0,question,domain
0,What type of organism is commonly used in prep...,Science
1,What phenomenon makes global winds blow northe...,Science
2,Changes from a less-ordered state to a more-or...,Science
3,What is the least dangerous radioactive decay?,Science
4,Kilauea in hawaii is the world’s most continuo...,Science
...,...,...
203685,What is the main goal of the committee set up ...,Cyber
203686,"In the context of cybersecurity, what is the m...",Cyber
203687,What is the purpose of the change management p...,Cyber
203688,"Which of the following, when removed, can incr...",Cyber


In [8]:
# try to encode with word2vec and word2hypervec, calculate time difference
# https://github.com/goktug16/Word2HyperVec-From-Word-Embeddings-to-Hypervectors-for-Hyperdimensional-Computing
# classify with NN vs HDC, calculate time difference (also see SOTA works on this)

## Classify with Word2Vec and Logistic Regression
### Still need to check validity; not sure if it's completely right

In [9]:
# Whitespace-tokenise every question, then train Word2Vec on the whole corpus.
# NOTE(review): the embeddings are fitted on all questions, including the ones
# that are later held out by train_test_split — confirm this is acceptable.
tokenized_questions = [text.split() for text in q_dom['question']]
word2vec_model = Word2Vec(
    sentences=tokenized_questions,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)

def question_to_vector(question, model):
    """Average the word vectors of the whitespace-separated tokens in `question`.

    Tokens that are not present in `model.wv` are skipped. If no token is
    known (including the empty string), a zero vector of length
    `model.vector_size` is returned instead.
    """
    known_vectors = []
    for token in question.split():
        if token in model.wv:
            known_vectors.append(model.wv[token])

    if not known_vectors:
        return np.zeros(model.vector_size)
    return np.mean(known_vectors, axis=0)

# One averaged Word2Vec vector per question, stacked row-wise in q_dom order.
question_vectors = np.array(
    [question_to_vector(text, word2vec_model) for text in q_dom['question']]
)

In [10]:
# Hold out 20% of the averaged Word2Vec vectors for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    question_vectors,
    q_dom['domain'],
    test_size=0.2,
    random_state=1,
)

In [11]:
# logistic regression
# max_iter is raised from 200 to 1000: with 200 the recorded run ended with
# "STOP: TOTAL NO. of ITERATIONS REACHED LIMIT", i.e. the reported scores came
# from a solver that had not converged. If the warning still appears, scale
# the features (as the warning itself suggests) rather than raising it further.
classifier = LogisticRegression(max_iter=1000, random_state=1) #class_weight='balanced' gives worse results
classifier.fit(X_train, y_train)

# Evaluate on the held-out split.
y_pred = classifier.predict(X_test)
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

       Cyber       0.86      0.75      0.80      1868
     Medical       0.98      0.99      0.98     36530
     Science       0.81      0.77      0.79      2340

    accuracy                           0.96     40738
   macro avg       0.88      0.84      0.86     40738
weighted avg       0.96      0.96      0.96     40738



## Classify with HDC
### Messy/wrong implementation so far

In [None]:
# Raw question text and domain labels for the HDC experiment.
preprocessed_questions = q_dom['question']
domain_labels = q_dom['domain']

# Same 80/20 split parameters as the Word2Vec experiment, but on raw text.
# These four names are reassigned further down in this cell.
X_train, X_test, y_train, y_test = train_test_split(
    preprocessed_questions,
    domain_labels,
    test_size=0.2,
    random_state=1,
)

def positional_encoding(position, d_model):
    """
    Build sinusoidal positional encodings for positions 0..position-1.

    Returns an array of shape (position, d_model). The sine terms (taken from
    the even-indexed dimensions) come first, followed by the cosine terms
    (taken from the odd-indexed dimensions); they are concatenated, not
    interleaved.
    """
    positions = np.arange(position)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]

    # Each pair of dimensions (2k, 2k+1) shares one frequency.
    rates = 1 / np.power(10000, (2 * (dims // 2)) / np.float32(d_model))
    angles = positions * rates

    sin_part = np.sin(angles[:, 0::2])
    cos_part = np.cos(angles[:, 1::2])
    return np.concatenate([sin_part, cos_part], axis=-1)

# def encode_questions_with_position(questions, model):
#     """
#     Encode questions with Word2Vec embeddings and add positional encoding.
#     """
#     encoded_questions = []
#     for question in questions:
#         words = question.split()  # Simple tokenization by splitting on spaces
#         word_embeddings = np.array([
#             model.wv[word] if word in model.wv else np.zeros(model.vector_size) 
#             for word in words
#         ])
#         pos_encodings = positional_encoding(len(words), model.vector_size)
#         encoded_question = word_embeddings + pos_encodings
#         encoded_questions.append(encoded_question)
#     return encoded_questions

def normalize_embeddings(embeddings):
    """
    Min-max scale each embedding independently along its first axis.

    Every array in `embeddings` is shifted by its axis-0 minimum and divided
    by its axis-0 range; where the range is zero, 1 is used as the divisor so
    constant entries map to 0. Returns a list with one scaled array per input.
    """
    def _min_max(emb):
        lowest = np.min(emb, axis=0, keepdims=True)
        span = np.max(emb, axis=0, keepdims=True) - lowest
        span = np.where(span == 0, 1, span)  # avoid division by zero
        return (emb - lowest) / span

    return [_min_max(emb) for emb in embeddings]


# Short alias for the trained Word2Vec model. In this view it appears to be
# referenced only by the commented-out encode_questions_with_position code.
model = word2vec_model

# #load training and testing data
# with open('questions_train_data.pkl', 'rb') as f:
#     X_train, y_train = pickle.load(f)

# with open('questions_test_data.pkl', 'rb') as f:
#     X_test, y_test = pickle.load(f)

# Encode questions with positional information for training and testing sets

def vec_to_hv(vectors, dim=1000, seed=1, batch_size=8192):
    """Encode dense embedding vectors as bipolar hypervectors.

    The previous stub took no parameters (so calling it with the question
    vectors raised TypeError) and returned nothing. This is a baseline
    encoder: each row is multiplied by a fixed random Gaussian matrix and the
    sign of every projected component is kept.

    Parameters
    ----------
    vectors : array-like of shape (n_samples, n_features)
        Dense embeddings, one per row.
    dim : int, default 1000
        Hypervector dimensionality. The result uses n_samples * dim bytes.
    seed : int, default 1
        Seed for the projection matrix, so repeated calls give the same
        encoding.
    batch_size : int, default 8192
        Rows projected per step; bounds the size of the floating-point
        intermediate.

    Returns
    -------
    numpy.ndarray of shape (n_samples, dim), dtype int8
        Entries are +1 where the projection is >= 0 and -1 otherwise, so an
        all-zero input row maps to an all +1 hypervector.

    Raises
    ------
    ValueError
        If `vectors` is not two-dimensional.
    """
    vectors = np.asarray(vectors, dtype=np.float32)
    if vectors.ndim != 2:
        raise ValueError(
            f"vec_to_hv expects a 2-D array of shape (n_samples, n_features), got ndim={vectors.ndim}"
        )

    n_samples, n_features = vectors.shape
    rng = np.random.default_rng(seed)
    projection = rng.standard_normal((n_features, dim), dtype=np.float32)

    hypervectors = np.empty((n_samples, dim), dtype=np.int8)
    for start in range(0, n_samples, batch_size):
        stop = start + batch_size
        projected = vectors[start:stop] @ projection
        hypervectors[start:stop] = np.where(projected >= 0, 1, -1)
    return hypervectors

# Encode every averaged Word2Vec question vector as a hypervector, then split.
# NOTE: vec_to_hv must return one hypervector per row of question_vectors for
# the split below to work.
q_hvs = vec_to_hv(question_vectors)
X_train, X_test, y_train, y_test = train_test_split(q_hvs, q_dom['domain'], test_size=0.2, random_state=1)

# The split already holds the encoded hypervectors, so they are used directly.
# These were previously `...` placeholders, which made normalize_embeddings
# fail because an Ellipsis cannot be iterated.
# (was 'encode_questions_with_position(X_train, model)' but got errors)
X_train_encoded = X_train
X_test_encoded = X_test

#normalize embeddings
X_train_norm = normalize_embeddings(X_train_encoded)
X_test_norm = normalize_embeddings(X_test_encoded)

# (Optional) Convert normalized embeddings into binary hypervectors
def binarize_embeddings(embeddings):
    """
    Threshold each normalized embedding at 0.5.

    Values strictly greater than 0.5 become 1, everything else becomes 0.
    The result is a list with one integer array per input embedding.
    """
    binary_embeddings = []
    for emb in embeddings:
        binary_embeddings.append(np.where(emb > 0.5, 1, 0))
    return binary_embeddings

# X_train_binary = binarize_embeddings(X_train_norm)
# X_test_binary = binarize_embeddings(X_test_norm)

In [None]:
def generate_class_hypervectors(X_train_norm, y_train, num_classes):
    """
    Generate class hypervectors by aggregating training data hypervectors for each class.

    Parameters
    ----------
    X_train_norm : sequence of numpy arrays
        Training hypervectors, all with the shape of the first one. Must be
        non-empty, because that first element is used to size the accumulators.
    y_train : sequence of int
        Class index (0 .. num_classes - 1) for each training hypervector.
    num_classes : int
        Number of class hypervectors to build.

    Returns
    -------
    numpy.ndarray
        One L2-normalized hypervector per class, stacked along axis 0. A class
        whose accumulated vector has zero norm (for example a class with no
        training samples) is returned as all zeros instead of NaN.
    """
    class_hypervectors = [np.zeros(X_train_norm[0].shape, dtype=float) for _ in range(num_classes)]

    # Bundle: sum all training hypervectors that belong to the same class.
    for hv, label in zip(X_train_norm, y_train):
        class_hypervectors[label] += hv

    # Scale to unit length (thresholding to binary was considered as an
    # alternative). Dividing by a zero norm would fill the class vector with
    # NaN and poison every later similarity, so zero vectors are left as-is.
    normalized = []
    for vector in class_hypervectors:
        norm = np.linalg.norm(vector)
        normalized.append(vector / norm if norm > 0 else vector)

    class_hypervectors = np.array(normalized)
    print(class_hypervectors)
    return class_hypervectors

def classify_hypervectors(X_test_binary, class_hypervectors):
    """
    Classify test hypervectors by comparing with class hypervectors.

    Each test hypervector is assigned the index of the class hypervector with
    the highest cosine similarity. The previous version took the argmax of
    `1 - cosine_similarity`, which is a distance, so it picked the least
    similar class.

    Parameters
    ----------
    X_test_binary : sequence of numpy arrays
        Test hypervectors; each one is flattened before comparison.
    class_hypervectors : sequence of numpy arrays
        One hypervector per class; each one is flattened before comparison.

    Returns
    -------
    list
        Predicted class index for every test hypervector, in input order
        (empty list for empty input). Ties go to the lowest class index.
    """
    if len(X_test_binary) == 0:
        return []

    queries = np.stack([np.ravel(hv) for hv in X_test_binary]).astype(float)
    prototypes = np.stack([np.ravel(hv) for hv in class_hypervectors]).astype(float)

    def _unit_rows(matrix):
        # Zero rows keep a divisor of 1 so they yield similarity 0, not NaN.
        norms = np.linalg.norm(matrix, axis=1, keepdims=True)
        return matrix / np.where(norms == 0, 1, norms)

    # All pairwise cosine similarities: shape (n_test, n_classes).
    similarities = _unit_rows(queries) @ _unit_rows(prototypes).T
    return list(np.argmax(similarities, axis=1))

# Map class labels to integers
unique_classes = sorted(set(y_train))  # Get sorted unique classes
class_to_index = {cls: idx for idx, cls in enumerate(unique_classes)}  # Map class -> index
index_to_class = {idx: cls for cls, idx in class_to_index.items()}  # Optional: reverse mapping

# Convert y_train and y_test to integer labels
y_train_int = [class_to_index[label] for label in y_train]
y_test_int = [class_to_index[label] for label in y_test]

# Number of classes (update this based on your dataset)
num_classes = len(set(y_train))

# Generate class hypervectors
class_hypervectors = generate_class_hypervectors(X_train_norm, y_train_int, num_classes)

# Classify test data
y_pred_int = classify_hypervectors(X_test_norm, class_hypervectors)

# Convert predictions back to original class labels
y_pred = [index_to_class[pred] for pred in y_pred_int]

# Evaluate performance (e.g., accuracy)
accuracy = np.mean(np.array(y_pred) == np.array(y_test))
print(f"Accuracy: {accuracy:.2%}")

In [None]:
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Generate confusion matrix.
# labels= pins the row/column order to unique_classes; without it the order is
# derived from the labels present in y_test/y_pred and may not line up with
# the tick labels below if a class is missing from both.
conf_matrix = confusion_matrix(y_test, y_pred, labels=unique_classes)

# Create a heatmap for better visualization
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=unique_classes,
    yticklabels=unique_classes,
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
ax.set_title('Confusion Matrix')
plt.show()