In [None]:
# Read jsonl file
import json

train = []
with open("train.jsonl", "r", encoding='utf-8') as f:
    for line in f:
        train.append(json.loads(line))

print(train[0]['question'], train[0]['points'], train[0]['article'])
print(len(train))


In [None]:
# Investigate proportion of testcases where answer can be found in the question
total = 0
found = 0
for set in train:
    total += 1
    if set['article'].lower() in set['question'].lower():
        found += 1

print(found,total,found/total)

In [None]:
# Not as many as I thought, but still a significant amount

# Perhaps the points are a good indicator of more trivial questions, where the answer is in the question?
# Investigate the distribution of points

from statistics import mean
import seaborn as sns
import matplotlib.pyplot as plt

trivial = []
nontrivial = []

for set in train:
    if set['article'].lower() in set['question'].lower():
        trivial.append(set['points'])
    else:
        nontrivial.append(set['points'])

all = trivial + nontrivial

print(mean(trivial), mean(nontrivial))
print(mean(all))

sns.displot(trivial)
sns.displot(nontrivial)
plt.show()

# There is a strong correlation, perhaps we can bias the model towards 
# retrieving the answer from the question if the points are low.

In [None]:
# I suspect most of the answers are nouns. Let's use NLP to check this.
!pip install spacy
!python -m spacy download en_core_web_sm


In [None]:
import spacy
nlp = spacy.load("en_core_web_sm")

is_noun = 0
not_noun = 0
not_noun_examples = []

docs = list(nlp.pipe([set['article'] for set in train]))
for doc in docs:
    if len(list(doc.noun_chunks)) >= 1:
        is_noun += 1
    else:
        not_noun += 1
        not_noun_examples.append(doc.text)

print(is_noun, not_noun, is_noun/(is_noun+not_noun))
print(not_noun_examples[:100])

In [None]:
# More than 85% of the articles are nouns, so we should prioritise nouns in our search.
# Many of the articles not classified as nouns are in fact nouns, many of them being years.

In [None]:
# Next, let's process the wikipedia dataset using parquet
!pip install pyarrow

In [None]:
import pyarrow.parquet as pq

wikipedia = pq.read_table('train-00000-of-00001.parquet').to_pandas()
wikipedia = wikipedia[:10000]
wikipedia = wikipedia[['text', 'title']]
print(wikipedia.tail())

In [None]:
# The best way to tackle this problem should be to use a vector database. Let's set up milvus for this.
# Milvus is being run in a docker container in the milvus folder.

In [None]:
# Connect to milvus server
# Credit to this tutorial by Stephen Collins for information on setting up milvus and text embedding
# https://dev.to/stephenc222/how-to-use-milvus-to-store-and-query-vector-embeddings-5hhl
from pymilvus import connections

def connect_to_milvus():
    try:
        connections.connect("default", host="localhost", port="19530")
        print("Connected to Milvus.")
    except Exception as e:
        print(f"Failed to connect to Milvus: {e}")
        raise

connect_to_milvus()

In [None]:
# Set up schema and create a collection
from pymilvus import FieldSchema, CollectionSchema, DataType, Collection

def create_collection(name, fields, description):
    schema = CollectionSchema(fields, description)
    collection = Collection(name, schema, consistency_level="Strong")
    return collection

def drop_collection(name):
    collection = Collection(name)
    collection.drop()
    
# Define fields for our collection
fields = [
    FieldSchema(name="pk", dtype=DataType.VARCHAR, is_primary=True, auto_id=False, max_length=100),
    FieldSchema(name="embeddings", dtype=DataType.FLOAT_VECTOR, dim=768),
    FieldSchema(name="title", dtype=DataType.VARCHAR, max_length=500),
]

drop_collection("wikipedia_simple")
collection = create_collection("wikipedia_simple", fields, "Text embeddings of the simple wikipedia dataset")

In [None]:
from embedding_util import generate_embeddings
# Generate embeddings for each article
for i, doc in enumerate(wikipedia['text']):
    embedding = generate_embeddings(doc)
    # Write into file
    with open("embeddings.txt", "a", encoding='utf-8') as f:
        f.write(f"{embedding}\n")
    print(f"{i}/{len(wikipedia)}")

In [None]:
# Read embeddings
with open("embeddings.txt", "r", encoding='utf-8') as f:
    embeddings = f.readlines()

embeddings = [[float(value) for value in embedding[1:-2].split(", ")] for embedding in embeddings]
print(embeddings[0])

In [None]:
# Write into milvus
entities = [
    [str(i) for i in range(len(wikipedia))],
    embeddings,
    [str(title) for title in wikipedia['title']],
]

insert_result = collection.insert(entities)
print(insert_result)

In [None]:
# Create index for embeddings
def create_index(collection, field_name, index_type, metric_type, params):
    index = {"index_type": index_type, "metric_type": metric_type, "params": params}
    collection.create_index(field_name, index)

create_index(collection, "embeddings", "IVF_FLAT", "L2", {"nlist": 128})

In [None]:
def search_and_query(collection, search_vectors, search_field, search_params):
    collection.load()
    result = collection.search(search_vectors, search_field, search_params, limit=3, output_fields=["title"])
    return result[0][0].entity.get("title")

# Test search
query = "how do living organisms in a natural environment respond to changes in weather or climate?"
query_vector = generate_embeddings(query)
search_and_query(collection, [query_vector], "embeddings", {"metric_type": "L2", "params": {"nprobe": 10}})

# Correctly returns "Environment"!

In [None]:
# Test the performance of our model
score = 0
totalScore = 0

for set in train[:500]:
    query = set['question']
    query_vector = generate_embeddings(query)
    result = search_and_query(collection, [query_vector], "embeddings", {"metric_type": "L2", "params": {"nprobe": 10}})
    print(f"result: {result}, answer: {set['article']}")
    if result.lower() in set['article'].lower():
        score += set['points']
    totalScore += set['points']

print(f"Our model scored {score}/{totalScore} points on the training set.")

In [None]:
# Our model scored 15618/31274 points on the training set. 
# Let's see if we can improve this by weighing based on the points and whether the answer is in the question.

# Add a is_in_question field to the train set
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix


# Convert the list of dictionaries to a DataFrame
df = pd.DataFrame(train)

# Create the 'is_in_question' column
df['is_in_question'] = df.apply(lambda row: row['article'].lower() in row['question'].lower(), axis=1).astype(int)

X = df[['points']]
y = df['is_in_question']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

# Train the model
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = model.predict(X_test_scaled)
print(f"Logistic regression model accuracy: {accuracy_score(y_test,y_pred):.3f}")




In [94]:
# Modify the search function to take into account the points and whether the answer is in the question
def search_and_query(collection, search_vectors, search_field, search_params, points, question):
    collection.load()
    result = collection.search(search_vectors, search_field, search_params, limit=3, output_fields=["title"])
    in_question_prob = model.predict_proba([[points]])[0][1]

    processed_results = [[article.entity.get("title"), 1 - article.distance] for article in result[0]]

    for article in processed_results:
        if article[0].lower() in question.lower():
            article[1] *= (1-in_question_prob)
        else:
            article[1] *= in_question_prob
    
    processed_results.sort(key=lambda x: x[1], reverse=True)
    return processed_results[0][0]

In [95]:
# Test the performance of our new model
score = 0
totalScore = 0

for set in train[:500]:
    query = set['question']
    query_vector = generate_embeddings(query)
    result = search_and_query(collection, [query_vector], "embeddings", {"metric_type": "L2", "params": {"nprobe": 10}}, set['points'], set['question'])
    print(f"result: {result}, answer: {set['article']}")
    if result.lower() in set['article'].lower():
        score += set['points']
    totalScore += set['points']

print(f"Our model scored {score}/{totalScore} points on the training set.")

result: Environment, answer: Environment
result: Poet, answer: Marcha Real
result: Magnetic resonance imaging, answer: Magnetic resonance imaging
result: Berlin, answer: Boroughs of Berlin
result: Charles Dickens, answer: Heinrich Rudolf Hertz
result: Korean War, answer: Korean War
result: Oprah Winfrey, answer: Oprah Winfrey
result: Cambridge, answer: Cambridgeshire
result: Trailer, answer: Trailer
result: Murcia, answer: Tucumán Province
result: Indonesia, answer: Mount Merapi
result: Nuclear reactor, answer: Nuclear fission
result: Economy, answer: Economy of India
result: Tree, answer: Nothofagus
result: Spacecraft, answer: Alien
result: Jane Fonda, answer: Peter Fonda
result: James Monroe, answer: 1823
result: Idaho, answer: Chief Joseph
result: Somaliland, answer: Somaliland
result: Language isolate, answer: Korean language
result: Ludwig van Beethoven, answer: Ludwig van Beethoven
result: Turtle, answer: Turtle
result: Diesel engine, answer: Diesel engine
result: 1143, answer: 1

In [None]:
# Our model scored 14782/31274 points on the training set.
# Whoops, it's slightly worse than before. Let's stick with the base model.

In [None]:
# Go back to the old model

def search_and_query(collection, search_vectors, search_field, search_params):
    collection.load()
    result = collection.search(search_vectors, search_field, search_params, limit=3, output_fields=["title"])
    return result[0][0].entity.get("title")

# Read the test set

test = []
with open("test.jsonl", "r", encoding='utf-8') as f:
    for line in f:
        test.append(json.loads(line))

print(test[0]['question'], test[0]['points'])
print(len(test))


In [93]:
# Generate results for the test set
for set in test:
    query = set['question']
    query_vector = generate_embeddings(query)
    result = search_and_query(collection, [query_vector], "embeddings", {"metric_type": "L2", "params": {"nprobe": 10}})
    set['answer'] = result

    with open("submission.jsonl", "a", encoding='utf-8') as f:
        f.write(f"{set}\n")

# Your algorithm choice
I used a vector database to store the embeddings of each article with a label of the article title. I then generated embeddings of each question and did a vector similarity search to find the most similar articles.

# How you would extend this algorithm to 100k/1m/1b articles
This method should still be workable for 100k articles, but the database might be too large for 1m/1b articles. I investigated the proportion of nouns at the start because I thought it may be a good idea to do a breadth-first search by looking up the articles for all the nouns in the question, and then looking for the answer in these articles. If those articles don't contain a satisfactory answer, we could then extract the nouns from these articles and search until we find an answer. For the small size of data I was provided this wasn't necessary, this method could work to ensure that only a small number of articles are added to the vector database and we don't run out of space/time.

# Evaluating performance
Since I didn't have to do training, the entire train dataset acted as my train dataset. 
I don't have much experience working with docker and my Milvus container kept stopping on its own (possibly due to memory issues as I'm running this on my laptop). Because of this, I had to limit the sample size to the first 500 entries but I think this still gave a pretty good evaluation of model performance especially as I was comparing the version which weighted based on points.

# Any reference that you found interesting
I've worked with vector databases before with Pinecone but it was my first time setting it up and running it on my laptop with Milvus. It was a good learning process!

# Ideas that worked / did not work 
As discussed, open source language embedding models are advanced enough to make my more primitive methods relying on NLP to be obsolete at this scale.

I was surprised my trick of using the points to observe whether the answer is in the title caused the performance to drop so drastically, but this could be because the relationship between points and the answer being in the question wasn't strong in the first place. 