# Solution 1 (baseline)

In [3]:
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import warnings

# Blanket filter: suppress every warning for the rest of the session so the
# cell outputs stay clean. No category is given, so all warnings are ignored.
warnings.filterwarnings("ignore")

In [4]:
# Read the training data (JSON Lines: one record per line) and hold out 20%
# of the rows as a validation set; random_state pins the split across reruns.
# Note: val_df is not used by any of the cells visible in this notebook.
train_original_df = pd.read_json("../data/train.jsonl", lines=True)

train_df, val_df = train_test_split(train_original_df, test_size=0.2, random_state=42)

In [6]:
def dataframe_to_dict(df):
    """Build a surface-form -> label lookup table from annotated sentences.

    Args:
        df: DataFrame with a "sentences" column (the raw text of each row)
            and a "ners" column (an iterable of ``(start, end, label)``
            triples for that text).

    Returns:
        dict mapping each annotated substring ``text[start:end + 1]`` (the
        ``end`` offset is treated as inclusive) to a label. When the same
        substring is annotated more than once, the first label encountered
        is kept and later ones are ignored.
    """
    lookup = {}

    # Walk the two columns side by side; tqdm only reports progress.
    annotated_rows = zip(df["sentences"], df["ners"])
    for sentence, spans in tqdm(annotated_rows, total=len(df)):
        for span_start, span_end, tag in spans:
            # setdefault leaves an already-registered surface form untouched,
            # so the first label seen wins.
            lookup.setdefault(sentence[span_start:span_end + 1], tag)

    return lookup

In [7]:
# Build the lookup table from the training split only; predict_ner reads this
# module-level name when tagging text.
train_knowledge_base = dataframe_to_dict(train_df)

100%|██████████| 415/415 [00:00<00:00, 9974.08it/s]


In [8]:
# Read the dev and test sets (JSON Lines: one record per line); these are the
# frames that predictions are generated for further down.
dev_df = pd.read_json("../data/dev.jsonl", lines=True)
test_df = pd.read_json("../data/test.jsonl", lines=True)

In [9]:
def predict_ner(text, knowledge_base=None):
    """Tag whitespace-separated tokens of ``text`` by exact dictionary lookup.

    Args:
        text: The sentence to tag.
        knowledge_base: Optional mapping from surface form to label. When
            omitted, the module-level ``train_knowledge_base`` is used.

    Returns:
        A list of ``(start, end, label)`` tuples, one per token found in the
        knowledge base, in order of appearance. ``start`` is the offset of the
        token's first character in ``text`` and ``end`` is exclusive, so
        ``text[start:end]`` is the matched token.
    """
    if knowledge_base is None:
        knowledge_base = train_knowledge_base

    ners = []
    cursor = 0
    for word in text.split():
        # str.split() collapses runs of whitespace and drops leading
        # whitespace, so offsets cannot be rebuilt by assuming exactly one
        # separator per token. Locate the token in the original string
        # instead: everything between the cursor and the next token is
        # whitespace, so the first match from the cursor is that token.
        start_index = text.find(word, cursor)
        end_index = start_index + len(word)
        if word in knowledge_base:
            ners.append((start_index, end_index, knowledge_base[word]))
        cursor = end_index
    return ners

In [10]:
def predict_ner_df(df, output_file_path, text_column="senences"):
    """Tag every row of ``df`` and write the result as a JSON Lines file.

    Args:
        df: DataFrame holding one text per row in ``text_column``. The input
            frame is not modified; a copy is written out.
        output_file_path: Destination path for the JSON Lines output.
        text_column: Name of the column that holds the text to tag. The
            default keeps the key this notebook has always read; the
            misspelling is presumably the spelling used in the dev/test
            files (the recorded runs completed with it) -- verify against
            the data. Pass ``"sentences"`` for frames that use that name.

    Returns:
        None. The copy of ``df`` with a "ners" column is saved to
        ``output_file_path``.

    Raises:
        KeyError: If a row has no ``text_column`` entry.
    """
    df = df.copy()

    ners_col = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        # predict_ner reports exclusive end offsets; the saved spans use
        # inclusive ends, hence the -1.
        ners_col.append(
            [(start, end - 1, label) for start, end, label in predict_ner(row[text_column])]
        )

    # Attach the predictions (replaces any existing "ners" column on the copy).
    df["ners"] = ners_col

    # One JSON record per line.
    df.to_json(output_file_path, orient="records", lines=True)

In [11]:
# Tag the dev set and write the predictions to dev_predictions.jsonl
# (relative to the notebook's working directory).
predict_ner_df(dev_df, "dev_predictions.jsonl")

100%|██████████| 65/65 [00:00<00:00, 4334.13it/s]


In [12]:
# Tag the test set and write the predictions to test_predictions.jsonl
# (relative to the notebook's working directory).
predict_ner_df(test_df, "test_predictions.jsonl")

100%|██████████| 65/65 [00:00<00:00, 5416.09it/s]
