In [1]:
import pandas as pd
import json
from sklearn.model_selection import train_test_split
import re
from collections import defaultdict

# Load the raw article dataset (tab-separated values).
df = pd.read_csv("/kaggle/input/kpdlhlv-raw-training/articles_training.tsv", sep="\t")

# Exporting a CSV copy of the raw data is currently disabled; re-enable the
# next line to write raw_dataset.csv. The status message below reports what
# actually happened instead of claiming a file was saved.
# df.to_csv("raw_dataset.csv", index=False, encoding="utf-8")
print(f"Raw dataset loaded: {len(df)} rows (raw_dataset.csv export is disabled)")

# Tokenizer shared by article text and keyword phrases: split on runs of
# whitespace / non-word characters, keeping the delimiters (capturing group)
# so punctuation survives as tokens.
_TOKEN_SPLIT_RE = re.compile(r"(\s+|\W+)")


def _tokenize(text):
    """Split *text* into word and punctuation tokens, dropping blank pieces."""
    return [piece for piece in _TOKEN_SPLIT_RE.split(text) if piece.strip()]


# Function to tokenize and tag
def convert_to_ner_format(content, tags):
    """Tokenize *content* and label every occurrence of the given keywords.

    Args:
        content: Article text. A non-string value (e.g. a pandas NaN) is
            treated as empty text.
        tags: Comma-separated keyword phrases. A non-string value (e.g. NaN)
            or empty entries (``""``, trailing commas) are ignored.

    Returns:
        A ``(tokens, ner_tags)`` tuple of equal-length lists. The first token
        of each keyword occurrence is labelled ``"B-KEYWORD"``, the remaining
        tokens of the occurrence ``"I-KEYWORD"``, and everything else ``"O"``.
    """
    tokens = _tokenize(content) if isinstance(content, str) else []
    ner_tags = ["O"] * len(tokens)
    if not tokens or not isinstance(tags, str):
        return tokens, ner_tags

    # Punctuation tokens may carry adjacent whitespace (e.g. ", "), so match
    # on the stripped text while returning the tokens unchanged.
    comparable = [token.strip() for token in tokens]

    # Tokenize keywords exactly like the content; otherwise phrases that
    # contain punctuation ("covid-19", "R & D") could never match.
    entities = {
        tuple(piece.strip() for piece in _tokenize(entity))
        for entity in tags.split(",")
    }
    # An empty entity would match at every position and index past the end.
    entities.discard(())

    # Fixed processing order (shorter phrases first, so longer phrases
    # overwrite them) keeps overlapping keywords reproducible between runs.
    for entity_tokens in sorted(entities, key=lambda entity: (len(entity), entity)):
        span = len(entity_tokens)
        target = list(entity_tokens)
        for start in range(len(comparable) - span + 1):
            if comparable[start:start + span] == target:
                ner_tags[start] = "B-KEYWORD"
                ner_tags[start + 1:start + span] = ["I-KEYWORD"] * (span - 1)

    return tokens, ner_tags

# Tag every article; each sample id is the row's index label plus one.
ner_dataset = []
for idx, row in df.iterrows():
    tokens, ner_tags = convert_to_ner_format(row["content"], row["tags"])
    sample = {"id": idx + 1, "tokens": tokens, "ner_tags": ner_tags, "tags": row["tags"]}
    ner_dataset.append(sample)

# Tabular form: the token and label lists are stored as JSON strings so they
# survive the CSV round trip.
ner_df = pd.DataFrame(ner_dataset, columns=["id", "tokens", "ner_tags", "tags"])
for column in ("tokens", "ner_tags"):
    ner_df[column] = [json.dumps(value, ensure_ascii=False) for value in ner_df[column]]

# Hold out a fixed-seed random 10% of the articles as the test set; the
# remaining rows form the training set.
split_ratio = 0.1
test_df = ner_df.sample(frac=split_ratio, random_state=42)
train_df = ner_df.drop(index=test_df.index)

# Save the raw (unsplit) train and test datasets
for frame, path in ((train_df, "raw_train.csv"), (test_df, "raw_test.csv")):
    frame.to_csv(path, index=False, encoding="utf-8")
print("Raw train and test datasets saved as raw_train.csv and raw_test.csv")

# Sentence-based splitting function
def split_text_by_sentence_with_ids(tokens, ner_tags, original_id, max_length=256):
    """Cut a token/label sequence into segments of at most *max_length* tokens.

    While more than *max_length* tokens remain, the next segment ends at the
    last token containing ``"."`` inside the current window; if the window has
    no such token, it ends at the last token containing a non-word character.
    A remainder that already fits within *max_length* is emitted as-is.

    Args:
        tokens: List of token strings.
        ner_tags: Labels aligned with *tokens*; sliced at the same positions.
        original_id: Identifier of the source text, used as the id prefix.
        max_length: Maximum number of tokens per segment (must be >= 1).

    Returns:
        A list of dicts with keys ``"id"`` (``"<original_id>-<n>"``, with n
        counting from 1), ``"tokens"`` and ``"ner_tags"``.

    Raises:
        ValueError: If *max_length* is not positive, or if a full window that
            must be split contains no ``"."`` or non-word token.
    """
    if max_length < 1:
        # A non-positive window would never advance through the tokens.
        raise ValueError("max_length must be a positive integer.")

    non_word_regex = re.compile(r'\W')
    split_data = []
    segment_id = 1
    total = len(tokens)
    i = 0

    while i < total:
        end_idx = min(i + max_length, total)

        # Only look for a boundary when tokens remain beyond this window; a
        # remainder that fits (even one of exactly max_length tokens) needs
        # no split and must not raise.
        if end_idx < total:
            sentence_end = -1
            fallback = -1
            for idx, token in enumerate(tokens[i:end_idx]):
                if "." in token:
                    sentence_end = idx
                elif non_word_regex.search(token):
                    # Keep the last candidate so fallback segments are as
                    # long as the window allows.
                    fallback = idx

            split_idx = sentence_end if sentence_end != -1 else fallback
            if split_idx == -1:
                raise ValueError(f"No valid boundary found to split within max_length at position {i}. Please check the input.")
            end_idx = i + split_idx + 1

        split_data.append({
            "id": f"{original_id}-{segment_id}",
            "tokens": tokens[i:end_idx],
            "ner_tags": ner_tags[i:end_idx],
        })
        segment_id += 1
        i = end_idx

    return split_data

# Apply splitting
def apply_splitting_to_dataset(df):
    """Split every row of *df* into length-limited segments.

    Each row's ``"tokens"`` and ``"ner_tags"`` cells are JSON-encoded lists;
    they are decoded, passed to ``split_text_by_sentence_with_ids`` together
    with the row's ``"id"``, and the resulting segments are re-encoded.

    Returns:
        A DataFrame with one row per segment and the columns ``"id"``,
        ``"tokens"`` and ``"ner_tags"`` (the latter two as JSON strings).
    """
    segments = []
    for _, record in df.iterrows():
        decoded_tokens = json.loads(record["tokens"])
        decoded_tags = json.loads(record["ner_tags"])
        segments += split_text_by_sentence_with_ids(
            decoded_tokens, decoded_tags, original_id=record["id"]
        )

    # Build the three output columns in a single pass over the segments.
    columns = {"id": [], "tokens": [], "ner_tags": []}
    for segment in segments:
        columns["id"].append(segment["id"])
        columns["tokens"].append(json.dumps(segment["tokens"], ensure_ascii=False))
        columns["ner_tags"].append(json.dumps(segment["ner_tags"], ensure_ascii=False))
    return pd.DataFrame(columns)

# Split both partitions into length-limited segments, then write each result
# to its own CSV file.
split_train_df, split_test_df = (
    apply_splitting_to_dataset(partition) for partition in (train_df, test_df)
)

for segmented, destination in (
    (split_train_df, "split_train_dataset.csv"),
    (split_test_df, "split_test_dataset.csv"),
):
    segmented.to_csv(destination, index=False, encoding="utf-8")
print("Split train and test datasets saved as split_train_dataset.csv and split_test_dataset.csv")

# Integrate splits and include tags
def integrate_splits(split_df, original_df):
    """Merge segment rows back into one row per original text.

    Args:
        split_df: DataFrame with ``"id"`` (``"<original_id>-<segment>"``),
            ``"tokens"`` and ``"ner_tags"`` columns, the latter two holding
            JSON-encoded lists. It is read only and is not modified.
        original_df: DataFrame with ``"id"`` and ``"tags"`` columns; the
            original id parsed from each segment id is converted with
            ``int`` before being looked up here.

    Returns:
        A DataFrame with columns ``"id"`` (the original id as a string),
        ``"tokens"`` and ``"ner_tags"`` (JSON strings of the concatenated
        segments, in row order) and ``"tags"``. Rows appear in order of first
        appearance in *split_df*.

    Raises:
        IndexError: If a segment refers to an id missing from *original_df*.
        ValueError: If the original-id part of a segment id is not an integer.
    """
    # Build the id -> tags lookup once instead of scanning original_df for
    # every segment row. setdefault keeps the first row for a duplicated id.
    tags_by_id = {}
    for record_id, record_tags in zip(original_df["id"], original_df["tags"]):
        tags_by_id.setdefault(record_id, record_tags)

    integrated_data = {}
    for segment_id, tokens_json, ner_tags_json in zip(
        split_df["id"], split_df["tokens"], split_df["ner_tags"]
    ):
        # Only the last "-<segment>" suffix is removed from the segment id.
        original_id = segment_id.rsplit("-", 1)[0]
        entry = integrated_data.get(original_id)
        if entry is None:
            lookup_key = int(original_id)
            if lookup_key not in tags_by_id:
                # IndexError is the exception type the previous
                # `.values[0]` lookup raised for a missing id.
                raise IndexError(f"No row with id {lookup_key} found in original_df.")
            entry = {"tokens": [], "ner_tags": [], "tags": tags_by_id[lookup_key]}
            integrated_data[original_id] = entry
        entry["tokens"].extend(json.loads(tokens_json))
        entry["ner_tags"].extend(json.loads(ner_tags_json))

    records = [
        {
            "id": original_id,
            "tokens": json.dumps(data["tokens"], ensure_ascii=False),
            "ner_tags": json.dumps(data["ner_tags"], ensure_ascii=False),
            "tags": data["tags"],
        }
        for original_id, data in integrated_data.items()
    ]
    # Explicit columns keep the schema stable even when there are no rows.
    return pd.DataFrame(records, columns=["id", "tokens", "ner_tags", "tags"])

# integrated_train_df = integrate_splits(split_train_df, train_df)
# integrated_test_df = integrate_splits(split_test_df, test_df)

# integrated_train_df.to_csv("integrated_train_dataset.csv", index=False, encoding="utf-8")
# integrated_test_df.to_csv("integrated_test_dataset.csv", index=False, encoding="utf-8")
# print("Integrated train and test datasets saved as integrated_train_dataset.csv and integrated_test_dataset.csv")


Raw dataset saved as raw_dataset.csv
Raw train and test datasets saved as raw_train.csv and raw_test.csv
Split train and test datasets saved as split_train_dataset.csv and split_test_dataset.csv
