# Rule-based prediction

In [1]:
import pandas as pd
from tqdm import tqdm
import os

# Load the training dataset
train_df = pd.read_csv("/kaggle/input/kpdlhlv-9-splits/articles_training.tsv", sep="\t")

# Build the tag vocabulary: every distinct tag in the comma-separated `tags` column.
# - dropna(): a row without tags is read as NaN, and set.update(NaN) raises TypeError.
# - strip(): "a, b" must yield "b", not " b".
# - the trailing `if`: a blank piece (e.g. from "a,,b") would become the tag '',
#   which is a substring of every article and would be predicted everywhere.
unique_tags = {
    tag.strip()
    for tag_field in train_df['tags'].dropna().astype(str)
    for tag in tag_field.split(',')
    if tag.strip()
}

# Directory holding the test splits, and the directory that receives the results
split_folder = "/kaggle/input/kpdlhlv-9-splits"
output_folder = "/kaggle/working"

# Test split filenames, numbered 1 through 9, in processing order
split_files = [f"articles_testing_split_{split_no}.tsv" for split_no in range(1, 10)]

# Function to find tags in a content entry
def find_tags(content, tag_set):
    """Return the tags from ``tag_set`` that occur in ``content`` as substrings.

    Args:
        content: Text to scan. Any non-string value (for example the float NaN
            that pandas produces for an empty cell) is treated as having no tags.
        tag_set: Iterable of candidate tag strings.

    Returns:
        The matching tags joined with ``','`` in sorted order, or ``''`` when
        nothing matches.
    """
    # `tag in content` raises TypeError when content is not a string (e.g. NaN).
    if not isinstance(content, str):
        return ''
    # Skip empty tags: '' is a substring of every string and would always match.
    # Sort the result: set iteration order for strings changes between
    # interpreter runs, which made the output column non-reproducible.
    found_tags = sorted(tag for tag in tag_set if tag and tag in content)
    return ','.join(found_tags)

# Tag every test split in turn and write each result to the output folder
for split_file in split_files:
    # Announce the split before the load-and-match step starts
    print(f"Processing {split_file}...")
    split_path = os.path.join(split_folder, split_file)
    split_df = pd.read_csv(split_path, sep="\t")

    # Register tqdm's progress_apply on pandas objects, labelled with this split's name
    tqdm.pandas(desc=f"Processing {split_file}")
    split_df['tags'] = split_df['content'].progress_apply(
        lambda text: find_tags(text, unique_tags)
    )

    # Write the tagged split under its original name with a "processed_" prefix
    output_path = os.path.join(output_folder, f"processed_{split_file}")
    split_df.to_csv(output_path, sep="\t", index=False)
    print(f"Saved processed file to {output_path}")

print("Processing complete for selected split files.")


Processing articles_testing_split_1.tsv...


Processing articles_testing_split_1.tsv: 100%|██████████| 4878/4878 [26:08<00:00,  3.11it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_1.tsv
Processing articles_testing_split_2.tsv...


Processing articles_testing_split_2.tsv: 100%|██████████| 4878/4878 [25:53<00:00,  3.14it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_2.tsv
Processing articles_testing_split_3.tsv...


Processing articles_testing_split_3.tsv: 100%|██████████| 4878/4878 [25:55<00:00,  3.14it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_3.tsv
Processing articles_testing_split_4.tsv...


Processing articles_testing_split_4.tsv: 100%|██████████| 4878/4878 [26:36<00:00,  3.05it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_4.tsv
Processing articles_testing_split_5.tsv...


Processing articles_testing_split_5.tsv: 100%|██████████| 4878/4878 [26:47<00:00,  3.04it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_5.tsv
Processing articles_testing_split_6.tsv...


Processing articles_testing_split_6.tsv: 100%|██████████| 4878/4878 [26:16<00:00,  3.09it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_6.tsv
Processing articles_testing_split_7.tsv...


Processing articles_testing_split_7.tsv: 100%|██████████| 4878/4878 [26:44<00:00,  3.04it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_7.tsv
Processing articles_testing_split_8.tsv...


Processing articles_testing_split_8.tsv: 100%|██████████| 4878/4878 [26:46<00:00,  3.04it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_8.tsv
Processing articles_testing_split_9.tsv...


Processing articles_testing_split_9.tsv: 100%|██████████| 4879/4879 [25:33<00:00,  3.18it/s]


Saved processed file to /kaggle/working/processed_articles_testing_split_9.tsv
Processing complete for selected split files.


# Merge the predicted split sets

In [2]:
import pandas as pd
import os

# Directory that holds the per-split prediction files and receives the merged file
split_folder = "/kaggle/working/"

# Processed split filenames for splits 1 through 9; this order is the row order of the merge
processed_files = [
    f"processed_articles_testing_split_{split_no}.tsv" for split_no in range(1, 10)
]

# Load every split in list order and stack them under one fresh 0..n-1 index
merged_df = pd.concat(
    [
        pd.read_csv(os.path.join(split_folder, name), sep="\t")
        for name in processed_files
    ],
    ignore_index=True,
)

# Write the combined predictions alongside the per-split files
output_path = os.path.join(split_folder, "articles_testing_with_tags_merged.tsv")
merged_df.to_csv(output_path, sep="\t", index=False)

print(f"Merged file saved as {output_path}")


Merged file saved as /kaggle/working/articles_testing_with_tags_merged.tsv


# Post-processing (keep whole-word matches only)

In [3]:
import pandas as pd
import re

# Function to filter tags based on whole-word or whole-phrase matching
def filter_whole_phrase_tags(content, tags):
    """Keep only the tags that occur in ``content`` as a whole word or phrase.

    A tag counts as a whole-phrase match when the characters immediately before
    and after it in ``content`` are not word characters (or it sits at the very
    start or end of the text).

    Args:
        content: Article text. ``None`` and NaN are treated as empty text; any
            other non-string value is converted with ``str()``.
        tags: Comma-separated candidate tags. A non-string value (for example
            the NaN that pandas yields for an empty cell) means "no tags".

    Returns:
        The surviving tags joined with ``','`` in their original order, or
        ``''`` when none survive.
    """
    # An empty tag field is read back from TSV as float NaN, which has no .split().
    if not isinstance(tags, str):
        return ''
    # Missing content must not be stringified to "nan" (it would match a tag "nan").
    if content is None or (isinstance(content, float) and content != content):
        text = ''
    else:
        text = str(content)

    filtered_tags = []
    for tag in tags.split(','):
        stripped = tag.strip()
        # Blank pieces (e.g. from "a,,b") are not tags.
        if not stripped:
            continue
        pattern = re.escape(stripped)
        # Lookarounds rather than \b: \b requires a word character on one side,
        # so it can never match at the edge of a tag that starts or ends with a
        # non-word character (e.g. "C++" followed by a space).
        if re.search(rf'(?<!\w){pattern}(?!\w)', text):
            filtered_tags.append(tag)
    return ','.join(filtered_tags)

# Load the predicted results file
input_file = "/kaggle/working/articles_testing_with_tags_merged.tsv"  # Replace with the actual file path
output_file = "sol3.tsv"

df = pd.read_csv(input_file, sep="\t")

# Apply the filtering function row by row.
# A row whose prediction was empty is written to TSV as an empty field and read
# back as NaN (a float), so non-string tag cells are replaced with '' before
# filtering. Iterating the two columns with zip avoids building a Series for
# every row, which DataFrame.apply(axis=1) does.
df['tags'] = [
    filter_whole_phrase_tags(content, tags if isinstance(tags, str) else '')
    for content, tags in zip(df['content'], df['tags'])
]

# Save the filtered results
df.to_csv(output_file, sep="\t", index=False)

print(f"Filtered results saved to {output_file}")


Filtered results saved to sol3.tsv
