In [None]:
# Script to convert events-2.xlsx to a .txt file with LOC and O tags
import pandas as pd
import json
import re
from nltk.corpus import stopwords
import nltk

# Make sure the NLTK stopword corpus is available. Only download it when it
# is not already installed, so re-running this cell does not contact the
# network (and print download chatter) every time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def extract_locations(location_data):
    """Extract location names from the JSON in the locations column.

    The cell content is wrapped in ``[...]`` before parsing, so it may hold a
    single JSON object, several comma-separated objects, or an already
    bracketed JSON array of objects. Every object that has a
    ``"properties.name"`` key contributes its value: a string is added as one
    name, a list contributes each of its string elements.

    Args:
        location_data: Raw text of the locations cell.

    Returns:
        A set of location name strings. Empty when the cell is not valid
        JSON or contains no names.
    """
    locations = set()
    try:
        # Wrapping in brackets lets "obj, obj, ..." parse as a list.
        parsed = json.loads(f"[{location_data}]")
    except ValueError:
        # Not JSON (e.g. the string "nan" for an empty cell): no locations.
        return locations

    pending = list(parsed)
    while pending:
        entry = pending.pop()
        if isinstance(entry, list):
            # A cell that was already a JSON array ends up nested one level
            # deeper after the wrapping above; flatten it.
            pending.extend(entry)
            continue
        if not isinstance(entry, dict):
            continue
        names = entry.get("properties.name")
        if isinstance(names, str):
            # set.update() on a string would add its individual characters,
            # so a single name must be added as a whole.
            if names.strip():
                locations.add(names)
        elif isinstance(names, list):
            locations.update(
                name for name in names
                if isinstance(name, str) and name.strip()
            )
    return locations

def tag_text(text, locations):
    """Tag each whitespace-separated token of ``text`` as ``LOC`` or ``O``.

    A token is tagged ``LOC`` when, after stripping non-word characters and
    lowercasing, it equals any non-stopword token of any location name.
    Location names are normalized the same way as the text tokens, so a name
    such as "St. Louis" matches "St." or "Louis," in the text.

    Args:
        text: The sentence or document to tag.
        locations: Iterable of location name strings.

    Returns:
        One ``"<token> <tag>"`` pair per line, joined with newlines. The
        original token (punctuation included) is kept in the output.
    """
    def normalize(token):
        # Drop punctuation and case so "Paris," and "paris" compare equal.
        return re.sub(r'[^\w]', '', token).lower()

    # Build the lookup set once; membership in it is the whole matching rule.
    location_tokens = set()
    for loc in locations:
        for raw in loc.split():
            token = normalize(raw)
            # Skip stopwords in either raw or normalized form so that
            # e.g. "of" in "Isle of Man" never tags every "of" in the text.
            if (token
                    and token not in stop_words
                    and raw.lower() not in stop_words):
                location_tokens.add(token)

    tagged_words = []
    for word in text.split():
        label = "LOC" if normalize(word) in location_tokens else "O"
        tagged_words.append(f"{word} {label}")

    return "\n".join(tagged_words)

def process_excel(file_path, output_txt):
    """Convert an Excel sheet into a token-per-line LOC/O tagging file.

    Reads ``file_path`` with pandas and uses its ``text`` and ``locations``
    columns. For each row, the location names are extracted from the
    ``locations`` cell, the ``text`` cell is tagged against them, and the
    result is written to ``output_txt`` followed by a blank line.

    Rows whose ``text`` cell is missing are skipped instead of aborting the
    run partway through the output file.

    Args:
        file_path: Path of the Excel workbook to read.
        output_txt: Path of the UTF-8 text file to write (overwritten).
    """
    df = pd.read_excel(file_path)
    # Empty cells come back as NaN; stringify so the JSON parser gets text.
    location_cells = df["locations"].astype(str)

    with open(output_txt, "w", encoding="utf-8") as f:
        for text, location_cell in zip(df["text"], location_cells):
            if pd.isna(text):
                # Nothing to tag for this row.
                continue
            locations = extract_locations(location_cell)
            # str() guards against cells pandas parsed as numbers.
            tagged_text = tag_text(str(text), locations)
            f.write(tagged_text + "\n\n")

# # Example usage:
# process_excel("events-2.xlsx", "output1.txt")