In [None]:
import json
import string
from pathlib import Path

import nltk
import spacy
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.tokenize import word_tokenize

# spaCy English pipeline; preprocess() runs text through it to obtain lemmas.
nlp = spacy.load("en_core_web_sm")

# NLTK resources required by word_tokenize and the stop-word list.
# Newer NLTK releases load the Punkt tokenizer data from "punkt_tab" instead
# of the pickled "punkt" package, so both are requested to keep the notebook
# runnable on a fresh kernel regardless of the installed NLTK version.
nltk.download("punkt")
nltk.download("punkt_tab")
nltk.download("stopwords")

# Configs
LOWER_CASE = True          # lowercase the text before tokenizing
REMOVE_STOPWORDS = True    # drop tokens found in the English stop-word list
LEMMATIZE = True           # toggle for the lemmatization step
STEM = False               # apply Porter stemming to the final tokens

stop_words = set(stopwords.words("english"))
# NOTE(review): `lemmatizer` is not referenced by the cells visible here
# (lemmas come from spaCy); kept in case a later cell relies on it.
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()


In [None]:
def preprocess(text):
    """Normalize a piece of text according to the module-level config flags.

    Steps, in order:
      1. lowercase the text (``LOWER_CASE``),
      2. tokenize with NLTK ``word_tokenize``,
      3. drop tokens that consist only of punctuation characters,
      4. drop English stop words (``REMOVE_STOPWORDS``),
      5. replace tokens by their spaCy lemmas (``LEMMATIZE``),
      6. apply Porter stemming (``STEM``).

    Args:
        text: The raw string to clean.

    Returns:
        The remaining tokens joined by single spaces.
    """
    if LOWER_CASE:
        text = text.lower()

    tokens = word_tokenize(text)
    # Test every character instead of doing a substring lookup in
    # string.punctuation: word_tokenize emits multi-character punctuation
    # tokens such as "``", "''", "..." or "--" that are not substrings of
    # string.punctuation and would otherwise slip through.
    tokens = [t for t in tokens if not all(ch in string.punctuation for ch in t)]

    if REMOVE_STOPWORDS:
        # The stop-word list is lowercase; compare case-insensitively so the
        # filter still works when LOWER_CASE is disabled.
        tokens = [t for t in tokens if t.lower() not in stop_words]

    if LEMMATIZE:
        # spaCy re-tokenizes the joined string, so the lemma list may not
        # align one-to-one with the tokens produced above.
        doc = nlp(" ".join(tokens))
        tokens = [token.lemma_ for token in doc]

    if STEM:
        tokens = [stemmer.stem(t) for t in tokens]

    return " ".join(tokens)

In [None]:
# Load data
input_path = "data/train-claims.json"
with open(input_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Preprocess based on configs
for claim_id, claim in data.items():
    # `or ""` also covers records whose claim_text is an explicit JSON null.
    original_text = claim.get("claim_text", "") or ""
    processed_text = preprocess(original_text)
    claim["processed_text"] = processed_text

# Save output
# Build the output name from the input file's stem only. Interpolating the
# whole input path would yield "processed/processed-data/train-claims.json.json",
# i.e. a nested directory that does not exist and a doubled extension.
output_path = Path("processed") / f"processed-{Path(input_path).stem}.json"
# Make sure the target directory exists so the write works on a fresh checkout.
output_path.parent.mkdir(parents=True, exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2)

print(f"Processed claims saved to: {output_path}")