In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer

# Fetch each NLTK data package this notebook asks for
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the source CSV into a DataFrame
df = pd.read_csv(r"encyclopedia_single_definitions.csv")

# Preprocessing function
def preprocess_text(text):
    """Normalize a raw text value into a lowercase, space-separated string.

    Missing values (as detected by ``pd.isna``) and the empty string yield
    ``''``. Any other value is converted with ``str()``, lowercased, has every
    character that is neither a word character nor whitespace replaced by a
    space, has runs of whitespace collapsed to a single space, and is
    stripped of leading/trailing whitespace.

    Note that ``_`` counts as a word character for the regex and is
    therefore kept.
    """
    # Guard clauses: nothing to normalize for missing or empty input.
    if pd.isna(text):
        return ''
    if text == '':
        return ''

    normalized = str(text).lower()

    # First pass blanks out punctuation/symbols, second pass squeezes the
    # resulting whitespace runs; both substitute a single space.
    for pattern in (r'[^\w\s]', r'\s+'):
        normalized = re.sub(pattern, ' ', normalized)

    return normalized.strip()

# Normalize the text column in place (missing values become '' first)
df['vectorization_text'] = df['vectorization_text'].fillna('').apply(preprocess_text)

# Keep only rows whose ICD11 code is present and not blank after trimming
code_column = df['code']
has_code = code_column.notna() & code_column.astype(str).str.strip().ne('')
df_cleaned = df[has_code]

# Write the filtered frame to CSV without the index column
df_cleaned.to_csv("encyclopedia_sd_preprocessed.csv", index=False)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\oldys\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\oldys\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
