In [1]:
import gzip
import os
import re
import xml.etree.ElementTree as ET
from io import BytesIO
from urllib.parse import parse_qs, urlsplit

import requests
from tqdm.notebook import tqdm

In [2]:
def is_one_word(term):
    """Return True when ``term`` is non-empty and made only of ASCII letters, digits, or hyphens."""
    # fullmatch yields a (always truthy) Match object on success and None otherwise.
    whole_match = re.fullmatch(r"[a-zA-Z0-9\-]+", term)
    return bool(whole_match)

def extract_sitemap_urls(index_file_path, max_sitemaps=1520):
    """Extract sitemap URLs from a sitemap index XML file.

    Args:
        index_file_path: Path (or file object) of the sitemap index; passed
            straight to ``xml.etree.ElementTree.parse``.
        max_sitemaps: Maximum number of URLs to return. ``None`` returns all
            of them.

    Returns:
        A list of sitemap URL strings in document order, with surrounding
        whitespace removed and empty ``<loc>`` entries skipped.

    Raises:
        OSError: If the index file cannot be opened.
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
    """
    root = ET.parse(index_file_path).getroot()
    ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}

    urls = []
    for loc in root.findall("ns:sitemap/ns:loc", ns):
        # ``loc.text`` is None for an empty element, and pretty-printed XML
        # leaves newlines/indentation around the URL; neither is a usable URL.
        url = (loc.text or "").strip()
        if url:
            urls.append(url)
    return urls[:max_sitemaps]

In [3]:
def extract_terms_from_sitemap(sitemap_url):
    """Download one gzipped sitemap and return its one-word slang terms.

    Only ``<loc>`` URLs containing ``define.php?term=`` are considered. The
    ``term`` query parameter is URL-decoded (``+`` and percent-escapes),
    stripped, checked with ``is_one_word`` and lower-cased.

    Args:
        sitemap_url: URL of a ``.gz`` sitemap file.

    Returns:
        A list of lower-cased terms (duplicates are not removed). Best-effort:
        if the download, decompression or XML parsing fails, the error is
        printed and an empty list is returned instead of raising.
    """
    try:
        response = requests.get(sitemap_url, timeout=20)
        # Surface HTTP 4xx/5xx as such instead of feeding an error page to
        # gzip and reporting a misleading "not a gzipped file" message.
        response.raise_for_status()
        with gzip.GzipFile(fileobj=BytesIO(response.content)) as f:
            tree = ET.parse(f)
            root = tree.getroot()
            ns = {'ns': 'http://www.sitemaps.org/schemas/sitemap/0.9'}
            terms = []
            for url_elem in root.findall("ns:url", ns):
                loc_elem = url_elem.find("ns:loc", ns)
                # A <url> without a usable <loc> is skipped rather than
                # aborting (and discarding) the whole sitemap.
                if loc_elem is None or not loc_elem.text:
                    continue
                loc = loc_elem.text.strip()
                if "define.php?term=" not in loc:
                    continue
                # Parse the query string properly so percent-encoding is
                # decoded and extra parameters/fragments do not leak into
                # the term.
                values = parse_qs(urlsplit(loc).query).get("term")
                if not values:
                    continue
                # Last occurrence wins, matching the previous split()[-1].
                term = values[-1].strip()
                if is_one_word(term):
                    terms.append(term.lower())
            return terms
    except Exception as e:
        print(f"⚠️ Error processing {sitemap_url}: {e}")
        return []

In [4]:
# === CONFIGURATION ===
# The sitemap index is expected in the notebook's working directory (the same
# folder the output file is written to), so the path is kept relative instead
# of being tied to one user's absolute directory layout.
SITEMAP_INDEX_FILE = "urbandict_sitemap_gz.xml"
MAX_SITEMAPS = 1520  # You can reduce this temporarily for testing
OUTPUT_FILE = "urban_dictionary_one_word_slangs.txt"

# === MAIN PROCESS ===

sitemap_urls = extract_sitemap_urls(SITEMAP_INDEX_FILE, max_sitemaps=MAX_SITEMAPS)
print(f"📦 Found {len(sitemap_urls)} sitemap files.")

all_terms = set()
# extract_terms_from_sitemap returns [] both for failures and for sitemaps
# without matching terms; count those so silent data loss is visible.
empty_sitemap_count = 0

for sitemap_url in tqdm(sitemap_urls, desc="🔄 Processing sitemaps"):
    terms = extract_terms_from_sitemap(sitemap_url)
    if not terms:
        empty_sitemap_count += 1
    all_terms.update(terms)

print(f"\n✅ Extracted {len(all_terms):,} unique one-word slang terms.")
if empty_sitemap_count:
    print(f"⚠️ {empty_sitemap_count} sitemap(s) yielded no terms (failed or empty); results may be incomplete.")

📦 Found 1520 sitemap files.


🔄 Processing sitemaps:   0%|          | 0/1520 [00:00<?, ?it/s]


✅ Extracted 1,636,457 unique one-word slang terms.


In [6]:
# Keep only the purely alphabetic terms (drops anything with digits or hyphens).
filtered_terms = set(filter(str.isalpha, all_terms))

remaining_count = len(filtered_terms)
print(f"🧹 Filtered terms: {remaining_count:,} remain after removing non-alphabetic ones.")

🧹 Filtered terms: 1,540,197 remain after removing non-alphabetic ones.


In [7]:
# Write the surviving terms, one per line, in sorted order.
with open(OUTPUT_FILE, "w", encoding="utf-8") as out_file:
    out_file.writelines(f"{word}\n" for word in sorted(filtered_terms))

saved_path = os.path.abspath(OUTPUT_FILE)
print(f"💾 Saved one-word slang terms to: {saved_path}")

💾 Saved one-word slang terms to: C:\Users\neelb\Desktop\MEDD8925 - Analyzing Data Quantitatively\Final Project\urban_dictionary_one_word_slangs.txt
