In [32]:
from lxml import etree

def clean_xml(xml_file: str, output_file: str):
    """
    Strip bulky content from a Tableau workbook XML (TWB) file.

    The workbook is parsed with a recovering lxml parser, then the following
    elements are detached from the tree (together with their subtrees):

    - every <thumbnail> element
    - every <image> element
    - every <value> element found anywhere below a <filter> element
    - every <extract> element

    The remaining tree is pretty-printed to ``output_file`` as UTF-8 with an
    XML declaration, and a confirmation line is printed.

    Args:
        xml_file: Path of the TWB file to read.
        output_file: Path the cleaned XML is written to.
    """

    def _detach(nodes):
        # Unlink each node from its parent; nodes that have no parent
        # (for example ones already removed with an ancestor) are skipped.
        for node in nodes:
            owner = node.getparent()
            if owner is not None:
                owner.remove(node)

    xml_parser = etree.XMLParser(
        strip_cdata=True,
        encoding="utf-8",
        recover=True,
        remove_blank_text=True,
    )
    document = etree.parse(xml_file, xml_parser)
    workbook = document.getroot()

    # 1. Thumbnails
    _detach(workbook.findall(".//thumbnail"))

    # 2. Embedded images
    _detach(workbook.findall(".//image"))

    # 3. Filter values: only <value> elements that sit under a <filter>
    for filter_node in workbook.findall(".//filter"):
        _detach(filter_node.findall(".//value"))

    # 4. Extracts
    _detach(workbook.findall(".//extract"))

    # Persist the cleaned tree
    document.write(
        output_file,
        pretty_print=True,
        encoding="utf-8",
        xml_declaration=True,
    )
    print(f"✅ Cleaned XML saved to {output_file}")


In [36]:
# Input workbook and cleaned-output paths, both resolved against the current working directory.
orignal_file = f"{os.getcwd()}/tableau.twb"
clean_file = f"{os.getcwd()}/tableau_cleaned.twb"

In [35]:
# Display the resolved input path as the cell output.
orignal_file

'/Users/mj/Documents/udemy_llm/projects/llm_engineering/week2/TableauGenAI/tableau.twb'

In [37]:
# Clean the original workbook and write the result to clean_file.
clean_xml(orignal_file,clean_file)

✅ Cleaned XML saved to /Users/mj/Documents/udemy_llm/projects/llm_engineering/week2/TableauGenAI/tableau_cleaned.twb


In [44]:
import xml.etree.ElementTree as ET

def clean_xml(twb_file, output_file):
    """
    Clean Tableau TWB XML for metadata extraction.

    Parses ``twb_file`` with the standard-library ElementTree parser, removes
    every descendant element whose tag appears in ``remove_tags`` (including
    its whole subtree), strips the attributes listed in ``noisy_attributes``
    from all remaining elements, and writes the result to ``output_file`` as
    UTF-8 with an XML declaration.

    The root element itself is never removed, only its descendants.

    Args:
        twb_file: Path (or file object) of the workbook XML to read.
        output_file: Path (or file object) the cleaned XML is written to.

    Raises:
        xml.etree.ElementTree.ParseError: If the input is not well-formed XML.
    """
    parser = ET.XMLParser(encoding="utf-8")
    tree = ET.parse(twb_file, parser=parser)
    root = tree.getroot()

    # Tags commonly safe to remove
    remove_tags = {
        "style",                # XML styling
        "window",               # window definitions
        "window-info",          # Tableau window info
        "repository-location",  # repository paths
        "color-palette",        # UI colors
        "color",                # UI colors
        "format",               # visual formatting
        "annotation",           # user annotations
        "thumbnails",           # preview thumbnails
        "thumbnail",            # embedded thumbnail images
        "formatted-text",       # text formatting
        "repository-url",       # repository links
        "custom-views",         # personalized views
        "user",                 # user personalization info
        "metadata-record",      # internal metadata cache
        "image",                # embedded images
        "picture",              # embedded pictures
        "graphic",              # UI graphics
        "map-source",           # Tableau maps
        "map",                  # map layers
        "bin",                  # bin
    }

    # Standard-library Elements keep no reference to their parent (there is
    # no getparent()), so removal has to be driven from the parent side:
    # visit every element and drop any direct child whose tag is unwanted.
    # Both sequences are snapshotted with list() so that removing children
    # does not disturb the iteration.
    for parent in list(root.iter()):
        for child in list(parent):
            if child.tag in remove_tags:
                parent.remove(child)

    # Also remove attributes that may carry noise (style, formatting)
    noisy_attributes = ["style", "color", "font", "caption", "alias"]
    for elem in root.iter():
        for noisy_attr in noisy_attributes:
            elem.attrib.pop(noisy_attr, None)

    # Write cleaned XML
    tree.write(output_file, encoding="utf-8", xml_declaration=True)
    print(f"Cleaned XML written to {output_file}")


In [45]:
# Point the output at a new file under the current working directory.
clean_file = f"{os.getcwd()}/tableau_cleaned_1.twb"

In [46]:
# Clean the original workbook again, writing to the updated clean_file path.
clean_xml(orignal_file,clean_file)

Cleaned XML written to /Users/mj/Documents/udemy_llm/projects/llm_engineering/week2/TableauGenAI/tableau_cleaned_1.twb


In [53]:
from lxml import etree

# Element names that are dropped entirely, together with their subtrees
# (no business meaning). "value" is listed so that <value> elements inside
# <bin> are removed as well.
REMOVE_TAGS = {
    "aliases",
    "bin",
    "color",
    "color-palette",
    "conditional-format",
    "drill-path",
    "filter-values",
    "format",
    "groupfilter",
    "legend",
    "map-style",
    "page",
    "presentation",
    "style",
    "style-map",
    "style-rule",
    "thumbnail",
    "value",
    "viewpoint",
    "window",
}

# Attribute names that are stripped from every element that carries them.
REMOVE_ATTRS = {
    "alias",
    "caption",
    "color",
    "color-palette-name",
    "format",
    "height",
    "pane",
    "size",
    "style",
    "table",
    "thumbnail",
    "width",
    "worksheet-position",
    "x",
    "y",
    "z",
    "zone",
}

def clean_tableau_xml(input_file: str, output_file: str):
    """
    Reduce a Tableau workbook XML file to its non-presentational content.

    Steps, in order:

    1. Parse ``input_file`` with a recovering lxml parser that discards
       whitespace-only text.
    2. Detach every descendant element whose tag is in ``REMOVE_TAGS``
       (children go with it).
    3. Delete every attribute whose name is in ``REMOVE_ATTRS``.
    4. Blank the text of any element whose text contains the literal
       ``<![CDATA[`` marker.
    5. Pretty-print the tree to ``output_file`` as UTF-8 with an XML
       declaration and print a confirmation line.

    Args:
        input_file: Path of the workbook XML to read.
        output_file: Path the cleaned XML is written to.
    """
    document = etree.parse(
        input_file,
        etree.XMLParser(recover=True, remove_blank_text=True),
    )
    workbook = document.getroot()

    # Drop unwanted elements; anything without a parent is left alone.
    for tag_name in REMOVE_TAGS:
        for node in workbook.findall(f".//{tag_name}"):
            owner = node.getparent()
            if owner is None:
                continue
            owner.remove(node)

    # Drop unwanted attributes. The names are collected first so the
    # attribute mapping is not modified while it is being read.
    for node in workbook.iter():
        doomed = [name for name in node.attrib.keys() if name in REMOVE_ATTRS]
        for name in doomed:
            del node.attrib[name]

    # Blank text that carries a literal CDATA marker.
    # NOTE(review): lxml appears to expose CDATA sections as plain text, so
    # this check may never match in practice. Confirm the intended behaviour.
    for node in workbook.iter():
        content = node.text
        if content and "<![CDATA[" in content:
            node.text = ""

    # Persist the cleaned tree.
    document.write(
        output_file,
        pretty_print=True,
        encoding="utf-8",
        xml_declaration=True,
    )

    print(f"✅ Cleaned XML written to {output_file}")


In [59]:
# NOTE: clean_file is not reassigned before this call; the output below shows
# it still points at tableau_cleaned_1.twb, so this run overwrites that file.
clean_tableau_xml(orignal_file,clean_file)

✅ Cleaned XML written to /Users/mj/Documents/udemy_llm/projects/llm_engineering/week2/TableauGenAI/tableau_cleaned_1.twb


In [77]:
from lxml import etree

# Element names that are dropped entirely, together with their subtrees
# (no business meaning).
REMOVE_TAGS = {
    "aliases",
    "bin",
    "breakdown",
    "color",
    "color-palette",
    "conditional-format",
    "customized-label",
    "drill-path",
    "encodings",
    "filter-values",
    "format",
    "formatted-text",
    "groupfilter",
    "legend",
    "map-style",
    "mark",
    "page",
    "pane",
    "panes",
    "presentation",
    "run",
    "style",
    "style-map",
    "style-rule",
    "thumbnail",
    "value",
    "view",
    "viewpoint",
    "window",
}


# Attribute names that are stripped from every element that carries them.
REMOVE_ATTRS = {
    "alias",
    "caption",
    "color",
    "color-palette-name",
    "format",
    "height",
    "pane",
    "size",
    "style",
    "table",
    "thumbnail",
    "width",
    "worksheet-position",
    "x",
    "y",
    "z",
    "zone",
}

def clean_tableau_xml(input_file: str, output_file: str):
    """
    Reduce a Tableau workbook XML file to its non-presentational content.

    Steps, in order:

    1. Parse ``input_file`` with a recovering lxml parser that discards
       whitespace-only text.
    2. Detach every descendant element whose local name is in
       ``REMOVE_TAGS`` (children go with it). Matching uses XPath
       ``local-name()``, so it does not depend on the element's namespace.
    3. Delete every attribute whose name is in ``REMOVE_ATTRS``.
    4. Blank the text of any element whose text contains the literal
       ``<![CDATA[`` marker.
    5. Pretty-print the tree to ``output_file`` as UTF-8 with an XML
       declaration and print a confirmation line.

    Args:
        input_file: Path of the workbook XML to read.
        output_file: Path the cleaned XML is written to.
    """
    document = etree.parse(
        input_file,
        etree.XMLParser(recover=True, remove_blank_text=True),
    )
    workbook = document.getroot()

    # Drop unwanted elements regardless of namespace; anything without a
    # parent is left alone.
    for tag_name in REMOVE_TAGS:
        matches = workbook.xpath(f".//*[local-name()='{tag_name}']")
        for node in matches:
            owner = node.getparent()
            if owner is None:
                continue
            owner.remove(node)

    # Drop unwanted attributes. The names are collected first so the
    # attribute mapping is not modified while it is being read.
    for node in workbook.iter():
        doomed = [name for name in node.attrib.keys() if name in REMOVE_ATTRS]
        for name in doomed:
            del node.attrib[name]

    # Blank text that carries a literal CDATA marker.
    # NOTE(review): lxml appears to expose CDATA sections as plain text, so
    # this check may never match in practice. Confirm the intended behaviour.
    for node in workbook.iter():
        content = node.text
        if content and "<![CDATA[" in content:
            node.text = ""

    # Persist the cleaned tree.
    document.write(
        output_file,
        pretty_print=True,
        encoding="utf-8",
        xml_declaration=True,
    )

    print(f"✅ Cleaned XML written to {output_file}")


In [78]:
# Point the output at a new file under the current working directory.
clean_file = f"{os.getcwd()}/tableau_cleaned_2.twb"

In [79]:
# Clean the original workbook and write the result to the updated clean_file path.
clean_tableau_xml(orignal_file,clean_file)

✅ Cleaned XML written to /Users/mj/Documents/udemy_llm/projects/llm_engineering/week2/TableauGenAI/tableau_cleaned_2.twb
