🇪🇺 Arne Krueger and Chad G. Petey Presents 🤓: 
# 🦅 Arne Krueger's Fun 🎉 with 📚 Patent Classifikation ✨

## Dear Patent Information Professionals - this is Session 2,

Patent classification systems, such as the International Patent Classification (IPC), are vital tools for organizing, searching, and analyzing patent information. IPC data is hierarchical, providing granular insights into technological fields, ranging from broad sections to detailed subgroups. To effectively work with and analyze this data, parsing IPC XML files is a critical skill, especially when large datasets need to be structured for meaningful insights.


In [12]:
from lxml import etree
import pandas as pd
import time
import os

# File path
file_path = "./ipc_schemes/EN_ipc_scheme_20240101.xml"

# Check if file exists
if not os.path.exists(file_path):
    print(f"Error: File '{file_path}' not found.")
    exit(1)

# Define XML tags and namespace
ipcEntry_tag = '{http://www.wipo.int/classifications/ipc/masterfiles}ipcEntry'
textBody_tag = '{http://www.wipo.int/classifications/ipc/masterfiles}textBody'

# Define levels of interest and mappings
kind_to_levelTitle = {
    's': 'section', 't': 'sub-section title', 'c': 'class', 'i': 'sub-class index',
    'u': 'sub-class', 'g': 'guidance heading', 'm': 'main group', '1': '.subgroup',
    '2': '..subgroup', '3': '...subgroup', '4': '....subgroup', '5': '.....subgroup',
    '6': '......subgroup', '7': '.......subgroup', '8': '........subgroup',
    '9': '.........subgroup', 'A': '..........subgroup', 'B': '...........subgroup',
    'n': 'note'
}
kind_to_level = {k: i + 1 for i, k in enumerate(kind_to_levelTitle.keys())}  # Convert keys to levels

# Data container for Pandas DataFrame
data = []
level_counts = {level: 0 for level in kind_to_level.values()}  # Initialize counts for each level

# Function to extract text from <textBody>
def get_text_body(entry):
    """Extract title text from <textBody> child of <ipcEntry>."""
    for child in entry:
        if child.tag == textBody_tag:
            # Collect all text from <textBody> and its descendants
            return "".join(child.itertext()).strip()
    return None  # Explicitly return None if <textBody> is missing

# Recursive walker function
def recWalker(node, kind_filter=None):
    """
    Recursively walks through XML nodes, extracting relevant information.
    :param node: Current XML node.
    :param kind_filter: Specific 'kind' to filter, or None to include all kinds.
    """
    for child in node:
        if child.tag == ipcEntry_tag:
            kind = child.attrib.get("kind")
            symbol = child.attrib.get("symbol")

            # Filter by 'kind', if specified
            if kind_filter is None or kind in kind_filter:
                title = get_text_body(child)  # Extract title
                level = kind_to_level.get(kind, None)  # Map kind to level
                leveltitle = kind_to_levelTitle.get(kind, None)  # Map kind to leveltitle

                data.append({
                    "kind": kind,
                    "symbol": symbol,
                    "title": title,
                    "level": level,
                    "leveltitle": leveltitle
                })

                # Update level count
                if level is not None:
                    level_counts[level] += 1

        # Recursive call for child nodes
        recWalker(child, kind_filter)

# Start the timer
start_time = time.time()

# Parse the XML and get the root element
parser = etree.XMLParser(remove_blank_text=True)
tree = etree.parse(file_path, parser)
root = tree.getroot()

print("Starting XML parsing...")

# Run the recursive walker
recWalker(root)

# Stop the timer
execution_time = time.time() - start_time

# Create Pandas DataFrame
df = pd.DataFrame(data)

# Print summary of the DataFrame
print(f"\nExtracted {len(df)} entries in {execution_time:.2f} seconds.\n")
print("DataFrame Content (first 15 rows):")
print(df.head(15))

# Print summary of counts per level
print("\nSummary of Counts Per Level:")
for level, count in sorted(level_counts.items()):
    if count > 0:
        title = next((k for k, v in kind_to_level.items() if v == level), None)
        title_desc = kind_to_levelTitle.get(title, 'Unknown Title')
        print(f"Level: {level} ({title_desc}), Count: {count}")

Starting XML parsing...

Extracted 81425 entries in 0.86 seconds.

DataFrame Content (first 15 rows):
   kind          symbol                                              title  \
0     s               A                                  HUMAN NECESSITIES   
1     t             A01                                        AGRICULTURE   
2     c             A01  AGRICULTUREFORESTRYANIMAL HUSBANDRYHUNTINGTRAP...   
3     u            A01B  SOIL WORKING IN AGRICULTURE OR FORESTRYPARTS, ...   
4     i            A01B  HAND TOOLS PLOUGHSGeneral construction , , , S...   
5     m  A01B0001000000                  Hand toolsedge trimmers for lawns   
6     1  A01B0001020000                                      SpadesShovels   
7     2  A01B0001040000                                         with teeth   
8     1  A01B0001060000                               HoesHand cultivators   
9     2  A01B0001080000                                with a single blade   
10    2  A01B0001100000                 

## Step 2

asdasdasd

In [13]:
import plotly.express as px


# Start the timer
start_time = time.time()

# Add a column for main class (first letter of the symbol)
df["main_class"] = df["symbol"].str[0]  # Extract the first letter of the symbol

# Print execution time
execution_time = time.time() - start_time
print(f"Added column mainclass in: {execution_time:.2f} seconds.")
start_time = time.time()

# Create a circle packing diagram with a custom color scale
fig = px.treemap(
    df,
    path=["main_class", "symbol", "title"],  # Hierarchy: Main Class > Symbol > Title
    values=None,                             # No aggregation; each item is shown
    color="main_class",                      # Color by main class
    color_discrete_sequence=px.colors.qualitative.Set3,  # Use a colorful preset
    hover_data={"kind": True},               # Keep 'kind' visible in the tooltip
    title="IPC Classifications - Circle Packing Visualization"
)

# Print execution time
execution_time = time.time() - start_time
print(f"Created the diagram in: {execution_time:.2f} seconds.")
start_time = time.time()

# Show the chart
print(df.head(15))
#fig.show()

Added column mainclass in: 0.02 seconds.
Created the diagram in: 16.47 seconds.
   kind          symbol                                              title  \
0     s               A                                  HUMAN NECESSITIES   
1     t             A01                                        AGRICULTURE   
2     c             A01  AGRICULTUREFORESTRYANIMAL HUSBANDRYHUNTINGTRAP...   
3     u            A01B  SOIL WORKING IN AGRICULTURE OR FORESTRYPARTS, ...   
4     i            A01B  HAND TOOLS PLOUGHSGeneral construction , , , S...   
5     m  A01B0001000000                  Hand toolsedge trimmers for lawns   
6     1  A01B0001020000                                      SpadesShovels   
7     2  A01B0001040000                                         with teeth   
8     1  A01B0001060000                               HoesHand cultivators   
9     2  A01B0001080000                                with a single blade   
10    2  A01B0001100000                            with two or

In [None]:
import plotly.express as px

df["main_class"] = df["symbol"].str[0]  # Extract the first letter of the symbol

# Dynamically extract section titles where kind == 's'
main_class_titles = df[df["kind"] == "s"].set_index("main_class")["title"].to_dict()

# Map the section titles to the `main_class` column
df["main_class_title"] = df["main_class"].map(main_class_titles)

# Truncate titles for display
df["short_title"] = df["title"].apply(lambda x: (x[:10] + "...") if len(x) > 10 else x)

# Create a circle packing diagram
fig = px.treemap(
    df,
    path=["main_class_title", "short_title"],  # Hierarchy: Main Class Title > Short Title
    values=None,                              # No aggregation; each item is shown
    color="main_class",                       # Color by main class
    color_discrete_sequence=px.colors.qualitative.Set3,  # Use a colorful preset
    hover_data={"title": True, "symbol": True, "kind": True},  # Show full title, symbol, and kind in tooltip
    title="IPC Classifications - Titles Only Visualization"
)

# Show the chart

print(df.head(15))
# fig.show()



In [None]:
import pandas as pd
import plotly.express as px
from ipywidgets import interact, widgets


# Dynamically extract section titles where kind == 's'
main_class_titles = df[df["kind"] == "s"].set_index("main_class")["title"].to_dict()

# Map the section titles to the `main_class` column
df["main_class_title"] = df["main_class"].map(main_class_titles)

# Truncate titles for display
df["short_title"] = df["title"].apply(lambda x: (x[:10] + "...") if len(x) > 10 else x)


# Function to filter and update the treemap
def update_treemap(selected_section):
    if selected_section == "All":
        filtered_df = df
        title = "IPC Classifications - All Sections"
    else:
        filtered_df = df[df["main_class_title"] == selected_section]
        title = f"IPC Classifications - {selected_section}"

    fig = px.treemap(
        filtered_df,
        path=["main_class_title", "short_title"],  # Hierarchy: Main Class Title > Short Title
        values=None,
        color="main_class",
        color_discrete_sequence=px.colors.qualitative.Set3,
        hover_data={"title": True, "symbol": True, "kind": True},
        title=title,
    )
    print(df.head(15))
    #fig.show()

# Create a dropdown for section selection
sections = ["All"] + df["main_class_title"].unique().tolist()
dropdown = widgets.Dropdown(
    options=sections,
    value="All",
    description="Section:",
    style={"description_width": "initial"},
)

# Use `interact` to link the dropdown with the update function
# interact(update_treemap, selected_section=dropdown)

