## Description
In this notebook, the datasets' German descriptions are preprocessed (stemming and lemmatization) for better use with the LLMs. This code does not handle the case where the German descriptions contain text in other languages.

#### Result
Consistent data that can be used for further implementation of the NLP.

The code was created with the assistance of ChatGPT-4.

In [None]:
import pandas as pd

# Read the semicolon-separated dataset descriptions into a DataFrame.
df = pd.read_csv("data/dataset_descriptions.csv", sep=';')

In [2]:
import re

# Function for text preprocessing
# Domain-specific filler words dropped from every description.
# All entries are lowercase; tokens are lowercased before the lookup.
_STOPWORDS = frozenset(
    {"schweiz", "mobil", "geo", "daten", "kanton", "verordnung", "information", "system"}
)


def preprocess_descriptions(text):
    """Clean a raw description string and return it lowercased.

    The steps run in a fixed order: markup and URLs are stripped first,
    then bracketed content, symbols and punctuation, then noise tokens
    (words with underscores, single characters, numbers) and finally the
    domain stopwords. Whitespace is collapsed to single spaces.

    Args:
        text: The description to clean. Must be a ``str``.

    Returns:
        The cleaned, lowercased description. May be an empty string if
        nothing remains after cleaning.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)

    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)

    # Collapse comma-separated repetitions of the same term ("a, a, a" -> "a").
    # Must run before brackets and commas are stripped below.
    text = re.sub(r'(\b\w+(?: \(\w+\))?\b)(, \1)+', r'\1', text)

    # Remove square brackets and their content
    text = re.sub(r'\[.*?\]', '', text)

    # Remove round brackets and their content
    text = re.sub(r'\(.*?\)', '', text)

    # Replace "+" with a space
    text = re.sub(r'\+', ' ', text)

    # Replace "#" and "|" with a space
    text = re.sub(r'[#|]', ' ', text)

    # Replace every run of hyphens with a single space
    text = re.sub(r'-+', ' ', text)

    # Replace punctuation marks with a space
    text = re.sub(r'[,.\-\(\)%’:;!?\'"/]', ' ', text)

    # Remove all words containing an underscore
    text = re.sub(r'\b\w*_\w*\b', '', text)

    # Remove single standalone word characters (letters, digits, "_")
    text = re.sub(r'\b\w\b', '', text)

    # Remove all standalone numbers
    text = re.sub(r'\b\d+\b', '', text)

    # Remove all types of quotation marks and asterisks
    text = re.sub(r'["„“‚‘«»*]', '', text)

    # Remove stopwords. The comparison is done on the lowercased token,
    # because the text is only lowercased at the very end and German nouns
    # are capitalised ("Schweiz", "Daten", ...).
    text = ' '.join(word for word in text.split() if word.lower() not in _STOPWORDS)

    # Clean up double spaces
    text = re.sub(r'\s+', ' ', text)

    return text.strip().lower()


In [3]:
# Run the full text preprocessing on every German description.
# Missing values are passed through unchanged so they can be dropped later.
df['dataset_description_DE'] = df['dataset_description_DE'].map(
    lambda value: value if pd.isna(value) else preprocess_descriptions(str(value))
)

In [None]:
# Export the cleaned descriptions, one per line, without header or index.
# Rows are dropped when the description is missing, when preprocessing
# reduced it to an empty string (it would otherwise be written as a bare
# `""` line), or when it duplicates an earlier description.
(
    df.dropna(subset=['dataset_description_DE'])
    .loc[lambda frame: frame['dataset_description_DE'].astype(str).str.strip() != '']
    .drop_duplicates(subset=['dataset_description_DE'])
    .to_csv(
        "data/dataset_descriptions_preprocessed.txt",
        columns=["dataset_description_DE"],
        header=False,
        index=False,
    )
)