## Description
In this notebook, the preprocessing of the German titles, descriptions, and keywords of each entry is performed. The cleaned and combined text data will be prepared for further use.

#### Improvements to consider
- Entries without any valid German text are deleted, so currently only datasets with German text are retained.
- Applying keyword-extraction might improve the results. This can be tested later.

The code was created with the assistance of ChatGPT-4.

In [47]:
import pandas as pd
import re

# Source table with the merged dataset metadata, and the target path for the cleaned table.
inputdata_file = 'data/merged_dataset_metadata.csv'
outputdata_file = 'data/01_preprocessed_merged_dataset_metadata.csv'

# Load the merged metadata into a DataFrame.
df = pd.read_csv(filepath_or_buffer=inputdata_file)

0                                                      NaN
1        ['haushalte', 'kantonzuerich', 'gemeinden', 'b...
2                                                      NaN
3                                                      NaN
4                                                      NaN
                               ...                        
29074                                                  NaN
29075                                                  NaN
29076                                                  NaN
29077                                                  NaN
29078                                                  NaN
Name: dataset_keyword_DE, Length: 29079, dtype: object


  df = pd.read_csv(inputdata_file)


In [50]:
# Function for text preprocessing dataset_title_DE
def preprocess_dataset_title_DE(text):
    """Normalise a German dataset title into lowercase plain words.

    The steps run in a fixed order: comma-separated repetitions of a term are
    collapsed, bracketed passages, dotted tokens, underscore tokens,
    standalone numbers and single characters are dropped, separators and
    punctuation are removed, whitespace is collapsed and the result is
    lowercased.

    Args:
        text (str): Raw title as a string.

    Returns:
        str: The cleaned title; may be an empty string.
    """
    # Collapse comma-separated repetitions of the same term ("A, A, A" -> "A").
    # The trailing (?!\w) makes sure the repetition is a whole word: without
    # it, "Bahn, Bahnhof" was treated as "Bahn, Bahn" + "hof" and cut down to
    # "Bahnhof", silently dropping the word "Bahn".
    text = re.sub(r'\b(\w+(?: \(\w+\))?)(?:, \1)+(?!\w)', r'\1', text)
    # Remove square brackets and their content
    text = re.sub(r'\[.*?\]', '', text)
    # Remove round brackets and their content
    text = re.sub(r'\(.*?\)', '', text)
    # Remove all words containing a dot (e.g. "daten.csv", "z.B")
    text = re.sub(r'\b\w*\.\w*\b', '', text)
    # Remove file formats.
    # Note: every "name.ext" token has already been removed by the rule above,
    # so this only acts as a safeguard should that rule ever be narrowed.
    text = re.sub(r'\b\w+\.(csv|json|shp|xls|parquet|rdfxml|jsonld|jsonl|dxf|gpkg|turtle)\b', '', text)
    # Replace "+" with a space
    text = re.sub(r'\+', ' ', text)
    # Replace "#" and "|" with a space
    text = re.sub(r'[#|]', ' ', text)
    # Replace every run of hyphens with a single space
    text = re.sub(r'[-]+', ' ', text)
    # Remove punctuation marks (deleted without inserting a space)
    text = re.sub(r'[,.«»‚’°%*:;!?\'"/]', '', text)
    # Remove all words containing an underscore
    text = re.sub(r'\b\w*_\w*\b', '', text)
    # Remove all standalone numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Remove single standalone word characters
    text = re.sub(r'\b\w\b', '', text)
    # Collapse any run of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    # Trim the ends and convert to lowercase
    return text.strip().lower()

# Function for text preprocessing dataset_description_DE
def preprocess_dataset_description_DE(text):
    """Normalise a German dataset description into lowercase plain words.

    The steps run in a fixed order: HTML tags and URLs are removed,
    comma-separated repetitions of a term are collapsed, bracketed passages
    are dropped, separators and punctuation are replaced by spaces,
    underscore tokens, single characters and standalone numbers are removed,
    quotation marks and asterisks are deleted, whitespace is collapsed and
    the result is lowercased.

    Args:
        text (str): Raw description as a string.

    Returns:
        str: The cleaned description; may be an empty string.
    """
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # Remove URLs
    text = re.sub(r'http[s]?://\S+', '', text)
    # Collapse comma-separated repetitions of the same term ("A, A, A" -> "A").
    # The trailing (?!\w) makes sure the repetition is a whole word: without
    # it, "Bahn, Bahnhof" was treated as "Bahn, Bahn" + "hof" and cut down to
    # "Bahnhof", silently dropping the word "Bahn".
    text = re.sub(r'\b(\w+(?: \(\w+\))?)(?:, \1)+(?!\w)', r'\1', text)
    # Remove square brackets and their content
    text = re.sub(r'\[.*?\]', '', text)
    # Remove round brackets and their content
    text = re.sub(r'\(.*?\)', '', text)
    # Replace "+" with a space
    text = re.sub(r'\+', ' ', text)
    # Replace "#" and "|" with a space
    text = re.sub(r'[#|]', ' ', text)
    # Replace every run of hyphens with a single space
    text = re.sub(r'-+', ' ', text)
    # Replace punctuation marks (and any unmatched brackets) with a space
    text = re.sub(r'[,.\-\(\)%’:;!?\'"/]', ' ', text)
    # Remove all words containing an underscore
    text = re.sub(r'\b\w*_\w*\b', '', text)
    # Remove single standalone word characters
    text = re.sub(r'\b\w\b', '', text)
    # Remove all standalone numbers
    text = re.sub(r'\b\d+\b', '', text)
    # Remove all types of quotation marks and asterisks
    text = re.sub(r'["„“‚‘«»*]', '', text)
    # Collapse any run of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    # Trim the ends and convert to lowercase
    return text.strip().lower()

def preprocess_dataset_keyword_DE(text_list):
    """Turn a keyword list (or its string representation) into plain lowercase text.

    Args:
        text_list (str | Iterable): Either a string such as
            "['haushalte', 'gemeinden']" or an iterable of keywords. Items of
            an iterable are converted with str() and joined with spaces.

    Returns:
        str: The keywords as one lowercase string with single spaces between
        them and the listed punctuation removed. Digits are kept.
    """
    if isinstance(text_list, str):
        # A string is used as is (this is what ''.join() did for strings).
        text = text_list
    else:
        # Join real lists with spaces so that keywords are not glued together.
        text = ' '.join(str(item) for item in text_list)
    # Remove the listed punctuation. The hyphen is escaped on purpose: the
    # former class "[.,-;:...]" contained the range ",-;" which also deleted
    # every digit (e.g. "covid19" -> "covid"). "/" was part of that range and
    # is therefore still removed; double quotes are removed as well because a
    # list repr uses them for items that contain an apostrophe.
    text = re.sub(r'[.,\-;:!\'"?/\[\]]', '', text)
    # Collapse any run of whitespace into one space
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase and strip extra spaces
    return text.strip().lower()

def combine_preprocessed_columns(title_DE, description_DE, keyword_DE):
    """Join the cleaned title, description and keywords into one text.

    Missing values (NaN/None) and parts that are empty after stripping are
    skipped, so the result never contains leading, trailing or doubled
    separator spaces.

    Args:
        title_DE: Cleaned title, or a missing value.
        description_DE: Cleaned description, or a missing value.
        keyword_DE: Cleaned keywords, or a missing value.

    Returns:
        str: The non-empty parts separated by single spaces; '' if all parts
        are missing or empty.
    """
    parts = []
    for value in (title_DE, description_DE, keyword_DE):
        # NaN values are floats, so test for missing before converting to str.
        if pd.notna(value):
            piece = str(value).strip()
            # Skipping empty pieces avoids "a  b" when the middle part is empty.
            if piece:
                parts.append(piece)
    return ' '.join(parts)


In [51]:
# Clean the German title, description and keyword columns.
# Missing values (NaN) are passed through unchanged; everything else is cast to str first.
column_cleaners = {
    'dataset_title_DE': preprocess_dataset_title_DE,
    'dataset_description_DE': preprocess_dataset_description_DE,
    'dataset_keyword_DE': preprocess_dataset_keyword_DE,
}
for source_column, cleaner in column_cleaners.items():
    df[f'{source_column}_preprocessed'] = df[source_column].map(
        lambda value, clean=cleaner: clean(str(value)) if pd.notna(value) else value
    )

# Merge the three cleaned columns into one combined text column.
preprocessed_columns = [f'{name}_preprocessed' for name in column_cleaners]
combined_column = 'dataset_combined_title_description_keyword_preprocessed'
df[combined_column] = df.apply(
    lambda row: combine_preprocessed_columns(
        *(row[name] for name in preprocessed_columns)
    ),
    axis=1,
)

# Discard rows whose combined text is empty or whitespace-only.
# !!! This step should not be necessary later !!!
original_row_count = len(df)
has_text = df[combined_column].str.strip().ne('')
df = df[has_text]
new_row_count = len(df)
deleted_rows = original_row_count - new_row_count
print(f"Number of entries deleted: {deleted_rows}")

# The intermediate per-column results are not used any further.
df = df.drop(columns=preprocessed_columns)

# Empty column reserved for labelling.
df['mobility_dataset'] = None

Number of entries deleted: 5462


In [52]:
# Save the cleaned table to the output CSV file, leaving out the DataFrame index.
df.to_csv(path_or_buf=outputdata_file, index=False)

# Confirmation message (German): "The file was successfully saved as <path>."
print(f'Die Datei wurde erfolgreich als {outputdata_file} gespeichert.')

Die Datei wurde erfolgreich als data/01_preprocessed_merged_dataset_metadata.csv gespeichert.
