## Description
In this notebook, keywords are extracted from the German title and description of each entry. The extracted keywords are then combined with the existing German keywords into one entry.

#### Improvements to consider
- Entries without any valid text are deleted, so currently, only German datasets are retained.
- The keyword extractor supports other languages, which should be included later.
- Columns that are no longer used could be dropped at the end.

The code was created with the assistance of ChatGPT-4.

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
import nltk
import pandas as pd
import re
import ast

# Make sure the NLTK stop-word corpus is available before it is used below
nltk.download('stopwords')

# Input and output CSV paths of this notebook
inputdata_file = 'data/01_preprocessed_data.csv'
outputdata_file ='data/02_extracted_keywords_data.csv'

df = pd.read_csv(inputdata_file, low_memory=False)


def _parse_keyword_cell(cell):
    """Convert one raw `dataset_keyword_DE` cell into a Python object.

    Strings are evaluated as Python literals, missing values become an
    empty list, and anything else is passed through unchanged.
    """
    if isinstance(cell, str):
        return ast.literal_eval(cell)
    if pd.isna(cell):
        return []
    return cell


# The keyword column is stored as text in the CSV; turn it back into Python objects
df['dataset_keyword_DE'] = df['dataset_keyword_DE'].apply(_parse_keyword_cell)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\haabs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
# function to extract keywords (the output of the score is currently disabled)

def extract_keywords_tfidf(docs, language='german', top_n=1000):
    """Extract the highest-scoring TF-IDF terms for every document.

    The TF-IDF model is fitted on ``docs`` itself, so the scores of one
    document depend on all other documents passed in the same call.

    Args:
        docs: Iterable of text documents handed to ``TfidfVectorizer.fit_transform``.
        language: Name of the NLTK stop-word list to exclude (default: 'german').
        top_n: Maximum number of keywords returned per document.

    Returns:
        A list with one entry per document; each entry is a list of terms
        ordered by descending TF-IDF score. The scores themselves are not
        returned.
    """
    # A token starts with a letter (including German umlauts / ß) and may
    # continue with letters, digits, '.' or '-', so compounds joined by
    # '-' or '.' stay together as one term.
    token_pattern = r'\b[a-zA-ZäöüÄÖÜß][a-zA-Z0-9äöüÄÖÜß\.-]*\b'

    vectorizer = TfidfVectorizer(
        stop_words=stopwords.words(language),
        lowercase=True,
        token_pattern=token_pattern,
    )
    tfidf_matrix = vectorizer.fit_transform(docs)
    vocabulary = vectorizer.get_feature_names_out()

    def _top_terms(sparse_row):
        # Only the entries stored in the sparse row are ranked
        entries = sparse_row.tocoo()
        scored_terms = [
            (vocabulary[col_idx], weight)
            for col_idx, weight in zip(entries.col, entries.data)
        ]
        scored_terms.sort(key=lambda pair: pair[1], reverse=True)
        return [term for term, _ in scored_terms[:top_n]]

    n_docs = tfidf_matrix.shape[0]
    return [_top_terms(tfidf_matrix[row_idx]) for row_idx in range(n_docs)]

In [19]:
# Text columns from which keywords are extracted
text_columns = [
    'dataset_title_DE_preprocessed',
    'dataset_description_DE_preprocessed',
]  # <- Adjust column names if needed

# For every text column, store the extracted keywords in a new
# '<column>_keywords' column; missing texts are treated as empty documents
for col in text_columns:
    keyword_column = f'{col}_keywords'
    df[keyword_column] = extract_keywords_tfidf(df[col].fillna(''), language='german')

In [20]:
# Function to preprocess the keywords and combine those, also duplicate keywords are dropped!
# Combines title, description, and keyword list into a single cleaned string of unique keywords. Removes punctuation, converts to lowercase, and eliminates duplicates.

def generate_cleaned_keyword_string(title_DE, description_DE, keyword_DE):
    """Merge title, description and keywords into one cleaned keyword string.

    The three inputs are concatenated, the punctuation characters
    ``. , - ; : ! ' ? [ ]`` are removed, the text is lowercased and
    duplicate words are dropped while the order of first occurrence is kept.

    NOTE: a later cell of this notebook defines another function with the
    same name; once that cell has run, it shadows this definition.

    Args:
        title_DE: Title text; missing values (None/NaN) count as empty.
        description_DE: Description text; missing values count as empty.
        keyword_DE: Either a list/tuple of keywords or the string
            representation of a Python list (e.g. "['a', 'b']"). Anything
            else, including unparsable strings, counts as no keywords.

    Returns:
        A single space-separated, lowercase string of unique words.
    """
    # Convert NaNs to empty strings
    title_DE = str(title_DE) if pd.notna(title_DE) else ''
    description_DE = str(description_DE) if pd.notna(description_DE) else ''

    # Accept an already-parsed list directly. Passing a list through
    # pd.notna()/ast.literal_eval() raises, which the former bare `except`
    # swallowed, silently discarding all keywords.
    if isinstance(keyword_DE, (list, tuple)):
        keyword_list = list(keyword_DE)
    elif isinstance(keyword_DE, str):
        try:
            parsed = ast.literal_eval(keyword_DE)
        except (ValueError, SyntaxError, TypeError):
            # Not a valid Python literal -> treat as "no keywords"
            parsed = []
        keyword_list = parsed if isinstance(parsed, list) else []
    else:
        # None, NaN or any other unsupported type
        keyword_list = []

    # str() guards against non-string list elements (e.g. numbers)
    keyword_text = ' '.join(str(keyword) for keyword in keyword_list)
    combined_text = f"{title_DE} {description_DE} {keyword_text}"

    # Remove punctuation and lowercase BEFORE de-duplicating, so variants
    # such as "Wasser" and "wasser," collapse into a single keyword
    text = re.sub(r'[.,\-;:!\'?\[\]]', '', combined_text).lower()

    # split() also normalises whitespace; dict.fromkeys keeps first-seen order
    unique_words = dict.fromkeys(text.split())

    return ' '.join(unique_words)

In [None]:
def generate_cleaned_keyword_string(title_DE, description_DE, keywords_DE):
    """Combine three keyword lists into one list without duplicates.

    Despite its name, this function returns a list, not a string.

    Args:
        title_DE: List of keywords extracted from the title.
        description_DE: List of keywords extracted from the description.
        keywords_DE: List of the keywords already provided for the entry.

    Returns:
        A list holding every keyword once, in order of first occurrence
        (title first, then description, then the provided keywords).
    """
    # combine the three lists to one
    combined = title_DE + description_DE + keywords_DE
    # remove duplicates; dict.fromkeys keeps insertion order, so the result
    # is identical on every run (set() ordering of strings varies between
    # interpreter sessions and made the written CSV non-reproducible)
    return list(dict.fromkeys(combined))

def _combine_row_keywords(row):
    """Merge the title, description and provided keyword lists of one row."""
    return generate_cleaned_keyword_string(
        row['dataset_title_DE_preprocessed_keywords'],
        row['dataset_description_DE_preprocessed_keywords'],
        row['dataset_keyword_DE'],
    )


# Row-wise combination of all three keyword sources into a single column
df['combined_DE_keywords'] = df.apply(_combine_row_keywords, axis=1)

In [25]:
# Persist the enriched dataframe as CSV; the row index is left out of the file
df.to_csv(path_or_buf=outputdata_file, index=False)

# Confirm where the result was written
print(f'The file has been successfully saved as {outputdata_file}.')

The file has been successfully saved as data/02_extracted_keywords_data.csv.
