In [9]:
import pandas as pd
import numpy as np
from collections import Counter
import re

In [10]:
import chardet

def detect_file_encoding(file_path):
    """Guess the text encoding of a file from its raw bytes.

    The whole file is read in binary mode and handed to ``chardet.detect``.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        The value stored under the ``'encoding'`` key of the detection result.
    """
    with open(file_path, 'rb') as handle:
        payload = handle.read()

    # Only the bytes are needed from here on, so detection runs after the
    # handle has been closed.
    guess = chardet.detect(payload)
    return guess['encoding']

# Detect the encoding up front instead of assuming one; the result is printed
# for the reader and then passed straight to pandas below.
file_path = 'Dataset/production_comments.txt'
encoding = detect_file_encoding(file_path)
print(f"Detected encoding: {encoding}")

# Parse the file as tab-separated values using the detected encoding.
df = pd.read_csv(file_path, sep='\t', encoding=encoding)


Detected encoding: utf-8


In [31]:
# Preview the first rows to sanity-check that the tab-separated columns parsed as expected.
df.head()

Unnamed: 0,Schicht,Maschinen-ID,Produktionslinie,Betreiber,Kommentar
0,Schicht1,M01,Linie1,Lisa Fischer,"Die Maschine arbeitete optimal, alle Parameter..."
1,Schicht1,M02,Linie2,Tobias Schulz,"Häufige Kalibrierungen waren notwendig, was di..."
2,Schicht1,M03,Linie3,Anna Braun,Die Qualität der produzierten Teile war überdu...
3,Schicht2,M04,Linie1,Frank Bauer,Ein unerwarteter Stillstand aufgrund eines Sof...
4,Schicht2,M05,Linie2,Emma Weber,"Die Maschinen liefen stabil, jedoch gab es ein..."


In [24]:
def process_comments(text):
    """Tokenize a single comment into lowercase words.

    The text is lowercased, every character that is not an ASCII letter, a
    German umlaut / ``ß`` or whitespace is deleted, and the remainder is split
    on whitespace. Characters are deleted rather than replaced by a space, so
    words joined only by punctuation or a hyphen merge into one token
    (``"Software-Fehler"`` -> ``"softwarefehler"``). No stop-word filtering is
    applied.

    NOTE(review): a later cell defines another ``process_comments`` that takes
    a collection of texts and returns strings; once that cell has run, it
    shadows this function.

    Args:
        text: One comment. Non-string values are converted with ``str()``;
            missing values (``None``, ``NaN``, ``pd.NA``) produce no tokens.

    Returns:
        list[str]: The words of the comment in order of appearance; empty for
        missing or blank input.
    """
    # A missing comment must not be stringified: str(float('nan')) is 'nan'
    # and str(None) is 'None', both of which would be counted as real words.
    if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
        return []

    # Convert to lowercase
    text = str(text).lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-ZäöüßÄÖÜ\s]', '', text)
    # Split into words
    return text.split()


In [25]:
# Flatten the per-comment token lists into one corpus-wide list of words
all_words = [
    word
    for comment in df['Kommentar']
    for word in process_comments(comment)
]

In [26]:
# Deduplicate the tokens and sort them alphabetically for a readable listing
unique_words = sorted(set(all_words))
print(f"Number of unique words: {len(unique_words)}")
print("\nFirst 20 unique words:")
print(unique_words[:20])


Number of unique words: 327

First 20 unique words:
['abhängig', 'ablauf', 'abläufe', 'abnutzungspuren', 'abweichungen', 'adressiert', 'alle', 'alles', 'allgemeinen', 'als', 'alters', 'an', 'analyse', 'analysen', 'anforderungen', 'angehalten', 'anlaufphase', 'anpassungen', 'anspruch', 'anstieg']


In [27]:
# Count how often each token occurs across all comments and keep the ten most
# frequent (word, count) pairs for a quick look.
# NOTE: the tokens were not stop-word filtered, so function words such as
# articles are included in the counts.
word_freq = Counter(all_words)
most_common_words = word_freq.most_common(10)

In [28]:
# Display the ten most frequent (word, count) pairs.
most_common_words

[('die', 71),
 ('der', 30),
 ('war', 25),
 ('ein', 22),
 ('maschine', 15),
 ('produktion', 13),
 ('waren', 12),
 ('qualität', 10),
 ('hat', 10),
 ('ausschuss', 8)]

In [29]:
# Tabulate the counts with the most frequent words first. The index is not
# reset, so each row keeps its pre-sort position as its label.
word_freq_df = (
    pd.DataFrame(word_freq.items(), columns=['Word', 'Frequency'])
    .sort_values('Frequency', ascending=False)
)

# Export the table for use outside the notebook. 'utf-8-sig' prepends a BOM,
# presumably so spreadsheet tools recognise the umlauts as UTF-8.
word_freq_df.to_csv('word_frequencies.csv', index=False, encoding='utf-8-sig')

In [30]:
# Preview the ten most frequent words. The index labels are the pre-sort row
# positions because the frame was sorted without resetting its index.
word_freq_df.head(10)

Unnamed: 0,Word,Frequency
0,die,71
17,der,30
20,war,25
23,ein,22
1,maschine,15
40,produktion,13
6,waren,12
57,hat,10
16,qualität,10
89,ausschuss,8


In [32]:
def process_comments(texts):
    """Clean a collection of comments into stop-word-free strings for BERTopic.

    Each comment is lowercased and stripped of every character that is not an
    ASCII letter, ``ä``/``ö``/``ü``/``ß``, whitespace or a hyphen. Digits are
    therefore removed as well, including those inside tokens such as machine
    IDs. The remaining tokens are dropped if they are stop words or shorter
    than three characters, and the survivors are re-joined with single spaces.

    NOTE(review): ``get_stop_words`` is not imported in any visible cell;
    presumably it comes from the ``stop_words`` package
    (``from stop_words import get_stop_words``) -- confirm and add the import.

    NOTE(review): this definition shadows the earlier single-text
    ``process_comments``. It expects an iterable of comments; passing one bare
    string would iterate over its characters.

    Args:
        texts: Iterable of comments. Non-string values are converted with
            ``str()``; missing values (``None``, ``NaN``, ``pd.NA``) become an
            empty document.

    Returns:
        list[str]: One cleaned string per input comment, in input order.
    """
    # Domain-specific stopwords (German)
    stopwords = set(get_stop_words('german')) | {
        'maschine', 'produktion', 'schicht', 'linie',
        'system', 'parameter', 'prozess', 'teil'
    }

    processed_texts = []
    for text in texts:
        # Keep one output per input, but do not let a missing comment turn
        # into the literal document "nan" / "none".
        if text is None or (pd.api.types.is_scalar(text) and pd.isna(text)):
            processed_texts.append('')
            continue

        # Clean and normalize. The character filter already removes every
        # digit, so no separate number-stripping pass is needed afterwards.
        text = str(text).lower()
        text = re.sub(r'[^a-zA-Zäöüß\s-]', '', text)  # Keep hyphens for compound words

        # Tokenize and filter
        tokens = [
            token for token in text.split()
            if token not in stopwords and len(token) > 2
        ]

        # Rebuild text for BERTopic
        processed_texts.append(' '.join(tokens))

    return processed_texts

In [None]:
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from sklearn.cluster import KMeans

# Build the cleaned documents in this cell so it does not depend on a
# `processed_comments` variable left over from an earlier kernel session.
processed_comments = process_comments(df['Kommentar'])

# Generate embeddings
gbert = SentenceTransformer('deepset/gbert-base')
embeddings = gbert.encode(processed_comments)