In [13]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import warnings
import spacy

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Load spaCy German model for advanced NLP (install if not already)
# Uncomment the following line to download the model if not already installed
!python -m spacy download de_core_news_sm

nlp = spacy.load('de_core_news_sm')


Using device: cpu
Collecting de-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.8.0/de_core_news_sm-3.8.0-py3-none-any.whl (14.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.6/14.6 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')


In [4]:
# Load the training data
training_data = pd.read_csv('./data/nexxt_change_data_for_model_training_updated_branchen.csv')

# Display the first few rows
training_data.head()

Unnamed: 0,date,title,description,location,url,long_description,standort,branchen,mitarbeiter,jahresumsatz,preisvorstellung,international
0,08.11.2024,"Gasthausbrauere, Biergarten, moderner Innengas...",Gasthausbrauerei(Microbrewing) moderne Innenga...,"Bayern / Mittelfranken / Ansbach, Landkreis",https://www.nexxt-change.org/DE/Verkaufsangebo...,\nSchöner Biergarten (bis zu 180 Plätze) und s...,"Bayern > Mittelfranken > Ansbach, Landkreis",Gastgewerbe > Gastronomie,bis 5 Beschäftigte,über 250 - 500 Tsd. Euro,"über 500 Tsd. - 2,5 Mio. Euro",Nein
1,08.11.2024,SaaS-Plattform im Automotive Bereich,Das Unternehmen agiert als etablierter Anbiete...,Schleswig-Holstein / Schleswig-Holstein,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nMit einer beachtlichen Expertise in der Entw...,Schleswig-Holstein > Schleswig-Holstein,Information und Kommunikation > Dienstleistung...,6 - 10 Beschäftigte,"über 500 Tsd. - 2,5 Mio. Euro",Nicht veröffentlicht,Nein
2,08.11.2024,"Nachfolge / Teilhaber für mittelständisches, f...",Teilhaber bzw. Käufer für wirtschaftlich erfol...,Baden-Württemberg / Karlsruhe / Heidelberg,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nWir sind ein mittelständisches Familienunter...,Baden-Württemberg > Karlsruhe > Heidelberg,Baugewerbe > Hoch- und Tiefbau,21 - 50 Beschäftigte,"über 2,5 Mio. Euro",Nicht veröffentlicht,Nein
3,08.11.2024,PREMIUM-DESTILLERIE,Wir suchen eine Nachfolge für unsere vielfach ...,Berlin / Berlin / Berlin,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nWir suchen eine Nachfolge für unsere vielfac...,Berlin > Berlin > Berlin,Verarbeitendes Gewerbe > Herstellung von Nahru...,bis 5 Beschäftigte,über 250 - 500 Tsd. Euro,"über 500 Tsd. - 2,5 Mio. Euro",Nein
4,08.11.2024,"Startup Speziallösung für Wallbox-, Elektro,- ...",Das Unternehmen bietet eine Kombination von Ph...,Nordrhein-Westfalen,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nDas Unternehmen bietet eine Kombination von ...,Nordrhein-Westfalen,Handwerk > Ausbaugewerbe > Elektrotechniker; G...,6 - 10 Beschäftigte,"über 500 Tsd. - 2,5 Mio. Euro",Nicht veröffentlicht,Nein


In [15]:
# Fill NaN values
training_data.fillna('', inplace=True)

# Preprocess text fields
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to relevant text columns
text_columns = ['title', 'description', 'long_description']
for col in text_columns:
    training_data[col] = training_data[col].apply(preprocess_text)

# Combine text fields to create a single input for the model
training_data['combined_text'] = training_data['title'] + ' ' + training_data['description'] + ' ' + training_data['long_description']

# Display the preprocessed data
training_data[['combined_text', 'branchen']].head()


Unnamed: 0,combined_text,branchen
0,gasthausbrauere biergarten moderner innengastr...,Gastgewerbe > Gastronomie
1,saasplattform im automotive bereich das untern...,Information und Kommunikation > Dienstleistung...
2,nachfolge teilhaber für mittelständisches fam...,Baugewerbe > Hoch- und Tiefbau
3,premiumdestillerie wir suchen eine nachfolge f...,Verarbeitendes Gewerbe > Herstellung von Nahru...
4,startup speziallösung für wallbox elektro und ...,Handwerk > Ausbaugewerbe > Elektrotechniker; G...


In [16]:
# Check if 'branchen' has multiple labels separated by commas
training_data['branchen'] = training_data['branchen'].apply(lambda x: [item.strip() for item in x.split(' > ')] if ' > ' in x else [x.strip()])

# Initialize MultiLabelBinarizer for multi-label encoding
mlb = MultiLabelBinarizer()
training_data_encoded = training_data.copy()
training_data_encoded = training_data_encoded[training_data_encoded['branchen'].apply(lambda x: x != [''])]  # Remove rows without 'branchen' labels

# Fit and transform the labels
training_data_encoded['branchen_encoded'] = mlb.fit_transform(training_data_encoded['branchen']).tolist()

# Display the encoded labels
print(f"Classes: {mlb.classes_}")
training_data_encoded[['branchen', 'branchen_encoded']].head()

Classes: ['Abbrucharbeiten und vorbereitende Baustellenarbeiten'
 'Abbrucharbeiten und vorbereitende Baustellenarbeiten; Baugewerbe'
 'Anbringen von Stuckaturen, Gipserei und Verputzerei; Baugewerbe'
 'Architektur- und Ingenieurbüros'
 'Architektur- und Ingenieurbüros; Baugewerbe'
 'Architektur- und Ingenieurbüros; Dienstleistung'
 'Architektur- und Ingenieurbüros; Gastgewerbe'
 'Architektur- und Ingenieurbüros; Verarbeitendes Gewerbe' 'Augenoptiker'
 'Augenoptiker; Handwerk' 'Augenoptiker; Verarbeitendes Gewerbe' 'Ausbau'
 'Ausbau; Baugewerbe' 'Ausbau; Handwerk' 'Ausbau; Verarbeitendes Gewerbe'
 'Ausbaugewerbe' 'Ausbaugewerbe; Baugewerbe' 'Ausbaugewerbe; Handwerk'
 'Ausbaugewerbe; Verarbeitendes Gewerbe' 'Back- und Teigwaren'
 'Back- und Teigwaren; Handel' 'Back- und Teigwaren; Handwerk'
 'Baugewerbe' 'Baugewerbe; Baugewerbe'
 'Baugewerbe; Grundstücks- und Wohnungswesen'
 'Baugewerbe; Grundstücks- und Wohnungswesen; Dienstleistung'
 'Baugewerbe; Grundstücks- und Wohnungswesen; Handwer

Unnamed: 0,branchen,branchen_encoded
0,"[Gastgewerbe, Gastronomie]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Information und Kommunikation, Dienstleistung...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Baugewerbe, Hoch- und Tiefbau]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Verarbeitendes Gewerbe, Herstellung von Nahru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[Handwerk, Ausbaugewerbe, Elektrotechniker; Gr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [17]:
from transformers import TrainingArguments, Trainer
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Define the model name
model_name = 'bert-base-german-dbmdz-uncased'

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

# Tokenize the texts
def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=256)

train_texts, val_texts, train_labels, val_labels = train_test_split(
    training_data_encoded['combined_text'].tolist(),
    training_data_encoded['branchen_encoded'].tolist(),
    test_size=0.2,
    random_state=42,
    stratify=training_data_encoded['branchen_encoded']
)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create a custom Dataset class
import torch

class IndustryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item
    
    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = IndustryDataset(train_encodings, train_labels)
val_dataset = IndustryDataset(val_encodings, val_labels)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-dbmdz-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.

In [3]:
# Step 1: Install Required Libraries
# Run this cell first to install necessary packages

# Step 2: Import Libraries
import pandas as pd
import numpy as np
import re
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
import warnings
import spacy
from iterstrat.ml_stratifiers import MultilabelStratifiedShuffleSplit

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

# Check if GPU is available
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f'Using device: {device}')

# Load spaCy German model for advanced NLP
nlp = spacy.load('de_core_news_sm')

# Step 3: Load and Explore Data
# Replace 'training_data.csv' with your actual file path
training_data = pd.read_csv('./data/nexxt_change_data_for_model_training_updated_branchen.csv')

# Display the first few rows
display(training_data.head())

# Step 4: Data Preprocessing
# Fill NaN values
training_data.fillna('', inplace=True)

# Preprocess text fields
def preprocess_text(text):
    text = str(text).lower()
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text

# Apply preprocessing to relevant text columns
text_columns = ['title', 'description', 'long_description']
for col in text_columns:
    training_data[col] = training_data[col].apply(preprocess_text)

# Combine text fields to create a single input for the model
training_data['combined_text'] = training_data['title'] + ' ' + training_data['description'] + ' ' + training_data['long_description']

# Display the preprocessed data
display(training_data[['combined_text', 'branchen']].head())

# Step 5: Encode Labels
# Check if 'branchen' has multiple labels separated by commas
training_data['branchen'] = training_data['branchen'].apply(
    lambda x: [item.strip() for item in x.split('>')] if '>' in x else [x.strip()] if x.strip() else []
)

# Initialize MultiLabelBinarizer for multi-label encoding
mlb = MultiLabelBinarizer()
training_data_encoded = training_data.copy()

# Remove rows without 'branchen' labels
training_data_encoded = training_data_encoded[training_data_encoded['branchen'].apply(lambda x: len(x) > 0)]

# Fit and transform the labels
training_data_encoded['branchen_encoded'] = list(mlb.fit_transform(training_data_encoded['branchen']))

# Display the encoded labels
print(f"Classes: {mlb.classes_}")
display(training_data_encoded[['branchen', 'branchen_encoded']].head())

# Ensure that each class has at least two samples
label_counts = np.sum(training_data_encoded['branchen_encoded'].tolist(), axis=0)
for cls, count in zip(mlb.classes_, label_counts):
    print(f"Class '{cls}' has {count} samples.")

# Remove classes with fewer than 2 samples
classes_to_keep = [cls for cls, count in zip(mlb.classes_, label_counts) if count >= 2]
print(f"Classes to keep: {classes_to_keep}")

# Reinitialize MultiLabelBinarizer with the kept classes
mlb = MultiLabelBinarizer(classes=classes_to_keep)
training_data_encoded['branchen_encoded'] = list(mlb.fit_transform(training_data_encoded['branchen']))

print(f"Updated Classes: {mlb.classes_}")

# Remove samples that have no remaining classes after filtering
training_data_encoded = training_data_encoded[training_data_encoded['branchen_encoded'].map(sum) > 0]

# Step 6: Prepare Dataset for Training
# Define the model name
model_name = 'bert-base-german-dbmdz-uncased'

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification"
)

# Prepare data and labels
X = training_data_encoded['combined_text'].tolist()
y = training_data_encoded['branchen_encoded'].tolist()

# Initialize the multi-label stratified splitter
msss = MultilabelStratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=42)

# Perform the split
for train_index, val_index in msss.split(X, y):
    train_texts = [X[i] for i in train_index]
    val_texts = [X[i] for i in val_index]
    train_labels = [y[i] for i in train_index]
    val_labels = [y[i] for i in val_index]

print(f'Training samples: {len(train_texts)}')
print(f'Validation samples: {len(val_texts)}')

# Tokenize the texts
def tokenize_function(texts):
    return tokenizer(texts, padding='max_length', truncation=True, max_length=256)

train_encodings = tokenize_function(train_texts)
val_encodings = tokenize_function(val_texts)

# Create a custom Dataset class
class IndustryDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx]).float()
        return item

    def __len__(self):
        return len(self.labels)

# Create datasets
train_dataset = IndustryDataset(train_encodings, train_labels)
val_dataset = IndustryDataset(val_encodings, val_labels)

# Step 7: Fine-Tune the Model
# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,              # Adjust based on your dataset size and overfitting
    per_device_train_batch_size=16,  # Adjust based on your GPU memory
    per_device_eval_batch_size=32,
    warmup_steps=500,                # Number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # Strength of weight decay
    logging_dir='./logs',
    logging_steps=100,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    metric_for_best_model='f1',
    greater_is_better=True
)

# Define evaluation metrics
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    sigmoid = torch.nn.Sigmoid()
    probs = sigmoid(torch.tensor(logits))
    preds = (probs > 0.5).int().numpy()
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted', zero_division=0)
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()

# Step 8: Evaluate the Model
# Evaluate the model
results = trainer.evaluate()

print(f"Validation Accuracy: {results['eval_accuracy']:.4f}")
print(f"Validation Precision: {results['eval_precision']:.4f}")
print(f"Validation Recall: {results['eval_recall']:.4f}")
print(f"Validation F1 Score: {results['eval_f1']:.4f}")

# Step 9: Save the Fine-Tuned Model
# Save the model and tokenizer
model.save_pretrained('./fine_tuned_bert_industry')
tokenizer.save_pretrained('./fine_tuned_bert_industry')


Using device: cpu


Unnamed: 0,date,title,description,location,url,long_description,standort,branchen,mitarbeiter,jahresumsatz,preisvorstellung,international
0,08.11.2024,"Gasthausbrauere, Biergarten, moderner Innengas...",Gasthausbrauerei(Microbrewing) moderne Innenga...,"Bayern / Mittelfranken / Ansbach, Landkreis",https://www.nexxt-change.org/DE/Verkaufsangebo...,\nSchöner Biergarten (bis zu 180 Plätze) und s...,"Bayern > Mittelfranken > Ansbach, Landkreis",Gastgewerbe > Gastronomie,bis 5 Beschäftigte,über 250 - 500 Tsd. Euro,"über 500 Tsd. - 2,5 Mio. Euro",Nein
1,08.11.2024,SaaS-Plattform im Automotive Bereich,Das Unternehmen agiert als etablierter Anbiete...,Schleswig-Holstein / Schleswig-Holstein,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nMit einer beachtlichen Expertise in der Entw...,Schleswig-Holstein > Schleswig-Holstein,Information und Kommunikation > Dienstleistung...,6 - 10 Beschäftigte,"über 500 Tsd. - 2,5 Mio. Euro",Nicht veröffentlicht,Nein
2,08.11.2024,"Nachfolge / Teilhaber für mittelständisches, f...",Teilhaber bzw. Käufer für wirtschaftlich erfol...,Baden-Württemberg / Karlsruhe / Heidelberg,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nWir sind ein mittelständisches Familienunter...,Baden-Württemberg > Karlsruhe > Heidelberg,Baugewerbe > Hoch- und Tiefbau,21 - 50 Beschäftigte,"über 2,5 Mio. Euro",Nicht veröffentlicht,Nein
3,08.11.2024,PREMIUM-DESTILLERIE,Wir suchen eine Nachfolge für unsere vielfach ...,Berlin / Berlin / Berlin,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nWir suchen eine Nachfolge für unsere vielfac...,Berlin > Berlin > Berlin,Verarbeitendes Gewerbe > Herstellung von Nahru...,bis 5 Beschäftigte,über 250 - 500 Tsd. Euro,"über 500 Tsd. - 2,5 Mio. Euro",Nein
4,08.11.2024,"Startup Speziallösung für Wallbox-, Elektro,- ...",Das Unternehmen bietet eine Kombination von Ph...,Nordrhein-Westfalen,https://www.nexxt-change.org/DE/Verkaufsangebo...,\nDas Unternehmen bietet eine Kombination von ...,Nordrhein-Westfalen,Handwerk > Ausbaugewerbe > Elektrotechniker; G...,6 - 10 Beschäftigte,"über 500 Tsd. - 2,5 Mio. Euro",Nicht veröffentlicht,Nein


Unnamed: 0,combined_text,branchen
0,gasthausbrauere biergarten moderner innengastr...,Gastgewerbe > Gastronomie
1,saasplattform im automotive bereich das untern...,Information und Kommunikation > Dienstleistung...
2,nachfolge teilhaber für mittelständisches fam...,Baugewerbe > Hoch- und Tiefbau
3,premiumdestillerie wir suchen eine nachfolge f...,Verarbeitendes Gewerbe > Herstellung von Nahru...
4,startup speziallösung für wallbox elektro und ...,Handwerk > Ausbaugewerbe > Elektrotechniker; G...


Classes: ['Abbrucharbeiten und vorbereitende Baustellenarbeiten'
 'Abbrucharbeiten und vorbereitende Baustellenarbeiten; Baugewerbe'
 'Anbringen von Stuckaturen, Gipserei und Verputzerei; Baugewerbe'
 'Architektur- und Ingenieurbüros'
 'Architektur- und Ingenieurbüros; Baugewerbe'
 'Architektur- und Ingenieurbüros; Dienstleistung'
 'Architektur- und Ingenieurbüros; Gastgewerbe'
 'Architektur- und Ingenieurbüros; Verarbeitendes Gewerbe' 'Augenoptiker'
 'Augenoptiker; Handwerk' 'Augenoptiker; Verarbeitendes Gewerbe' 'Ausbau'
 'Ausbau; Baugewerbe' 'Ausbau; Handwerk' 'Ausbau; Verarbeitendes Gewerbe'
 'Ausbaugewerbe' 'Ausbaugewerbe; Baugewerbe' 'Ausbaugewerbe; Handwerk'
 'Ausbaugewerbe; Verarbeitendes Gewerbe' 'Back- und Teigwaren'
 'Back- und Teigwaren; Handel' 'Back- und Teigwaren; Handwerk'
 'Baugewerbe' 'Baugewerbe; Baugewerbe'
 'Baugewerbe; Grundstücks- und Wohnungswesen'
 'Baugewerbe; Grundstücks- und Wohnungswesen; Dienstleistung'
 'Baugewerbe; Grundstücks- und Wohnungswesen; Handwer

Unnamed: 0,branchen,branchen_encoded
0,"[Gastgewerbe, Gastronomie]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,"[Information und Kommunikation, Dienstleistung...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,"[Baugewerbe, Hoch- und Tiefbau]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,"[Verarbeitendes Gewerbe, Herstellung von Nahru...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,"[Handwerk, Ausbaugewerbe, Elektrotechniker; Gr...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


Class 'Abbrucharbeiten und vorbereitende Baustellenarbeiten' has 3 samples.
Class 'Abbrucharbeiten und vorbereitende Baustellenarbeiten; Baugewerbe' has 4 samples.
Class 'Anbringen von Stuckaturen, Gipserei und Verputzerei; Baugewerbe' has 1 samples.
Class 'Architektur- und Ingenieurbüros' has 33 samples.
Class 'Architektur- und Ingenieurbüros; Baugewerbe' has 5 samples.
Class 'Architektur- und Ingenieurbüros; Dienstleistung' has 5 samples.
Class 'Architektur- und Ingenieurbüros; Gastgewerbe' has 1 samples.
Class 'Architektur- und Ingenieurbüros; Verarbeitendes Gewerbe' has 2 samples.
Class 'Augenoptiker' has 29 samples.
Class 'Augenoptiker; Handwerk' has 2 samples.
Class 'Augenoptiker; Verarbeitendes Gewerbe' has 1 samples.
Class 'Ausbau' has 232 samples.
Class 'Ausbau; Baugewerbe' has 7 samples.
Class 'Ausbau; Handwerk' has 11 samples.
Class 'Ausbau; Verarbeitendes Gewerbe' has 1 samples.
Class 'Ausbaugewerbe' has 597 samples.
Class 'Ausbaugewerbe; Baugewerbe' has 7 samples.
Class 'A

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-german-dbmdz-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training samples: 3923
Validation samples: 973


  0%|          | 0/1230 [00:00<?, ?it/s]

{'loss': 0.6228, 'grad_norm': 0.4188224673271179, 'learning_rate': 1e-05, 'epoch': 0.41}
{'loss': 0.321, 'grad_norm': 0.19975623488426208, 'learning_rate': 2e-05, 'epoch': 0.81}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.11216297745704651, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 14.7391, 'eval_samples_per_second': 66.015, 'eval_steps_per_second': 2.103, 'epoch': 1.0}
{'loss': 0.1167, 'grad_norm': 0.07846853137016296, 'learning_rate': 3e-05, 'epoch': 1.22}
{'loss': 0.0611, 'grad_norm': 0.042384643107652664, 'learning_rate': 4e-05, 'epoch': 1.63}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.0396704226732254, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 14.3526, 'eval_samples_per_second': 67.793, 'eval_steps_per_second': 2.16, 'epoch': 2.0}
{'loss': 0.0441, 'grad_norm': 0.03668174147605896, 'learning_rate': 5e-05, 'epoch': 2.03}
{'loss': 0.0373, 'grad_norm': 0.033823393285274506, 'learning_rate': 4.3150684931506855e-05, 'epoch': 2.44}
{'loss': 0.0345, 'grad_norm': 0.03129350766539574, 'learning_rate': 3.63013698630137e-05, 'epoch': 2.85}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.03299378603696823, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 14.3734, 'eval_samples_per_second': 67.695, 'eval_steps_per_second': 2.157, 'epoch': 3.0}
{'loss': 0.0335, 'grad_norm': 0.028311941772699356, 'learning_rate': 2.945205479452055e-05, 'epoch': 3.25}
{'loss': 0.0327, 'grad_norm': 0.026225728914141655, 'learning_rate': 2.2602739726027396e-05, 'epoch': 3.66}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.0316026508808136, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 14.3918, 'eval_samples_per_second': 67.608, 'eval_steps_per_second': 2.154, 'epoch': 4.0}
{'loss': 0.0317, 'grad_norm': 0.02917182631790638, 'learning_rate': 1.5753424657534248e-05, 'epoch': 4.07}
{'loss': 0.0315, 'grad_norm': 0.024271300062537193, 'learning_rate': 8.904109589041095e-06, 'epoch': 4.47}
{'loss': 0.0318, 'grad_norm': 0.028967181220650673, 'learning_rate': 2.054794520547945e-06, 'epoch': 4.88}


  0%|          | 0/31 [00:00<?, ?it/s]

{'eval_loss': 0.0312691368162632, 'eval_accuracy': 0.0, 'eval_precision': 0.0, 'eval_recall': 0.0, 'eval_f1': 0.0, 'eval_runtime': 14.6412, 'eval_samples_per_second': 66.456, 'eval_steps_per_second': 2.117, 'epoch': 5.0}
{'train_runtime': 999.7523, 'train_samples_per_second': 19.62, 'train_steps_per_second': 1.23, 'train_loss': 0.11450654932153903, 'epoch': 5.0}


  0%|          | 0/31 [00:00<?, ?it/s]

Validation Accuracy: 0.0000
Validation Precision: 0.0000
Validation Recall: 0.0000
Validation F1 Score: 0.0000


('./fine_tuned_bert_industry/tokenizer_config.json',
 './fine_tuned_bert_industry/special_tokens_map.json',
 './fine_tuned_bert_industry/vocab.txt',
 './fine_tuned_bert_industry/added_tokens.json',
 './fine_tuned_bert_industry/tokenizer.json')

In [None]:

# Step 10: Predict Industry Labels for New Data
# Load the fine-tuned model and tokenizer
fine_tuned_model = AutoModelForSequenceClassification.from_pretrained('./fine_tuned_bert_industry').to(device)
fine_tuned_tokenizer = AutoTokenizer.from_pretrained('./fine_tuned_bert_industry')

# Function to predict industries
def predict_industry(texts, model, tokenizer, threshold=0.5, batch_size=32):
    model.eval()
    predictions = []
    with torch.no_grad():
        for i in tqdm(range(0, len(texts), batch_size), desc="Predicting"):
            batch_texts = texts[i:i+batch_size]
            encodings = tokenizer(batch_texts, padding=True, truncation=True, max_length=256, return_tensors='pt')
            input_ids = encodings['input_ids'].to(device)
            attention_mask = encodings['attention_mask'].to(device)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            probs = torch.sigmoid(logits)
            preds = (probs > threshold).int().cpu().numpy()
            predictions.extend(preds)
    return predictions

# Load new data for prediction
# Replace 'new_data.csv' with your actual file path
new_data = pd.read_csv('./data/dejuna_data_to_label_industry - buyer dejuna.csv')

# Preprocess new data
for col in text_columns:
    new_data[col] = new_data[col].apply(preprocess_text)
new_data['combined_text'] = new_data['title'] + ' ' + new_data['description'] + ' ' + new_data['long_description']

# Predict industries
new_texts = new_data['combined_text'].tolist()
predicted_labels = predict_industry(new_texts, fine_tuned_model, fine_tuned_tokenizer, threshold=0.7)

# Decode labels
predicted_industries = mlb.inverse_transform(predicted_labels)

# Assign predicted industries to the DataFrame
new_data['predicted_industries'] = predicted_industries

# Define sub-industries mapping (ensure it covers all main industries)
industry_subindustry_map = {
    'Bauunternehmen': ['Erdbau', 'Tiefbau', 'Straßenbau', 'Kanalbau'],
    'Gastronomie': ['Gasthausbrauerei', 'Innengastronomie', 'Biergarten'],
    'Gesundheitswesen': ['Physiotherapie', 'Medizinische Versorgung'],
    'Ingenieurdienstleistungen': ['Gebäudetechnik', 'Brandschutztechnik'],
    'Technologie': ['SaaS', 'Automotive', 'Softwareentwicklung']
    # Add more industries and sub-industries as needed
}

# Function to assign sub-industries based on predicted industries and text
def assign_subindustry(row):
    industries = row['predicted_industries']
    if not industries:
        return []
    subindustries = []
    text = row['combined_text']
    for industry in industries:
        sub_list = industry_subindustry_map.get(industry, [])
        for sub in sub_list:
            if sub.lower() in text:
                subindustries.append(sub)
    return subindustries

# Apply the function
new_data['predicted_subindustries'] = new_data.apply(assign_subindustry, axis=1)

# Save the results
new_data.to_csv('new_data_with_predicted_industry.csv', index=False)

# Display the first few predictions
display(new_data[['title', 'predicted_industries', 'predicted_subindustries']].head())


('4.46.0', '1.1.1')

In [19]:
# Model 3: Transformer-based Model (BERT)
# Encode target labels as integers

# Get unique labels from the training set
train_labels_set = set(y_train)

# Keep only the test samples that have labels seen in the training set
filtered_test_indices = y_test.isin(train_labels_set)
X_test_filtered = X_test[filtered_test_indices]
y_test_filtered = y_test[filtered_test_indices]

label_encoder = LabelEncoder()

# Save the label encoder classes
# Encode the filtered labels
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test_filtered)

# Tokenizer and Model Initialization
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
bert_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-german-cased", num_labels=len(label_encoder.classes_))


# Tokenizing Data (filtered X_test)
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
test_encodings = tokenizer(list(X_test_filtered), truncation=True, padding=True, max_length=128)


# Creating TensorFlow Datasets
def create_tf_dataset(encodings, labels):
    return tf.data.Dataset.from_tensor_slices((dict(encodings), labels)).shuffle(1000).batch(16)

# Creating TensorFlow Datasets
train_dataset = create_tf_dataset(train_encodings, y_train_encoded)
test_dataset = create_tf_dataset(test_encodings, y_test_encoded)
np.save("label_encoder_classes.npy", label_encoder.classes_)


Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [20]:
# Compiling and Training BERT Model
bert_model.compile(optimizer='adam',
                   loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
                   metrics=['accuracy'])



bert_model.fit(train_dataset, epochs=3, validation_data=test_dataset)

# # Evaluation
test_loss, test_acc = bert_model.evaluate(test_dataset)
print("BERT Test Accuracy:", test_acc)

Epoch 1/3
Epoch 2/3
Epoch 3/3
BERT Test Accuracy: 0.8855140209197998


In [21]:
data_to_predict = pd.read_csv("./data/dejuna_data_to_label_industry - buyer dejuna.csv")


In [22]:
# Preprocess Data
data_to_predict['combined_text'] = data_to_predict['title'].fillna('') + ' ' + data_to_predict['description'].fillna('') + ' ' + data_to_predict['long_description'].fillna('')
data_to_predict['combined_text'] = data_to_predict['combined_text'].apply(lambda x: clean_text(str(x)))



In [23]:
# Load the tokenizer and the trained BERT model
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-german-cased")
bert_model = TFAutoModelForSequenceClassification.from_pretrained("distilbert-base-german-cased", num_labels=3)  # Update `num_labels` based on your training

# Assuming label encoder from training
label_encoder = LabelEncoder()
label_encoder.classes_ = np.load("label_encoder_classes.npy", allow_pickle=True)  # Load saved label classes from training

# Tokenizing Data
encodings = tokenizer(list(data_to_predict['combined_text']), truncation=True, padding=True, max_length=128)

# Creating TensorFlow Dataset
def create_tf_dataset(encodings):
    return tf.data.Dataset.from_tensor_slices(dict(encodings)).batch(16)

dataset = create_tf_dataset(encodings)

# Get Predictions
predictions = bert_model.predict(dataset).logits
predicted_labels = np.argmax(predictions, axis=1)
predicted_categories = label_encoder.inverse_transform(predicted_labels)

# Add Predictions to DataFrame
data['predicted_category'] = predicted_categories

# Display or Save Results
print(data[['title', 'predicted_category']])
data.to_csv("./data/dejuna_data_with_predictions.csv", index=False)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 



ValueError: Length of values (51) does not match length of index (5024)

In [None]:

# Summary
# print("\nSummary of Model Performances:")
# print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_log_reg))
# print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
# print("BERT Test Accuracy:", test_acc)
