# Install and Import
Wordcloud und Transformer laufen nicht mehr im gleichen Environment

In [1]:
!pip install Pillow==9.0.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting Pillow==9.0.0
  Downloading Pillow-9.0.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
[K     |████████████████████████████████| 4.3 MB 30.4 MB/s 
[?25hInstalling collected packages: Pillow
  Attempting uninstall: Pillow
    Found existing installation: Pillow 7.1.2
    Uninstalling Pillow-7.1.2:
      Successfully uninstalled Pillow-7.1.2
Successfully installed Pillow-9.0.0


In [2]:
!pip install reportlab
!python -m spacy download de_core_news_sm
!pip install nltk
!python -m spacy download de_core_news_md
!pip install pyspellchecker
!pip install datasets
!pip install transformers
!pip install fonttools

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting reportlab
  Downloading reportlab-3.6.12-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 15.5 MB/s 
Installing collected packages: reportlab
Successfully installed reportlab-3.6.12
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting de-core-news-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-3.4.0/de_core_news_sm-3.4.0-py3-none-any.whl (14.6 MB)
[K     |████████████████████████████████| 14.6 MB 33.3 MB/s 
Installing collected packages: de-core-news-sm
Successfully installed de-core-news-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('de_core_news_sm')
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [3]:
# Data
import pandas as pd
import numpy as np
import sqlite3
from sklearn.model_selection import train_test_split
import datasets
from datasets import Dataset

# NLP
import regex as re
import spacy
from spellchecker import SpellChecker
from bs4 import SoupStrainer, BeautifulSoup
from bs4.element import Comment

# Modelling
from transformers import pipeline
import os
import torch
from torch import nn
from transformers import pipeline
from transformers import AutoTokenizer, BertTokenizer, DistilBertTokenizer
from transformers import AutoModelForSequenceClassification, RobertaForSequenceClassification
from transformers import BertForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback
from tqdm import tqdm

# Evaluation
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import accuracy_score, balanced_accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import fontTools
import fontTools.subset

In [4]:
# Register `progress_apply` on pandas objects so long-running applies show a tqdm progress bar.
tqdm.pandas()
# Switch off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
pd.options.mode.chained_assignment = None

In [5]:
# get traceback
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [6]:
torch.cuda.is_available()

True

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Data

In [None]:
# Path of the SQLite database on the mounted Google Drive.
DB_CONNECT = "/content/drive/MyDrive/hareg_nlp/03_Data/hareg.db"
sql = '''SELECT * FROM df_firms'''

# Read the complete table, then release the connection again so that no
# handle to the Drive-hosted database file stays open for the rest of the run.
engine = sqlite3.connect(DB_CONNECT)
try:
    df = pd.read_sql(sql, engine)
finally:
    engine.close()
df.head()

## Label

In [None]:
# Lookup table: integer class id -> industry label.
# The id of a label is its position in the list below.
di_int_to_label = dict(enumerate([
    'Telecommunications',
    'Legal Services',
    'Management Consulting',
    'Medical Practice',
    'Consumer Goods',
    'Leisure, Travel & Tourism',
    'Recreational Facilities and Services',
    'Insurance',
    'Financial Services',
    'Real Estate',
    'Construction',
    'Automotive',
    'Marketing and Advertising',
    'Information Technology and Services',
    'Logistics and Supply Chain',
    'Wholesale',
    'Mechanical or Industrial Engineering',
    'Human Resources',
    'Renewables & Environment',
]))

In [None]:
# Lookup table: industry label -> integer class id.
# Built as the exact inverse of di_int_to_label so both tables cannot drift apart.
di_label_to_int = {label: class_id for class_id, label in di_int_to_label.items()}

In [None]:
# Select the mapping direction: label name -> integer id
# (the commented line is the opposite direction).
# change = di_int_to_label
change = di_label_to_int
# Add an integer-coded copy of the label column; the original `label` column is kept as is here.
df['label_int'] = df['label'].replace(change)

## Cleaning

In [None]:
df['gegenstand_raw'] = df['gegenstand']

In [None]:
# Replace the remaining special characters.
def clean_txt(text):
    """Remove apostrophes and collapse every run of non-word characters to one space.

    Args:
        text: input string.

    Returns:
        The cleaned string. Leading/trailing runs of non-word characters
        become a single space each (they are not stripped).
    """
    # Apostrophes are dropped (not replaced by a space) so the parts stay one word.
    text = re.sub(r"'", "", text)
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    text = re.sub(r"\W+", " ", text)
    return text

In [None]:
# Number of whitespace-separated words per text before any processing.
# The maximum input sequence length of 512 tokens for BERT models in classification tasks is not exceeded.
# NOTE(review): this counts words, not tokenizer tokens; one word can be split into
# several sub-word tokens — confirm against the tokenized lengths.
df['word_count'] = df['gegenstand'].str.split().apply(len)
df.word_count.describe()

In [None]:
# Strip special characters from the text column.
df['gegenstand'] = df.gegenstand.apply(clean_txt)
# Replace label names by their integer ids (mapping: di_label_to_int).
df['label'] = df.label.replace(di_label_to_int)
# Placeholder column, initialised with empty strings.
df["tokens"] = ""

In [None]:
# 80/20 train/test split, then 10% of the training part is held out for validation.
# Both splits are shuffled with a fixed seed (42).
# NOTE(review): the split is not stratified by label; with 19 classes a `stratify=`
# argument may give more stable per-class estimates — confirm class sizes first.
train, test = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)
train, valid = train_test_split(train, test_size=0.1, random_state=42, shuffle=True)

In [None]:
# Wrap the three pandas splits as Hugging Face datasets, keyed by split name.
di_data = datasets.DatasetDict({
    split_name: Dataset.from_pandas(frame)
    for split_name, frame in (('train', train), ('test', test), ('valid', valid))
})
ds_train = di_data['train']
ds_test = di_data['test']
ds_valid = di_data['valid']

## Spacy

Spacy StopWord Removal und (Lemmatization) trägt trotz Transformer Modell deutlich zur Verbesserung der Accuracy und F1 Scores bei.

In [None]:
# Code taken over 1:1 from "pds".
nlp = spacy.load("de_core_news_sm", exclude=['tok2vec', 'tagger', 'morphologizer', 'parser', 'attribute_ruler', 'ner', 'senter'])

def opt_preprocess(text):
    """Lower-case `text`, drop stop words and punctuation, and join the remaining lemmas.

    Lemmas containing at least one character outside
    ``[a-zA-Z0-9_À-ÖØ-öø-ÿ]`` are discarded as well.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    kept = []
    for token in nlp(text.lower()):
        if token.is_stop or token.is_punct:
            continue
        candidate = token.lemma_
        # Any hit means the lemma contains a character we do not want to keep.
        if re.findall('[^a-zA-Z0-9_À-ÖØ-öø-ÿ]', candidate):
            continue
        kept.append(candidate)
    return " ".join(kept)

In [None]:
# Run the spaCy-based preprocessing on every text (with progress bar).
df['gegenstand'] = df['gegenstand'].progress_apply(opt_preprocess)

In [None]:
df['gegenstand'] = df.gegenstand.apply(lambda text: text.lower())

## Lemmatization (2)
Beim Betrachten der Daten fällt auf, dass die Lemmatization nicht richtig funktioniert hat. Daher wird diese erneut durchgeführt.
Tatsächlich verschlechtert die korrekte Lemmatization die Vorhersagegenauigkeit um ~ 3% Accuracy/F1-Score. Daher wird diese nun doch nicht mehr ausgeführt.

In [None]:
nlp = spacy.load('de_core_news_md')

def lemma(text):
    """Return the lower-cased lemmas of all tokens of `text`, joined by spaces.

    Stop words and punctuation are NOT filtered here.
    """
    lemmas = [token.lemma_.lower() for token in nlp(text.lower())]
    return " ".join(lemmas)

In [None]:
# df['gegenstand'] = df.progress_apply(lambda row: lemma(row['gegenstand']), axis=1)

## Stop Words (2)

Eine erweiterte Stopword-Liste führt erwartungsgemäß zu schlechteren Ergebnissen. Daher nur einzelne Buchstaben entfernen, was mit einem Regex im Nachhinein betrachtet wohl einfacher gewesen wäre. Doch das Entfernen der Buchstaben führt ebenfalls zu signifikanten Einbußen. Die einzelnen Buchstaben sind höchstwahrscheinlich Gliederungspunkte. Der Transformer kann mit Hilfe dieser Punkte anscheinend den Zusammenhang der einzelnen Absätze besser konstruieren. Daher werden diese nun doch nicht entfernt. Beim Webseiten-Text hingegen verbessert die Entfernung einzelner Buchstaben die Klassifikation.

In [None]:
# Stop word list from https://countwordsfree.com/stopwords/german
# The context manager closes the file even if reading fails.
# NOTE(review): UTF-8 is assumed for the file encoding — confirm.
with open("/content/drive/MyDrive/hareg_nlp/03_Data/stop_words_german.txt", "r", encoding="utf-8") as stopword_file:
    stop_words_data = stopword_file.read()
# split() without arguments splits on any whitespace and never yields empty
# strings (blank lines or a trailing newline previously produced "" entries).
customize_stop_words = stop_words_data.split()
print(customize_stop_words)

In [None]:
# The 26 lower-case ASCII letters, one list entry per letter.
li_letter = list('abcdefghijklmnopqrstuvwxyz')

In [None]:
print(nlp.Defaults.stop_words)

In [None]:
nlp = spacy.load('de_core_news_md')

In [None]:
def remove_stop_words(text):
    """Remove spaCy default stop words and the custom stop words from `text`.

    Args:
        text: whitespace-separated input string.

    Returns:
        `text` without the words whose lower-cased form is a stop word,
        joined by single spaces.
    """
    # Flag the custom words on the vocabulary (kept from the original code so
    # that `token.is_stop` reflects them for later spaCy processing).
    for word in customize_stop_words:
        nlp.vocab[word].is_stop = True

    # Setting `is_stop` on a lexeme does not add the word to
    # `nlp.Defaults.stop_words`, so the custom words have to be merged in
    # explicitly — otherwise they are never filtered below.
    stopwords = nlp.Defaults.stop_words | {word.lower() for word in customize_stop_words}

    return ' '.join(w for w in text.split() if w.lower() not in stopwords)

In [None]:
def remove_letter(text):
    """Remove spaCy default stop words and single ASCII letters from `text`.

    Args:
        text: whitespace-separated input string.

    Returns:
        `text` without the words whose lower-cased form is a stop word or one
        of the letters in `li_letter`, joined by single spaces.
    """
    # Flag the letters on the vocabulary (kept from the original code so that
    # `token.is_stop` reflects them for later spaCy processing).
    for word in li_letter:
        nlp.vocab[word].is_stop = True

    # Setting `is_stop` on a lexeme does not add the letter to
    # `nlp.Defaults.stop_words`; without this union the single letters were
    # never actually removed.
    stopwords = nlp.Defaults.stop_words | set(li_letter)

    return ' '.join(w for w in text.split() if w.lower() not in stopwords)

In [None]:
# df['gegenstand_wordcloud'] = df.progress_apply(lambda row: remove_stop_words(row['gegenstand']), axis=1)

## Spellchecker

In [None]:
spell = SpellChecker(language='de')

def spellcheck(text, verbose=False):
    """Replace words unknown to the German spell checker by its best correction.

    Args:
        text: whitespace-separated input string.
        verbose: if True, print the corrected token list (debugging aid; it is
            off by default so that applying the function to a whole column
            does not flood the notebook output).

    Returns:
        The corrected text, tokens joined by single spaces. Words for which
        the spell checker offers no correction are kept unchanged.
    """
    words = text.split()

    # Look up each misspelled word once instead of rebuilding the whole token
    # list per misspelling.
    corrections = {}
    for word in spell.unknown(words):
        correction = spell.correction(word)
        if correction is not None:
            corrections[word] = correction

    corrected = [corrections.get(w, w) for w in words]

    if verbose:
        print(corrected)
    return " ".join(corrected)

In [None]:
text = "hier ist was fual udn es stimmt nohc mehr nihct"
spellcheck(text)

Der Unternehmensgegenstand ist nahezu fehlerfrei. Daher neigt der Spellchecker zur Verschlimmbesserung und vermindert dadurch Accuracy und F1-Score.

In [None]:
# df['gegenstand'] = df.progress_apply(lambda row: spellcheck(row['gegenstand']), axis=1)

## Preprocessing Ideas

In [None]:
# noch nicht umgesetzte Ideen:
# zusammengesetzte nomen aufsplitten und sowohl die komponenten als auch das urspgl. nomen behalten
# n/bigrams
# enhance sentences
# einzelne englishe wörter nach deutsch Übersetzen

# für html - umlaute welche in html durch code dargestellt werden vor stop word removal wiederherstellen
# sieh dazu auch https://stackoverflow.com/questions/46613734/how-do-i-replace-xc3-etc-with-umlauts
# spellchekcer nutzen, um aus Versehen getrennte Worte nach utf-8 umlaut wieder zusammenzuführen

In [None]:
# Re-open the SQLite database. The statement that would write the frame to the
# table `df_spellcheck` is commented out, so this cell only opens a connection.
DB_CONNECT = '/content/drive/MyDrive/hareg_nlp/03_Data/hareg.db'
engine = sqlite3.connect(DB_CONNECT)
### df.to_sql('df_spellcheck', con=engine, if_exists='fail', index = False, chunksize=10000)

## Tokenization
try to train custom tokenizer 
https://www.youtube.com/watch?v=DJimQynXZsQ
https://www.youtube.com/watch?v=MR8tZm5ViWU&list=PLo2EIpI_JMQvWfQndUesu0nPBAtZ9gP1o&index=55

In [None]:
# distilbert-base-uncased
# Tokenizer belonging to the `distilbert-base-uncased` checkpoint.
# NOTE(review): the texts appear to be German (spaCy `de_*` models are used for
# preprocessing) while this checkpoint name carries no German marker — confirm it is intended.
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

In [None]:
# df['tokens'] = df['gegenstand'].apply(lambda row: tokenizer(row, padding='max_length', truncation=True))
def make_tokens(data):
    """Tokenize one batch of a dataset for `Dataset.map(..., batched=True)`.

    Args:
        data: mapping from column name to a list of values (one batch).

    Returns:
        The tokenizer output (padded to the model's maximum length, truncated).
    """
    # The frames turned into datasets in this notebook carry their text in
    # `gegenstand`; a `text` column is still preferred when it exists so that
    # datasets that do provide it behave exactly as before.
    text_column = 'text' if 'text' in data else 'gegenstand'
    return tokenizer(data[text_column], padding='max_length', truncation=True)

In [None]:
di_data = di_data.map(make_tokens, batched=True)

In [None]:
# Pull the tokenized splits back out of the dataset dict.
ds_train, ds_test, ds_valid = (di_data[split_name] for split_name in ("train", "test", "valid"))

## Class Weights

In [None]:
# Class weight = 1 - relative frequency of the class in the training split.
# reindex() guarantees exactly one weight per label id, in id order, even if a
# class does not occur in `train` (its weight is then 1.0); otherwise the
# weight vector would be shorter than the number of model outputs.
label_counts = train['label'].value_counts().reindex(range(len(di_int_to_label)), fill_value=0)
class_weights = (1 - (label_counts / len(train))).values
# Use the GPU when one is available instead of failing on CPU-only runtimes.
class_weights = torch.from_numpy(class_weights).float().to('cuda' if torch.cuda.is_available() else 'cpu')
class_weights

# new distil-BERT

Vor der Klassifikation könnte zudem ein binäres Klassifikationsmodell eingesetzt werden, um Holding-Gesellschaften herauszufiltern.

## Fine-Tune 

In [None]:
# Load the checkpoint through the Auto class so that the architecture matching
# the checkpoint (DistilBERT) is instantiated. Loading a DistilBERT checkpoint
# into `BertForSequenceClassification` builds a BERT network whose parameter
# names do not match the checkpoint, i.e. the pretrained weights are not used.
# Consequence: BERT-specific attribute access such as `model.bert` does not
# exist on this model; use architecture-agnostic accessors like
# `model.get_input_embeddings()` instead.
model = AutoModelForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=19)

In [None]:
# Training configuration shared by the fine-tuning runs below:
# - checkpoints and outputs go to `trainer/`, logs to `./logs`, logging every step
# - evaluation every 250 steps; at most 5 checkpoints are kept
# - up to 8 epochs with batch size 16 per device and 2 gradient-accumulation steps
# - the metric named 'f1' selects the best model, which is loaded again at the end
training_args = TrainingArguments(output_dir='trainer',
                                  evaluation_strategy='steps',
                                  eval_steps = 250,
                                  save_total_limit = 5,
                                  num_train_epochs=8,
                                  per_device_train_batch_size = 16,
                                  gradient_accumulation_steps = 2,
                                  logging_dir='./logs',
                                  logging_steps=1,
                                  metric_for_best_model = 'f1',
                                  load_best_model_at_end=True)

In [None]:
# Trainer class from https://huggingface.co/docs/transformers/v4.24.0/en/main_classes/trainer#transformers.Trainer
class CustomTrainer(Trainer):
    """Trainer that uses a class-weighted cross-entropy loss (`class_weights`)."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the weighted cross-entropy loss for one batch.

        Args:
            model: the model being trained.
            inputs: batch dict as produced by the data collator; must contain "labels".
            return_outputs: if True, return `(loss, outputs)` instead of only the loss.
            **kwargs: accepted and ignored so that Trainer versions passing
                additional keyword arguments to `compute_loss` keep working.

        Returns:
            The loss tensor, or `(loss, outputs)` when `return_outputs` is True.
        """
        labels = inputs.get("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # One weight per class; moved to the logits' device so the loss also
        # works when model and weights were created on different devices.
        loss_fct = nn.CrossEntropyLoss(weight=class_weights.to(logits.device))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [None]:
def compute_metrics(pred):
    """Metrics callback for the Trainer.

    Args:
        pred: evaluation result providing `label_ids` and `predictions`.

    Returns:
        Dict with accuracy and the weighted-average F1, precision and recall.
    """
    y_true = pred.label_ids
    # Predicted class = index of the largest score along the last axis.
    y_hat = pred.predictions.argmax(-1)
    weighted = precision_recall_fscore_support(y_true, y_hat, average='weighted')
    return {
        'accuracy': accuracy_score(y_true, y_hat),
        'f1': weighted[2],
        'precision': weighted[0],
        'recall': weighted[1],
    }

In [None]:
# Trainer with the class-weighted loss; trains on ds_train and evaluates on ds_valid.
# EarlyStoppingCallback is configured with a patience of 3.
trainer = CustomTrainer(model = model,
                        args = training_args, 
                        train_dataset = ds_train, 
                        eval_dataset = ds_valid, 
                        compute_metrics = compute_metrics,
                        tokenizer = tokenizer,
                        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)])

In [None]:
trainer.train()

In [None]:
trainer.save_model("/content/drive/MyDrive/hareg_nlp/04_Models/dist_bert")

In [None]:
# tokenized input
ds_train[0]

In [None]:
# Show the input (word) embedding matrix.
# get_input_embeddings() works for every transformers model class, whereas an
# attribute path like `model.bert...` only exists on BERT models.
model.get_input_embeddings().weight

## Predict

In [None]:
# Directory the fine-tuned model was saved to by `trainer.save_model`.
path = "/content/drive/MyDrive/hareg_nlp/04_Models/dist_bert"

# Text-classification pipeline on the saved model; inputs are truncated.
# NOTE(review): device=0 presumably selects the first GPU — a CPU-only runtime would need a different value.
pipe = pipeline('text-classification',
    model=path,
    device=0,
    truncation=True)

In [None]:
# Predict every test text; the pipeline returns 'LABEL_<id>', which is reduced to the integer id.
test['label_pred'] = (
    test['gegenstand']
    .progress_apply(lambda txt: pipe(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

## Evaluate

In [None]:
trainer.evaluate()

In [None]:
y_true, y_pred = test['label'], test['label_pred']

# Balanced accuracy plus weighted-average F1 / precision / recall on the test split.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

for template, value in (("Balanced Accuracy: {:.4f}", accuracy),
                        ("F1-score: {:.4f}", f1_metric),
                        ("Precision: {:.4f}", precision),
                        ("Recall: {:.4f}", recall)):
    print(template.format(value))

# new ger-BERT

## Fine-Tune

In [None]:
# The German checkpoint ships its own vocabulary. The datasets tokenized so far
# contain token ids of the previously loaded tokenizer, which do not refer to
# the same word pieces in this model. Therefore: load the matching tokenizer,
# tokenize the splits again and refresh the split variables before training.
GER_CHECKPOINT = 'distilbert-base-german-cased'
tokenizer = AutoTokenizer.from_pretrained(GER_CHECKPOINT)
di_data = di_data.map(make_tokens, batched=True)
ds_train = di_data["train"]
ds_test = di_data["test"]
ds_valid = di_data["valid"]

model = AutoModelForSequenceClassification.from_pretrained(GER_CHECKPOINT, num_labels=19)

In [None]:
# Same trainer set-up as for the first model, now wrapping the model loaded in the cell above.
# EarlyStoppingCallback is configured with a patience of 3.
trainer = CustomTrainer(model = model,
                        args = training_args, 
                        train_dataset = ds_train, 
                        eval_dataset = ds_valid, 
                        compute_metrics = compute_metrics,
                        tokenizer = tokenizer,
                        callbacks = [EarlyStoppingCallback(early_stopping_patience=3)])

In [None]:
trainer.train()

In [None]:
trainer.save_model("/content/drive/MyDrive/hareg_nlp/04_Models/ger_bert")

## Predict

In [None]:
# Directory the fine-tuned German model was saved to by `trainer.save_model`.
path = "/content/drive/MyDrive/hareg_nlp/04_Models/ger_bert"

# Text-classification pipeline on the saved model; inputs are truncated.
# NOTE(review): device=0 presumably selects the first GPU — a CPU-only runtime would need a different value.
pipe = pipeline('text-classification',
    model=path,
    device=0,
    truncation=True)

In [None]:
# Predict every test text; the pipeline returns 'LABEL_<id>', which is reduced to the integer id.
test['label_pred'] = (
    test['gegenstand']
    .progress_apply(lambda txt: pipe(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

## Evaluate

In [None]:
trainer.evaluate()

In [None]:
y_true = test['label']
y_pred = test['label_pred']

# balanced_accuracy_score is the average of the per-class recalls, not the
# plain accuracy — the printed label below says so explicitly.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

# old distil-BERT

In [None]:
path_old_model = "/content/drive/MyDrive/capstoneproject/models/transformer/distilbert_overlap_chunking_checkpoint-2500"

## Single Predictions

In [None]:
def old_predict(txt_input):
    """Classify one text with the old DistilBERT model.

    Args:
        txt_input: text to classify.

    Returns:
        Tuple `(label, score)`: the label name from `di_int_to_label` and the
        model score in percent, rounded to two decimals.
    """
    # Loading the pipeline reads the whole model from disk, so it is built only
    # once per model path and cached on the function object.
    cached = getattr(old_predict, "_pipe_cache", None)
    if cached is None or cached[0] != path_old_model:
        cached = (path_old_model,
                  pipeline('text-classification',
                           model=path_old_model,
                           device=0,
                           truncation=True))
        old_predict._pipe_cache = cached
    pipe = cached[1]

    model_output = pipe(txt_input)
    # The pipeline reports labels as 'LABEL_<id>'.
    label_id = int(model_output[0]['label'].replace('LABEL_',''))
    score = model_output[0]['score']*100
    score = float("{:.2f}".format(score))
    label = di_int_to_label[label_id]

    return label, score

In [None]:
try_input = df.loc[100, "gegenstand"]
print(old_predict(try_input),"\n",try_input)

In [None]:
# Reusable text-classification pipeline for the old model (path_old_model); inputs are truncated.
# NOTE(review): device=0 presumably selects the first GPU — a CPU-only runtime would need a different value.
pipe_old = pipeline('text-classification',
           model=path_old_model,
           device=0,
           truncation=True)

## Test Sample

In [None]:
# Predict every test text with the old model; 'LABEL_<id>' is reduced to the integer id.
test['label_pred'] = (
    test['gegenstand']
    .progress_apply(lambda txt: pipe_old(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

In [None]:
y_true = test['label']
y_pred = test['label_pred']

# balanced_accuracy_score is the average of the per-class recalls, not the
# plain accuracy — the printed label below says so explicitly.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

## Whole Sample

In [None]:
# NOTE: this binds a second name to the same DataFrame object (no copy is made);
# every column added to or changed on `df_all` is therefore also visible on `df`.
df_all = df

In [None]:
# Predict every text of the whole sample with the old model; 'LABEL_<id>' is reduced to the integer id.
df_all['label_pred'] = (
    df_all['gegenstand']
    .progress_apply(lambda txt: pipe_old(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

In [None]:
y_true = df_all['label']
y_pred = df_all['label_pred']

# With average='weighted' the recall equals the plain accuracy, which is why
# the balanced accuracy (average of the per-class recalls) is reported
# instead — and labelled as such in the output.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

### Classification Report

In [None]:
# Human-readable copy of the predicted class ids.
df_all['label_pred_txt'] = df_all['label_pred'].replace(di_int_to_label)

# Select the mapping direction: integer id -> label name
# (the commented line is the opposite direction).
change = di_int_to_label
# change = di_label_to_int
# Convert prediction and ground truth to label names for the report below.
# NOTE: df_all was bound with `df_all = df`, so these columns change on `df` as well.
df_all['label_pred'] = df_all['label_pred'].replace(change)
df_all['label'] = df_all['label'].replace(change)

In [None]:
print(classification_report(df_all['label'].to_list(), df_all['label_pred'].to_list()))

### Confusion Matrix

In [None]:
labels = list(di_int_to_label.values())
fig, ax = plt.subplots(figsize=(12,12))
# Passing `labels` fixes the row/column order of the matrix to the order of the
# tick labels. Without it the matrix is ordered by the sorted label values,
# which does not match the id order used for the ticks, and classes that do
# not occur would be missing from the matrix altogether.
cm = confusion_matrix(df_all['label'], df_all['label_pred'], labels=labels)
f = sns.heatmap(cm, annot=True, fmt='d', linewidths=0.0, xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_whole.svg', bbox_inches='tight',
            dpi=300)


In [None]:
labels = list(di_int_to_label.values())
# `labels=` pins the row/column order to the tick-label order used for plotting;
# by default the matrix is ordered by the sorted label values instead.
cm = confusion_matrix(df_all['label'], df_all['label_pred'], labels=labels)
# Row-wise percentages. Rows without any true sample stay 0 instead of
# producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0) * 100

In [None]:
# Heatmap of the row-normalised confusion matrix (values in percent).
fig, ax = plt.subplots(figsize=(12,12))
f = sns.heatmap(cmn, annot=True, fmt='.0f', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_whole_relative.svg', bbox_inches='tight')

## Whole Sample Only Long
Durch die EDA wissen wir, dass die durchschnittliche Anzahl Zeichen ~520 beträgt. Der Median liegt bei ~344. Es existieren also viele Texte, die kürzer als der Durchschnittswert sind, aber auch einige lange Texte, welche die Verteilung positiv verzerren (positively skewed).
Wir filtern willkürlich bei einem Wert leicht unterhalb des Medians und beobachten, dass sowohl Accuracy als auch F1-Score um gut 7 bis 8 Prozentpunkte ansteigen.


In [None]:
# Keep only the rows with `zeichen` >= 300.
# .copy() makes df_long an independent frame: columns are assigned to it in the
# following cells, which must not act on a slice of the original data.
df_long = df_all.loc[df_all["zeichen"] >= 300].copy()
df_long.describe()

In [None]:
# Predict every long text with the old model; 'LABEL_<id>' is reduced to the integer id.
df_long['label_pred'] = (
    df_long['gegenstand']
    .progress_apply(lambda txt: pipe_old(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

In [None]:
# Human-readable copy of the predicted class ids.
df_long['label_pred_txt'] = df_long['label_pred'].replace(di_int_to_label)

# Select the mapping direction: label name -> integer id
# (the commented line is the opposite direction).
# change = di_int_to_label
change = di_label_to_int
# Bring prediction and ground truth to integer ids for the metrics below.
df_long['label_pred'] = df_long['label_pred'].replace(change)
df_long['label'] = df_long['label'].replace(change)

In [None]:
y_true = df_long['label']
y_pred = df_long['label_pred']

# balanced_accuracy_score is the average of the per-class recalls, not the
# plain accuracy — the printed label below says so explicitly.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

### Class Report

In [None]:
# Human-readable copy of the predicted class ids.
df_long['label_pred_txt'] = df_long['label_pred'].replace(di_int_to_label)

# Select the mapping direction: integer id -> label name
# (the commented line is the opposite direction).
change = di_int_to_label
# change = di_label_to_int
# Convert prediction and ground truth to label names for the report below.
df_long['label_pred'] = df_long['label_pred'].replace(change)
df_long['label'] = df_long['label'].replace(change)

In [None]:
print(classification_report(df_long['label'].to_list(), df_long['label_pred'].to_list()))

### Conf Matrix

In [None]:
labels = list(di_int_to_label.values())
fig, ax = plt.subplots(figsize=(12,12))
# Passing `labels` fixes the row/column order of the matrix to the order of the
# tick labels. Without it the matrix is ordered by the sorted label values,
# which does not match the id order used for the ticks, and classes that do
# not occur in the filtered sample would be missing from the matrix.
cm = confusion_matrix(df_long['label'], df_long['label_pred'], labels=labels)
f = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_whole_long.svg', bbox_inches='tight',
            dpi=300)

In [None]:
labels = list(di_int_to_label.values())
# `labels=` pins the row/column order to the tick-label order used for plotting;
# by default the matrix is ordered by the sorted label values instead.
cm = confusion_matrix(df_long['label'], df_long['label_pred'], labels=labels)
# Row-wise percentages. Rows without any true sample stay 0 instead of
# producing NaN through a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cmn = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0) * 100

In [None]:
# Heatmap of the row-normalised confusion matrix (values in percent).
fig, ax = plt.subplots(figsize=(12,12))
f = sns.heatmap(cmn, annot=True, fmt='.0f', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_whole_long_relative.svg', bbox_inches='tight')

# Website Classifier

### Parsen

In [None]:
# Code for tag_visible and extract_text taken from "pds".
def tag_visible(element):
    """Tell whether a text node counts as visible page text.

    Returns False when the node's parent is one of the non-content tags listed
    below or when the node is an HTML comment; True otherwise.
    """
    non_content_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')
    if element.parent.name in non_content_parents:
        return False
    return not isinstance(element, Comment)

def _meta_content(soup, **criteria):
    """Return the `content` attribute of the first matching <meta> tag, or None."""
    tag = soup.find("meta", **criteria)
    if tag is None:
        return None
    return tag.get('content')


def extract_text(raw_html):
    """Extract visible text, meta description and meta keywords from an HTML document.

    The extraction is best effort: every part that cannot be determined is
    returned as an empty string, the function does not raise for bad input.

    Args:
        raw_html: the HTML document.

    Returns:
        Tuple `(raw_text, description, keywords)`. In `raw_text` the visible
        text nodes are joined with '°' and repeated whitespace pairs are
        collapsed to a single space.
    """
    raw_text = ''
    description = ''
    keywords = ''

    try:
        soup = BeautifulSoup(raw_html, 'lxml')
    except Exception:
        # Nothing can be extracted without a parsed document.
        return raw_text, description, keywords

    # text
    try:
        visible_texts = filter(tag_visible, soup.find_all(string=True))
        text = u"°".join(t.strip() for t in visible_texts)
        raw_text = re.sub(r"(\s\s)+", " ", text)
    except Exception:
        # Keep going: the meta information may still be usable.
        pass

    # meta
    try:
        description = _meta_content(soup, attrs={'name': 'description'})
        if description is None:
            # Fall back to the Open Graph description.
            description = _meta_content(soup, property="og:description")
        if description is None:
            description = ''
    except Exception:
        description = ''
    try:
        keywords = _meta_content(soup, attrs={'name': 'keywords'})
        if keywords is None:
            keywords = ''
    except Exception:
        keywords = ''

    return raw_text, description, keywords

In [None]:
# Parse the HTML stored in `website` row by row; the three values returned by
# extract_text are expanded into the three new columns.
df[['site_text', 'description', 'keywords']] = df.progress_apply(lambda row: extract_text(row['website']), axis=1, result_type='expand')

### Decode UTF-8
Wird aktuell nicht benötigt. Die Strings enthalten Nicht-ASCII-Zeichen, daher ist die Dekodierung der Umlaute nicht ohne Weiteres umzusetzen.

In [None]:
# Remove certain special characters up front to ease the later processing.
def clean_ascii(text):
    """Return `text` as a string with '°' turned into a space, the sequence
    "b' " removed, doubled backslashes reduced to single ones and colons dropped.
    """
    cleaned = str(text)
    cleaned = re.sub('°', ' ', cleaned)
    cleaned = re.sub("b' ", '', cleaned)
    return cleaned.replace('\\\\', '\\').replace(':', '')

In [None]:
s = df.loc[4, "site_text"] 
bs = s.encode('raw-unicode-escape')  # encode to bytes without double-encoding
print(bs)

In [None]:
s = 'und m\xc3\xb6glicherweise'
bs = s.encode('raw-unicode-escape')  # encode to bytes without double-encoding
print(bs)


In [None]:
decoded = bs.decode('utf-8')
print(decoded)

### Cleanen

In [None]:
# Concatenate page text, meta description and meta keywords (space separated).
# `full_text` is cleaned in the following cells, `full_text_raw` keeps the untouched version.
text_parts = [df['site_text'], df['description'], df['keywords']]
df['full_text'] = text_parts[0] + ' ' + text_parts[1] + ' ' + text_parts[2]
df['full_text_raw'] = df['full_text'].copy()

In [None]:
def clean(text):
    """Basic cleanup of scraped website text.

    Steps, in order: newlines (real and escaped) become spaces, the substring
    'html' is removed, '°' becomes a space, double spaces are halved once,
    runs of commas become one comma, apostrophes are removed and
    single-character words are deleted.

    Args:
        text: input text; non-string values are converted with `str()`.

    Returns:
        The cleaned string.
    """
    text = str(text)
    text = text.replace('\n',' ')
    text = text.replace('\\n',' ')
    text = text.replace('html','')
    text = re.sub('°', ' ', text)
    text = re.sub('  ', ' ', text)
    text = re.sub(',+', ',', text)
    text = re.sub("'", "", text)
    # Raw string is required here: in a normal string literal "\b" is the
    # backspace character, so the pattern never matched a word boundary and
    # no single-character word was ever removed.
    text = re.sub(r"\b\w\b", "", text)
    return text

In [None]:
def remove_words(text, char_list):
    """Blank out every word of `text` that contains one of the given substrings.

    Args:
        text: whitespace-separated input string.
        char_list: substrings; a word containing any of them is removed.

    Returns:
        The words joined by single spaces. A removed word leaves an empty
        slot, i.e. two consecutive spaces, exactly as before. If `text`
        contains no words at all it is returned unchanged.
    """
    words = text.split()
    if not words:
        return text
    # Join once at the end; joining inside the loop made the function quadratic
    # in the number of words.
    return " ".join(
        "" if any(phrase in word for phrase in char_list) else word
        for word in words
    )

In [None]:
def clean_sonderzeichen(text):
    """Replace every run of non-word characters in `text` by a single space.

    Non-string input is converted with `str()` first.
    """
    # Raw string: "\W" in a normal string literal is an invalid escape sequence.
    return re.sub(r"\W+", " ", str(text))

In [None]:
def remove_special_words(text):
    """Delete every occurrence of the substrings 'inkl' and 'mwst' from `text`."""
    for fragment in ('inkl', 'mwst'):
        text = text.replace(fragment, '')
    return text

In [None]:
# General cleanup.
df['full_text'] = df['full_text'].apply(clean)

In [None]:
# Remove words that contain a backslash, an opening brace or a digit.
char_list = ["\\", "{"] + [str(digit) for digit in (1, 2, 3, 4, 5, 6, 7, 8, 9, 0)]
df['full_text'] = df['full_text'].progress_apply(lambda txt: remove_words(txt, char_list))

In [None]:
# Remove all special characters.
df['full_text'] = df['full_text'].apply(clean_sonderzeichen)

In [None]:
# Adapted preprocessing (stop words, punctuation, lemmas).
df['full_text'] = df['full_text'].progress_apply(opt_preprocess)

In [None]:
# Lower-case the text, then lemmatize it with `lemma`.
# Lemmatizing improves the F1 score by 0.01 percentage points.
df['full_text'] = df['full_text'].apply(str.lower)
df['full_text'] = df['full_text'].progress_apply(lemma)

In [None]:
# Remove single letters.
df['full_text'] = df['full_text'].progress_apply(remove_letter)

In [None]:
# Remove single letters (2), because single letters are still present after (1).
def kill_single_letter(text):
    """Drop every whitespace-separated word of length 1 and re-join with single spaces."""
    survivors = []
    for word in text.split():
        if len(word) > 1:
            survivors.append(word)
    return ' '.join(survivors)

df['full_text'] = df['full_text'].progress_apply(kill_single_letter)

In [None]:
# final strip
df['full_text'] = df.full_text.progress_apply(lambda text: re.sub(' +', ' ', text)) 

In [None]:
# remove special words
# df['full_text'] = df.full_text.progress_apply(lambda text: remove_special_words(text)) 
# !besser nicht, haben Aussagekraft

## Classify Website

In [None]:
# Classify the cleaned website text with the old model; 'LABEL_<id>' is reduced to the integer id.
df['web_pred'] = (
    df['full_text']
    .progress_apply(lambda txt: pipe_old(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

In [None]:
y_true = df['label_int']
y_pred = df['web_pred']

# balanced_accuracy_score is the average of the per-class recalls, not the
# plain accuracy — the printed label below says so explicitly.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average="weighted")
precision = precision_score(y_true, y_pred, average="weighted")
recall = recall_score(y_true, y_pred, average="weighted")

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

In [None]:
df['web_pred_txt'] = df['web_pred'].replace(di_int_to_label)
print(classification_report(df['label'].to_list(), df['web_pred_txt'].to_list()))

In [None]:
labels = list(di_int_to_label.values())
fig, ax = plt.subplots(figsize=(12,12))
# `web_pred_txt` is created on `df`, so `df` is read here (`df_all` is only a
# second name for the same frame).
# Passing `labels` fixes the row/column order of the matrix to the order of the
# tick labels; without it the matrix is ordered by the sorted label values,
# which does not match the id order used for the ticks.
cm = confusion_matrix(df['label'], df['web_pred_txt'], labels=labels)
f = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_website.svg', bbox_inches='tight',
            dpi=300)

## Classify Combi

In [None]:
df['combi_text'] = df['gegenstand'] + ' ' + df['full_text'] 

In [None]:
# Classify the combined text with the old model; 'LABEL_<id>' is reduced to the integer id.
df['combi_pred'] = (
    df['combi_text']
    .progress_apply(lambda txt: pipe_old(txt)[0]['label'].replace('LABEL_',''))
    .astype(int)
)

In [None]:
y_true = df['label_int']
y_pred = df['combi_pred']

# balanced_accuracy_score is the average of the per-class recalls, not the
# plain accuracy — the printed label below says so explicitly.
accuracy = balanced_accuracy_score(y_true, y_pred)
f1_metric = f1_score(y_true, y_pred, average='weighted')
precision = precision_score(y_true, y_pred, average='weighted')
recall = recall_score(y_true, y_pred, average='weighted')

print("Balanced Accuracy: {:.4f}".format(accuracy))
print("F1-score: {:.4f}".format(f1_metric))
print("Precision: {:.4f}".format(precision))
print("Recall: {:.4f}".format(recall))

In [None]:
df['combi_pred_txt'] = df['combi_pred'].replace(di_int_to_label)
print(classification_report(df['label'].to_list(), df['combi_pred_txt'].to_list()))

In [None]:
labels = list(di_int_to_label.values())
fig, ax = plt.subplots(figsize=(12,12))
# `combi_pred_txt` is created on `df`, so `df` is read here (`df_all` is only a
# second name for the same frame).
# Passing `labels` fixes the row/column order of the matrix to the order of the
# tick labels; without it the matrix is ordered by the sorted label values,
# which does not match the id order used for the ticks.
cm = confusion_matrix(df['label'], df['combi_pred_txt'], labels=labels)
f = sns.heatmap(cm, annot=True, fmt='d', xticklabels=labels, yticklabels=labels, ax=ax)
ax.set_ylabel('Actual')
ax.set_xlabel('Predicted')
fig.savefig('cm_combi.svg', bbox_inches='tight',
            dpi=300)

In [None]:
# # Plot Precision-Recall curve for each class and iso-f1 curves
# # von https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html

# import matplotlib.pyplot as plt
# from itertools import cycle

# # setup plot details
# colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])

# _, ax = plt.subplots(figsize=(7, 8))

# f_scores = np.linspace(0.2, 0.8, num=4)
# lines, labels = [], []
# for f_score in f_scores:
#     x = np.linspace(0.01, 1)
#     y = f_score * x / (2 * x - f_score)
#     (l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
#     plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))

# display = PrecisionRecallDisplay(
#     recall=recall["micro"],
#     precision=precision["micro"],
#     average_precision=average_precision["micro"],
# )
# display.plot(ax=ax, name="Micro-average precision-recall", color="gold")

# for i, color in zip(range(n_classes), colors):
#     display = PrecisionRecallDisplay(
#         recall=recall[i],
#         precision=precision[i],
#         average_precision=average_precision[i],
#     )
#     display.plot(ax=ax, name=f"Precision-recall for class {i}", color=color)

# # add the legend for the iso-f1 curves
# handles, labels = display.ax_.get_legend_handles_labels()
# handles.extend([l])
# labels.extend(["iso-f1 curves"])
# # set the legend and the axes
# ax.set_xlim([0.0, 1.0])
# ax.set_ylim([0.0, 1.05])
# ax.legend(handles=handles, labels=labels, loc="best")
# ax.set_title("Extension of Precision-Recall curve to multi-class")

# plt.show()

# Combine separate Classifiers

In [None]:
# direkt argmax der W'keiten beider Classifier
# oder erst Durchschnitt der W'keiten beider Classifier bilden, dann argmax
# weitere? was sagt die Literatur?
# Trainieren separeter Modelle 
# wie können mehrer DL Modelle kombiniert werden?

# Wordcloud

Wordcloud und Transformer laufen nicht mehr im gleichen Environment

In [None]:
!pip install -U pillow