In [None]:
# !pip install -r requirements.txt
# !python -m spacy download de_core_news_sm

In [23]:
from warnings import filterwarnings

filterwarnings("ignore")

In [3]:
import os
import re
from pprint import pprint
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import spacy
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, roc_auc_score

In [4]:
def get_html_text(filepath):
    with open(filepath, encoding='ISO-8859-1') as f:
        soup = BeautifulSoup(f.read(), "html")
    
    tags_to_decompose = [
        tag 
        for tag_name in ("style", "script")
        for tag in soup.find_all(tag_name)
    ]
    
    for tag in tags_to_decompose:
        tag.decompose()
    
    return soup.get_text(" ")

get_html_text(os.path.join("htmls", "0.html"))

'\n \n \n \n \n \n BauchspeicheldrÃ¼se | Klinik fÃ¼r Allgemein-, Viszeral- und Kinderchirurgie GÃ¶ttingen \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n Klinik fÃ¼r Allgemein-, Viszeral- und Kinderchirurgie \n zur Hauptnavigation springen \n zum Inhalt wechseln \n \n Aktuelles und Kontakt Kontakt \n \n \n Logo der UniverstÃ¤tsmedizin GÃ¶ttingen \n \n \n \n \n \n \n \n \n \n Navigation Ã¶ffnen oder schliessen \n \n \n \n \n \n \n Hauptnavigation Subnavigation Ã¶ffnen oder schliessen Medizinische Versorgung Poliklinik Sonographie SchilddrÃ¼se SpeiserÃ¶hre und Magen Darm BauchspeicheldrÃ¼se CED Leber und Galle Hernien Koloproktologie Adipositaschirurgie Kinderchirurgie Sarkomchirurgie HIPEC Roboterchirurgie InterdisziplinÃ¤re Zentren Subnavigation Ã¶ffnen oder schliessen Forschung Klinische Studien Tumorepigenetik AG Conradi AG Gaedcke AG Grade AG Krause AG Sperling AG Sprenger AG Wegwitz Promotion Publikationen Subnavigation Ã¶ffnen oder schliessen Lehre Module Blockpraktikum Famulaturen/P

In [5]:
def get_all_texts():
    return {
        int(file.replace(".html", "")): get_html_text(os.path.join("htmls", file))
        for file in os.listdir("htmls")
    }

all_texts = get_all_texts()

In [6]:
train_df = pd.read_csv("train.csv")
train_df

Unnamed: 0,url,doc_id,label
0,http://elbe-elster-klinikum.de/fachbereiche/ch...,1,1
1,http://klinikum-bayreuth.de/einrichtungen/zent...,3,3
2,http://klinikum-braunschweig.de/info.php/?id_o...,4,1
3,http://klinikum-braunschweig.de/info.php/?id_o...,5,1
4,http://klinikum-braunschweig.de/zuweiser/tumor...,6,3
...,...,...,...
95,http://www.unicross.uni-freiburg.de/thema/unifm/,140,1
96,http://www.uniklinik-duesseldorf.de/patienten-...,141,1
97,http://www.vivantes.de/fuer-sie-vor-ort/klinik...,144,2
98,http://www.vivantes.de/fuer-sie-vor-ort/klinik...,145,2


In [7]:
train_df.label.value_counts()

2    59
1    32
3     9
Name: label, dtype: int64

In [11]:
del train_df['url']
train_df['text'] = train_df['doc_id'].apply(lambda x: all_texts[x])
train_df

Unnamed: 0,doc_id,label,text
0,1,1,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...
1,3,3,\n \n \n \n \n \n \n Onkologisches Zentrum - K...
2,4,1,\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...
3,5,1,\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...
4,6,3,\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...
...,...,...,...
95,140,1,\n \n \n \n \n \n \n \n \n \n \n uniFM | uniCR...
96,141,1,\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \...
97,144,2,\n \n \n \n \n \n \n \n \n \n \n \n FÃ¼r Ãrzt...
98,145,2,\n \n \n \n \n \n \n \n \n \n \n \n \n \n Inne...


In [12]:
class_3_sample = train_df[train_df.label == 3].sample(20, replace=True)
train_df = pd.concat([train_df[train_df.label != 3], class_3_sample])
train_df.label.value_counts()

2    59
1    32
3    20
Name: label, dtype: int64

In [13]:
nlp = spacy.load("de_core_news_sm")

In [20]:
def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)
    text = re.sub('[\W]+', '', text.lower())
    
    return text

def lemmatize_sentence(sent):
    lemmas = [
        preprocessor(token.lemma_)
        for token in sent
        if all([
            not token.is_stop,
            token.pos_ not in {"SPACE", "PUNCT"}
        ])
    ]
    return [lem for lem in lemmas if lem]

def clean_text(text):
    global nlp
    
    doc = nlp(text)
    
    tokens = [
        tok
        for sent in doc.sents
        for tok in lemmatize_sentence(sent)
    ]
    return " ".join(tokens)

def vectorize(text):
    global nlp
    
    return nlp(text).vector

In [18]:
train_df['cleaned'] = train_df['text'].apply(clean_text)
train_df = train_df[train_df.cleaned != '']

train_df

NameError: name 'text' is not defined

In [21]:
train_df['vectors'] = train_df['cleaned'].apply(vectorize)
train_df

Unnamed: 0,doc_id,label,text,tokens,cleaned,vectors
0,1,1,\n \n \n \n \n \n Elbe-Elster Klinikum - Chiru...,elbeelster klinikum chirurgie finsterwalde suc...,elbeelster klinikum chirurgie finsterwalde suc...,"[-0.43255353, 0.4135299, 1.1170418, 0.21382248..."
2,4,1,\n \n Zentrum - SozialpÃ¤diatrisches Zentrum -...,zentrum sozialpãdiatrisches zentrum stãdtische...,zentrum sozialpãdiatrisches zentrum stãdtische...,"[-0.30636021, 0.18710062, 0.9198907, 0.2302482..."
3,5,1,\n \n Leistung - Spezielle UnterstÃ¼tzung bei ...,leistung speziell unterstã¼tzung anmeldung geb...,leistung speziell unterstã¼tzung anmeldung geb...,"[-0.3820183, 0.18181883, 1.0549333, 0.31993324..."
5,8,1,\n \n \n Krebszentrum Reutlingen: Impressum - ...,krebszentrum reutlingen impressum kreisklinike...,krebszentrum reutlingen impressum kreisklinike...,"[0.22153619, 0.19331177, 1.2188078, 0.26290113..."
6,9,1,\n \n \n Ãsthetische Brustchirurgie - krebsze...,ãsthetische brustchirurgie krebszentrumkreiskl...,ãsthetische brustchirurgie krebszentrumkreiskl...,"[0.15936552, 0.19908608, 1.125857, 0.3426929, ..."
...,...,...,...,...,...,...
65,100,3,\n \n \n \n \n Ãrztliche Weiterbildung \n \n ...,ãrztliche weiterbildung impressum datenschutz ...,ãrztliche weiterbildung impressum datenschutz ...,"[-0.45951912, 0.4303466, 1.3440295, 0.11675749..."
65,100,3,\n \n \n \n \n Ãrztliche Weiterbildung \n \n ...,ãrztliche weiterbildung impressum datenschutz ...,ãrztliche weiterbildung impressum datenschutz ...,"[-0.45951912, 0.4303466, 1.3440295, 0.11675749..."
4,6,3,\n \n Zuweiser - Tumorkonferenzen - Tumorkonfe...,zuweiser tumorkonferenzen tumorkonferenz gastr...,zuweiser tumorkonferenzen tumorkonferenz gastr...,"[-0.3597676, 0.47311553, 1.2677805, 0.28748557..."
65,100,3,\n \n \n \n \n Ãrztliche Weiterbildung \n \n ...,ãrztliche weiterbildung impressum datenschutz ...,ãrztliche weiterbildung impressum datenschutz ...,"[-0.45951912, 0.4303466, 1.3440295, 0.11675749..."


In [24]:
x, y = np.stack(train_df.vectors), train_df.label.to_numpy()

X_train, x_val, Y_train, y_val = train_test_split(x, y, train_size=0.9, shuffle=True, random_state=1, stratify=y)

In [25]:
def eval_model(model, x_test, y_test):
    y_pred = model.predict(x_test)
    
    acc = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    
    
    print(f'Accuracy {acc}\n')
    print(conf_matrix)
    print()
    print(report)

In [26]:
from sklearn.linear_model import LogisticRegression

lgr = LogisticRegression(n_jobs=10, C=1, class_weight="balanced")
lgr.fit(X_train, Y_train)

eval_model(lgr, x_val, y_val)

Accuracy 0.6666666666666666

[[3 1 0]
 [1 4 1]
 [0 1 1]]

              precision    recall  f1-score   support

           1       0.75      0.75      0.75         4
           2       0.67      0.67      0.67         6
           3       0.50      0.50      0.50         2

   micro avg       0.67      0.67      0.67        12
   macro avg       0.64      0.64      0.64        12
weighted avg       0.67      0.67      0.67        12



In [27]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(penalty='l2', alpha=1e-3, random_state=42, max_iter=10000, tol=None)
sgd.fit(X_train, Y_train)

eval_model(sgd, x_val, y_val)

Accuracy 0.8333333333333334

[[4 0 0]
 [2 4 0]
 [0 0 2]]

              precision    recall  f1-score   support

           1       0.67      1.00      0.80         4
           2       1.00      0.67      0.80         6
           3       1.00      1.00      1.00         2

   micro avg       0.83      0.83      0.83        12
   macro avg       0.89      0.89      0.87        12
weighted avg       0.89      0.83      0.83        12



In [29]:
Y_train_bin = np.array([2 if l == 3 else l for l in Y_train])
y_val_bin = np.array([2 if l == 3 else l for l in y_val])

In [30]:
from sklearn.linear_model import SGDClassifier

sgd = SGDClassifier(penalty='l2', alpha=1e-3, random_state=42, max_iter=10000, tol=None)
sgd.fit(X_train, Y_train_bin)

eval_model(sgd, x_val, y_val_bin)

Accuracy 0.8333333333333334

[[4 0]
 [2 6]]

              precision    recall  f1-score   support

           1       0.67      1.00      0.80         4
           2       1.00      0.75      0.86         8

   micro avg       0.83      0.83      0.83        12
   macro avg       0.83      0.88      0.83        12
weighted avg       0.89      0.83      0.84        12

