# Text Classification 

### Import all bibliographies

In [14]:
import spacy
from spacy.util import minibatch
from spacy.training.example import Example
import random
import os
import csv
import json
import shutil
from sklearn.model_selection import KFold

### Config

In [15]:
# path where the trained model should be saved

model_dir = './trained_model'
# load the german spaCy model
nlp = spacy.load('de_core_news_lg')

# add the text classifier to the pipeline if it doesn't exist
if 'textcat_multilabel' not in nlp.pipe_names:
    textcat = nlp.add_pipe('textcat_multilabel')
else:
    textcat = nlp.get_pipe('textcat_multilabel')

# define the labels
labels = {"Umwelt", "Bildung", "Gesundheit", "Wirtschaft"}

### Main function with training, testing, evaluation and cross validation

In [16]:
# proof if model already exists
if os.path.exists(model_dir):
    nlp = spacy.load(model_dir)
else:
    os.mkdir(model_dir)
    for label in labels:
        textcat.add_label(label)

    # path to the json file with the training data
    file_path = './training_data.json'

    # read the training data
    with open(file_path, 'r', encoding='utf-8') as file:
        train_data = json.load(file)

    kf = KFold(n_splits=5, shuffle=True, random_state=42)


# cross validation
for fold_number, (train_index, test_index) in enumerate(kf.split(train_data), start=1):
    # Trainings- und Testdaten für diesen Fold extrahieren
    train_fold = [train_data[i] for i in train_index]
    test_fold = [train_data[i] for i in test_index]

    # training data for this fold
    train_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in train_fold]

    # train the model for this fold
    optimizer = nlp.begin_training()
    for i in range(10):
        random.shuffle(train_examples)
        losses = {}
        for batch in minibatch(train_examples, size=2):
            for example in batch:
                nlp.update([example], drop=0.2, losses=losses)
        print(f"Verluste im Fold {fold_number}, Iteration {i}: {losses}")

    # test data for this fold
    test_examples = [Example.from_dict(nlp.make_doc(text), annotations) for text, annotations in test_fold]

    # evaluation for this model and fold
    scores = nlp.evaluate(test_examples)
    print(f"Evaluierungsergebnisse für Fold {fold_number}: {scores}")

# print the evaluation results
print(f"Evaluation results: {scores}")

# directory with the extracted text files
extracted_text_directory = '../1 data_preprocessing/output'
output_folder = './labels'
os.makedirs(output_folder, exist_ok=True)
allParties = "Party,Label,Percentage\n"

# classify the text files
for filename in os.listdir(extracted_text_directory):
    print("Start processing files")
    counterUmwelt = 0
    counterBildung = 0
    counterGesundheit = 0
    counterWirtschaft = 0
    if filename.endswith('.txt'):
        file_path = os.path.join(extracted_text_directory, filename)
        with open(file_path, 'r', encoding='utf-8') as file:
            text = file.read()
            texts = text.split(". ")
            countSentences = len(texts)
            for text in texts:
                doc = nlp(text)
                scores = {label: doc.cats[label] for label in labels}
                max_label = max(scores, key=scores.get)
                if max_label == "Umwelt":
                    counterUmwelt += 1
                if max_label == "Bildung":
                    counterBildung += 1
                if max_label == "Gesundheit":
                    counterGesundheit += 1
                if max_label == "Wirtschaft":
                    counterWirtschaft += 1

            csv_file_path = os.path.join(output_folder, filename.replace('.txt', '.csv'))
            with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
                text = "Label,Percentage\n"
                for column, counter in zip(["Bildung", "Wirtschaft", "Gesundheit", "Umwelt"], [counterBildung, counterWirtschaft, counterGesundheit, counterUmwelt]):
                    text += column + "," + str(round(((counter / countSentences) * 100), 2)) + "\n"
                    allParties += filename.replace('.txt', '') + "," + column + "," + str(round(((counter / countSentences) * 100), 2)) + "\n"
                text = text[:-1]
                csvfile.write(text)


Verluste im Fold 1, Iteration 0: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0, 'textcat_multilabel': 37.255113361244526}
Verluste im Fold 1, Iteration 1: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0, 'textcat_multilabel': 7.6698653678516}
Verluste im Fold 1, Iteration 2: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0, 'textcat_multilabel': 3.5171978597967666}
Verluste im Fold 1, Iteration 3: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0, 'textcat_multilabel': 2.4596295098054526}
Verluste im Fold 1, Iteration 4: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 0.0, 'textcat_multilabel': 1.8127355250003347}
Verluste im Fold 1, Iteration 5: {'tok2vec': 0.0, 'tagger': 0.0, 'morphologizer': 0.0, 'parser': 0.0, 'lemmatizer': 0.0, 'ner': 

  matches = self.matcher(doc, allow_missing=True, as_spans=False)


Evaluierungsergebnisse für Fold 1: {'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'sents_p': None, 'sents_r': None, 'sents_f': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'lemma_acc': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.988310106073264, 'cats_score_desc': 'macro AUC', 'cats_micro_p': 0.925, 'cats_micro_r': 0.925, 'cats_micro_f': 0.925, 'cats_macro_p': 0.9354166666666666, 'cats_macro_r': 0.9244949494949494, 'cats_macro_f': 0.9267984466513879, 'cats_macro_auc': 0.988310106073264, 'cats_f_per_type': {'Gesundheit': {'p': 1.0, 'r': 0.9090909090909091, 'f': 0.9523809523809523}, 'Bildung': {'p': 0.7916666666666666, 'r': 0.95, 'f': 0.8636363636363635}, 'Wirtschaft': {'p': 0.95, 'r': 0.95, 'f': 0.9500000000000001}, 'Umwelt': {'p': 1.0, 'r': 0.88888888888

### Results in a csv

In [12]:
# write the results to a csv file
csv_file_path = os.path.join(output_folder, 'classification_results.csv')
with open(csv_file_path, 'w', newline='', encoding='utf-8') as csvfile:
    allParties = allParties[:-1]
    csvfile.write(allParties)

### Write the results into the frontend

In [13]:
# copy the folder to the frontend
shutil.copytree('./labels', '../../frontend/src/pages/charts/data/labels', dirs_exist_ok=True)

'../../frontend/src/pages/charts/data/labels'