# 1. Data Loading and Preprocessing

In [1]:
# Importing necessary libraries
import keras
import keras.layers as layers
import pandas as pd
import json
import tensorflow as tf
from collections import Counter
import numpy as np
import emoji
import string
import re


2024-10-01 13:07:51.770004: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-01 13:07:51.773400: I external/local_tsl/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-10-01 13:07:51.814697: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# Load data
train_filepath = 'data/germeval-trial-train.jsonl' 
test_filepath = 'data/germeval-trial-test.jsonl'  

train_data = open(train_filepath).read()
test_data = open(test_filepath).read()

json_train_data = [json.loads(jline) for jline in train_data.splitlines()]
json_test_data = [json.loads(jline) for jline in test_data.splitlines()]

numTrainSamples = len(json_train_data)

# Combine train and test data into a DataFrame
df = pd.DataFrame(json_train_data + json_test_data)
X = list(df['text'])


In [3]:
# Get annotator IDs
def get_annotator_ids(train_filepath, test_filepath):
    train_data = open(train_filepath).read()
    json_train_data = [json.loads(jline) for jline in train_data.splitlines()]
    train_annotator_ids = sorted(set([ann['user'] for annotations in json_train_data for ann in annotations['annotations']]))

    test_data = open(test_filepath).read()
    json_test_data = [json.loads(jline) for jline in test_data.splitlines()]
    test_annotator_ids = sorted(set([annotator for annotations in json_test_data for annotator in annotations['annotators']]))

    return sorted(set(train_annotator_ids + test_annotator_ids))

annotator_ids = get_annotator_ids(train_filepath, test_filepath)

In [4]:
# Feature extraction functions
def count_special_characters(text):
    return len(re.findall(r'[!@#$%^&*(),.?":{}|<>]', text))

def count_capitalized_words(text):
    return sum(1 for word in text.split() if word.isupper())

def count_emojis(text):
    return len(emoji.emoji_list(text))

def count_characters(text):
    return len(text)

def count_words(text):
    return len(text.split())

def count_punctuations(text):
    return sum(1 for char in text if char in string.punctuation)

In [5]:
# Extract additional features
def get_additional_features(filepath, annotator_ids, annotations_key):
    data = open(filepath).read()
    json_data = [json.loads(jline) for jline in data.splitlines()]
    df = pd.DataFrame(json_data)

    annotator_presence = []
    for annotations in df[annotations_key]:
        doc_vector = [0] * len(annotator_ids)
        for ann in annotations:
            user = ann if annotations_key == 'annotators' else ann['user']
            idx = annotator_ids.index(user)
            doc_vector[idx] = 1
        annotator_presence.append(doc_vector)

    annotator_presence_matrix = np.array(annotator_presence)

    df['special_char_count'] = df['text'].apply(count_special_characters)
    df['capitalized_word_count'] = df['text'].apply(count_capitalized_words)
    df['char_count'] = df['text'].apply(count_characters)
    df['word_count'] = df['text'].apply(count_words)
    df['punctuation_count'] = df['text'].apply(count_punctuations)

    special_char_count = np.array(df['special_char_count']).reshape(-1, 1)
    capitalized_word_count = np.array(df['capitalized_word_count']).reshape(-1, 1)
    char_count = np.array(df['char_count']).reshape(-1, 1)
    word_count = np.array(df['word_count']).reshape(-1, 1)
    punctuation_count = np.array(df['punctuation_count']).reshape(-1, 1)
    
    additional_features = np.hstack([annotator_presence_matrix, special_char_count, capitalized_word_count, 
                                     char_count, word_count, punctuation_count])

    return additional_features

additional_features = get_additional_features(train_filepath, annotator_ids, 'annotations')
additional_features_test = get_additional_features(test_filepath, annotator_ids, 'annotators')


In [6]:
# Label preparation
dictlabels = {'bin_maj': [], 'bin_one': [], 'bin_all': [], 'multi_maj': [], 'disagree_bin': [], 'dist_bin_0': [], 'dist_bin_1': [], 'dist_multi_0': [], 
            'dist_multi_1': [], 'dist_multi_2': [], 'dist_multi_3': [], 'dist_multi_4': []}

for annotations in df['annotations'][:numTrainSamples]:
    labels = [ann['label'] for ann in annotations]
    label_count = Counter(labels)
    total_annotations = sum(label_count.values())

    dictlabels['bin_maj'].append(0 if label_count['0-Kein'] > total_annotations / 2 else 1)
    dictlabels['bin_one'].append(1 if any(label != '0-Kein' for label in labels) else 0)
    dictlabels['bin_all'].append(1 if all(label != '0-Kein' for label in labels) else 0)

    dict_label_count = {'0-Kein': labels.count('0-Kein'), '1-Gering': labels.count('1-Gering'), 
                        '2-Vorhanden': labels.count('2-Vorhanden'), '3-Stark': labels.count('3-Stark'), '4-Extrem': labels.count('4-Extrem')}
    
    max_label = max(dict_label_count, key=dict_label_count.get)
    dictlabels['multi_maj'].append([int(max_label == l) for l in dict_label_count])

    dictlabels['disagree_bin'].append(1 if len(label_count) > 1 and '0-Kein' in label_count else 0)

    dictlabels['dist_bin_0'].append(label_count['0-Kein'] / total_annotations)
    dictlabels['dist_bin_1'].append((total_annotations - label_count['0-Kein']) / total_annotations)
    
    dictlabels['dist_multi_0'].append(label_count['0-Kein'] / total_annotations)
    dictlabels['dist_multi_1'].append(label_count['1-Gering'] / total_annotations)
    dictlabels['dist_multi_2'].append(label_count['2-Vorhanden'] / total_annotations)
    dictlabels['dist_multi_3'].append(label_count['3-Stark'] / total_annotations)
    dictlabels['dist_multi_4'].append(label_count['4-Extrem'] / total_annotations)

y_bin_maj = np.array(dictlabels['bin_maj'])
y_bin_one = np.array(dictlabels['bin_one'])
y_bin_all = np.array(dictlabels['bin_all'])
y_multi_maj = np.array(dictlabels['multi_maj'])
y_disagree_bin = np.array(dictlabels['disagree_bin'])
y_dist_bin_0 = np.array(dictlabels['dist_bin_0'])
y_dist_bin_1 = np.array(dictlabels['dist_bin_1'])
y_dist_multi_0 = np.array(dictlabels['dist_multi_0'])
y_dist_multi_1 = np.array(dictlabels['dist_multi_1'])
y_dist_multi_2 = np.array(dictlabels['dist_multi_2'])
y_dist_multi_3 = np.array(dictlabels['dist_multi_3'])
y_dist_multi_4 = np.array(dictlabels['dist_multi_4'])

# Create label arrays
y_train_combined = np.column_stack([np.array(dictlabels[key]) for key in dictlabels.keys()])


In [7]:
# Text sequence processing
X_seq = [tf.keras.preprocessing.text.text_to_word_sequence(item, lower=True, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n', split=' ') for item in X]

# Map words to integers
all_words = [word for seq in X_seq for word in seq]
unique_words = set(all_words)
integer_mapping = {word: i for i, word in enumerate(unique_words)}
X_int_seq = [[integer_mapping[word] for word in seq] for seq in X_seq]

# Pad sequences
vocab_size = len(integer_mapping)
maxlen = 80
X_padded = tf.keras.preprocessing.sequence.pad_sequences(X_int_seq, maxlen=maxlen)


# 2. Model Definition and Training

In [8]:
# Load FastText embeddings
fasttext_path = 'pretrained_embeddings/cc.de.300.vec'
embedding_dim = 300

embeddings_index = {}
with open(fasttext_path, encoding='utf-8') as f:
    for line in f:
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


In [9]:
# Prepare embedding matrix
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in integer_mapping.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector


In [10]:
# Define embedding layer
embedding_layer = layers.Embedding(input_dim=vocab_size,
                                   output_dim=embedding_dim,
                                   weights=[embedding_matrix],
                                   input_length=maxlen,
                                   trainable=False)


2024-10-01 13:09:19.466303: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:998] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2024-10-01 13:09:19.466827: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2251] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [12]:
# Transformer block definition
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)])
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None, **kwargs):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)


In [13]:
# Model architecture
num_heads = 2
ff_dim = 30

text_input = layers.Input(shape=(maxlen,))
features_input = layers.Input(shape=(additional_features.shape[1],))
x = embedding_layer(text_input)
x = TransformerBlock(embedding_dim, num_heads, ff_dim)(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Concatenate()([x, features_input])
x = layers.Dropout(0.5)(x)
x = layers.Dense(512, activation='leaky_relu')(x)
x = layers.Dropout(0.5)(x)

# Define the output layers with appropriate activations
output_bin_maj = layers.Dense(1, activation='sigmoid', name='output_bin_maj')(x)
output_bin_one = layers.Dense(1, activation='sigmoid', name='output_bin_one')(x)
output_bin_all = layers.Dense(1, activation='sigmoid', name='output_bin_all')(x)
output_multi_maj = layers.Dense(5, activation='softmax', name='output_multi_maj')(x)  # Multi-class classification
output_disagree_bin = layers.Dense(1, activation='sigmoid', name='output_disagree_bin')(x)
output_dist_bin_0 = layers.Dense(1, activation='sigmoid', name='output_dist_bin_0')(x)
output_dist_bin_1 = layers.Dense(1, activation='sigmoid', name='output_dist_bin_1')(x)
output_dist_multi_0 = layers.Dense(1, activation='sigmoid', name='output_dist_multi_0')(x)
output_dist_multi_1 = layers.Dense(1, activation='sigmoid', name='output_dist_multi_1')(x)
output_dist_multi_2 = layers.Dense(1, activation='sigmoid', name='output_dist_multi_2')(x)
output_dist_multi_3 = layers.Dense(1, activation='sigmoid', name='output_dist_multi_3')(x)
output_dist_multi_4 = layers.Dense(1, activation='sigmoid', name='output_dist_multi_4')(x)

# Combine the model outputs
model = tf.keras.Model(inputs=[text_input,features_input], outputs=[
    output_bin_maj, output_bin_one, output_bin_all, output_multi_maj, output_disagree_bin,
    output_dist_bin_0, output_dist_bin_1, output_dist_multi_0, output_dist_multi_1,
    output_dist_multi_2, output_dist_multi_3, output_dist_multi_4
])



# Compile the model with different losses for different outputs
model.compile(
    optimizer="adamW",
    loss={
        "output_bin_maj": "binary_focal_crossentropy",  # For binary classification (majority vote)
        "output_bin_one": "binary_focal_crossentropy",  # For binary classification (one-vote)
        "output_bin_all": "binary_focal_crossentropy",  # For binary classification (all-vote)
        "output_multi_maj": "categorical_focal_crossentropy",  # For multi-class classification (majority vote)
        "output_disagree_bin": "binary_focal_crossentropy",  # For binary classification (disagreement)
        "output_dist_bin_0": "mse",  # For regression (distribution for '0-Kein')
        "output_dist_bin_1": "mse",  # For regression (distribution for '1-Gering')
        "output_dist_multi_0": "mse",  # For regression (distribution multi for '0-Kein')
        "output_dist_multi_1": "mse",  # For regression (distribution multi for '1-Gering')
        "output_dist_multi_2": "mse",  # For regression (distribution multi for '2-Vorhanden')
        "output_dist_multi_3": "mse",  # For regression (distribution multi for '3-Stark')
        "output_dist_multi_4": "mse"   # For regression (distribution multi for '4-Extrem')
    },
    metrics={
        "output_bin_maj": "binary_accuracy",
        "output_bin_one": "binary_accuracy",
        "output_bin_all": "binary_accuracy",
        "output_multi_maj": "categorical_accuracy",
        "output_disagree_bin": "binary_accuracy",
        "output_dist_bin_0": "mape",  # Mean Absolute Percentage Error for regression
        "output_dist_bin_1": "mape",
        "output_dist_multi_0": "mape",
        "output_dist_multi_1": "mape",
        "output_dist_multi_2": "mape",
        "output_dist_multi_3": "mape",
        "output_dist_multi_4": "mape"
    }
)

# Model summary
model.summary()

In [14]:
# Split train/test
X_train_padded, X_test_padded = X_padded[:numTrainSamples], X_padded[numTrainSamples:]

# Train the model
history = model.fit([X_train_padded,additional_features], {"output_bin_maj": y_bin_maj,
                                     "output_bin_one": y_bin_one,
                                     "output_bin_all": y_bin_all,
                                     "output_multi_maj": y_multi_maj,
                                     "output_disagree_bin": y_disagree_bin,
                                     "output_dist_bin_0": y_dist_bin_0,
                                     "output_dist_bin_1": y_dist_bin_1,
                                     "output_dist_multi_0": y_dist_multi_0,
                                     "output_dist_multi_1": y_dist_multi_1,
                                     "output_dist_multi_2": y_dist_multi_2,
                                     "output_dist_multi_3": y_dist_multi_3,
                                     "output_dist_multi_4": y_dist_multi_4},
                    batch_size=40, epochs=5, validation_split=0.2, shuffle = True)

# Save the model
model.save('saved_models/development_model_MH.keras')


Epoch 1/5
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 107ms/step - loss: 22.6086 - output_bin_all_binary_accuracy: 0.6979 - output_bin_maj_binary_accuracy: 0.5015 - output_bin_one_binary_accuracy: 0.5546 - output_disagree_bin_binary_accuracy: 0.5304 - output_dist_bin_0_mape: 76960280.0000 - output_dist_bin_1_mape: 176202224.0000 - output_dist_multi_0_mape: 126819776.0000 - output_dist_multi_1_mape: 257621408.0000 - output_dist_multi_2_mape: 165359504.0000 - output_dist_multi_3_mape: 195842592.0000 - output_dist_multi_4_mape: 556492672.0000 - output_multi_maj_categorical_accuracy: 0.3939 - val_loss: 7.3477 - val_output_bin_all_binary_accuracy: 0.7400 - val_output_bin_maj_binary_accuracy: 0.5250 - val_output_bin_one_binary_accuracy: 0.4900 - val_output_disagree_bin_binary_accuracy: 0.7000 - val_output_dist_bin_0_mape: 189004656.0000 - val_output_dist_bin_1_mape: 4480307.0000 - val_output_dist_multi_0_mape: 188841088.0000 - val_output_dist_multi_1_mape: 1527449.2500 - 

# 3. Prediction and Evaluation

In [15]:
# Predict using the trained model
ypred = model.predict([X_test_padded,additional_features_test], batch_size=None, verbose='auto', steps=None, callbacks=None)

# Get the test IDs
test_id = [json_test_data[i]['id'] for i in range(len(json_test_data))]

# Process binary predictions
ypred_bin_maj = [1 if ypred[0][i][0] >= 0.5 else 0 for i in range(len(ypred[0]))]
ypred_bin_one = [1 if ypred[1][i][0] >= 0.5 else 0 for i in range(len(ypred[1]))]
ypred_bin_all = [1 if ypred[2][i][0] >= 0.5 else 0 for i in range(len(ypred[2]))]

# Process multi-class prediction (for output_multi_maj)
categories = ["0-Kein", "1-Gering", "2-Vorhanden", "3-Stark", "4-Extrem"]
ypred_multi_maj_raw = [categories[ll.index(max(ll))] for ll in [list(ypred[0][i]) for i in range(len(ypred[3]))]]# This is the raw output from the model for multi_maj

# Process binary disagree prediction
ypred_disagree_bin = [1 if ypred[4][i][0] >= 0.5 else 0 for i in range(len(ypred[4]))]

# Process predictions for dist_bin
ypred_dist_bin_0 = [ypred[5][i][0] for i in range(len(ypred[5]))]
ypred_dist_bin_1 = [round(ypred[6][i][0],4) for i in range(len(ypred[6]))]
for i in range(len(ypred_dist_bin_0)):
    total = ypred_dist_bin_0[i] + ypred_dist_bin_1[i]
    ypred_dist_bin_0[i] = ypred_dist_bin_0[i] / total
    ypred_dist_bin_1[i] = ypred_dist_bin_1[i] / total

ypred_dist_multi_0 = [ypred[7][i][0] for i in range(len(ypred[7]))]
ypred_dist_multi_1 = [ypred[8][i][0] for i in range(len(ypred[8]))]
ypred_dist_multi_2 = [ypred[9][i][0] for i in range(len(ypred[9]))]
ypred_dist_multi_3 = [ypred[10][i][0] for i in range(len(ypred[10]))]
ypred_dist_multi_4 = [ypred[11][i][0] for i in range(len(ypred[11]))]
for i in range(len(ypred_dist_multi_0)):
    total = ypred_dist_multi_0[i] + ypred_dist_multi_1[i] + ypred_dist_multi_2[i] + ypred_dist_multi_3[i] + ypred_dist_multi_4[i]  
    ypred_dist_multi_0[i] = ypred_dist_multi_0[i] / total
    ypred_dist_multi_1[i] = ypred_dist_multi_1[i] / total
    ypred_dist_multi_2[i] = ypred_dist_multi_2[i] / total
    ypred_dist_multi_3[i] = ypred_dist_multi_3[i] / total
    ypred_dist_multi_4[i] = ypred_dist_multi_4[i] / total
 
# Create output DataFrame for subtask 1 
df_output_subtask1 = pd.DataFrame({
              'id': test_id,
             'bin_maj': ypred_bin_maj,
             'bin_one': ypred_bin_one,
              'bin_all': ypred_bin_all,
              'multi_maj': ypred_multi_maj_raw,
              'disagree_bin': ypred_disagree_bin,
             })
             
# Create output DataFrame for subtask 2
df_output_subtask2 = pd.DataFrame({
                    'id': test_id,
                  'dist_bin_0': ypred_dist_bin_0,
              'dist_bin_1': ypred_dist_bin_1,
              'dist_multi_0': ypred_dist_multi_0,
              'dist_multi_1': ypred_dist_multi_1,
              'dist_multi_2': ypred_dist_multi_2,
              'dist_multi_3': ypred_dist_multi_3,
              'dist_multi_4': ypred_dist_multi_4

})

# Save predictions to TSV files
subtask1_output = df_output_subtask1.to_csv('output/MH_subtask1_output_model.tsv', sep="\t", index = False) 
subtask2_output = df_output_subtask2.to_csv('output/MH_subtask2_output_model.tsv', sep="\t", index = False) 

[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 32ms/step
