In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split, evaluate_keras

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow import keras
from nltk.tokenize import TweetTokenizer

In [2]:
# Read the author table through the project helper, then discard the
# "Unnamed: 0" and "comment" columns before any modelling.
# NOTE(review): "Unnamed: 0" looks like a leftover CSV index column — confirm.
data = (
    get_data("../../data/authors_bert.csv", select_authors=False)
    .drop(columns=["Unnamed: 0", "comment"])
)

In [3]:
def build_model(input_shape, output_shape):
    """Build the (uncompiled) feed-forward classifier.

    The network is a ``Sequential`` stack: an input layer, three 30-unit
    ReLU dense layers, and a softmax output layer.

    Args:
        input_shape: Per-sample feature shape. Either a tuple/list such as
            ``(n_features,)`` or a bare integer feature count. A bare
            integer is wrapped into a one-element tuple because newer Keras
            releases only accept iterable shapes.
        output_shape: Number of units in the softmax output layer.

    Returns:
        The uncompiled ``keras.models.Sequential`` model named
        "NeuralNetwork".
    """
    # Callers pass `X_train.shape[1]` (an int); normalise it to a shape tuple.
    if isinstance(input_shape, (int, np.integer)):
        input_shape = (int(input_shape),)

    model = keras.models.Sequential(name="NeuralNetwork")
    model.add(keras.layers.Input(shape=input_shape, name="Input"))
    model.add(keras.layers.Dense(30, activation="relu", name="Dense1"))
    model.add(keras.layers.Dense(30, activation="relu", name="Dense2"))
    model.add(keras.layers.Dense(30, activation="relu", name="Dense3"))
    model.add(keras.layers.Dense(output_shape, activation="softmax", name="Output"))
    return model

In [4]:
# Pairwise experiment: for every unordered pair of authors, train a fresh
# network on that pair and score it on the held-out split; finally average
# each metric over all pairs.
SEED = 42
keras.utils.set_random_seed(SEED)  # seed Python, NumPy and TensorFlow RNGs for reproducible runs

METRIC_COLUMNS = ["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]

usernames = list(np.unique(data["username"]))
results = list()
metrics = list()
evaluation = list()

# Walk the author list from the back so pairs are visited in the same order
# as popping from the end would give, without emptying `usernames`.
for idx in range(len(usernames) - 1, 0, -1):
    author1 = usernames[idx]

    for author2 in usernames[:idx]:
        # NOTE(review): presumably a chronological split restricted to the two
        # authors — confirm against libs/utils.
        X_train, X_test, y_train, y_test = temporal_train_test_split(
            data, author1, author2)

        # One-hot encode the labels as floats. The test labels are aligned to
        # the training columns so that column k denotes the same class in both
        # splits even if a class is missing from the test split.
        y_train_onehot = pd.get_dummies(y_train, dtype=np.float32)
        y_classes = y_train_onehot.columns
        y_test_onehot = pd.get_dummies(y_test, dtype=np.float32).reindex(
            columns=y_classes, fill_value=0.0)
        y_train = y_train_onehot.values
        y_test = y_test_onehot.values

        input_shape_text = X_train.shape[1]
        output_shape = y_train.shape[1]

        # Many models are built in this loop; drop the previous model's global
        # Keras state so memory use does not grow with the number of pairs.
        keras.backend.clear_session()

        # Stop once validation loss has not improved for 30 epochs and roll
        # back to the best weights seen.
        callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

        model = build_model(input_shape_text, output_shape)
        model.compile(loss = "categorical_crossentropy",
                    optimizer = keras.optimizers.SGD(learning_rate=0.01),
                    metrics = ["accuracy"])
        history = model.fit(X_train, y_train, epochs=1000, callbacks=[callback], validation_split=0.1, shuffle=True, verbose=False)

        y_pred_proba = model.predict(X_test)
        evaluation.append(evaluate_keras(y_test.argmax(1), y_pred_proba, *y_classes))

# Average every metric over all author pairs once, after the loops, instead
# of rebuilding the full evaluation frame on each iteration. With fewer than
# two authors nothing was evaluated and `metrics` stays an empty list.
if evaluation:
    metrics = pd.DataFrame(evaluation)[METRIC_COLUMNS].mean()
results.append(metrics)

2022-06-04 21:05:48.377188: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-06-04 21:05:48.410354: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-06-04 21:05:48.410369: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-06-04 21:05:48.410811: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN



In [6]:
# Turn the collected score records into a table, tag every row with the
# classifier label, write the table to disk, and display it.
metrics_df = pd.DataFrame(results).assign(clf="Neural Network with BERT")
metrics_df.to_csv("../../results/neural_network_bert.csv")
metrics_df

Unnamed: 0,f1_macro,recall_macro,precision_macro,accuracy,auc_score,clf
0,0.87017,0.870333,0.871621,0.870609,0.936238,Neural Network with BERT
