In [2]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split, evaluate_keras

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow import keras
from nltk.tokenize import TweetTokenizer

In [3]:
data = get_data("../../data/authors.csv")

In [4]:
tt = TweetTokenizer()

vectorizers = [CountVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize), 
                CountVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize), 
                CountVectorizer(ngram_range=(1,5), analyzer="char"),
                CountVectorizer(ngram_range=(4,5), analyzer="char"),
                CountVectorizer(ngram_range=(3,8), analyzer="char"),
                TfidfVectorizer(ngram_range=(1,1), analyzer="word", tokenizer=tt.tokenize), 
                TfidfVectorizer(ngram_range=(1,3), analyzer="word", tokenizer=tt.tokenize), 
                TfidfVectorizer(ngram_range=(1,5), analyzer="char"),
                TfidfVectorizer(ngram_range=(4,5), analyzer="char"),
                TfidfVectorizer(ngram_range=(3,8), analyzer="char")]

In [5]:
def build_model(input_shape, output_shape):  
    model = keras.models.Sequential(name="NeuralNetwork")
    model.add(keras.layers.Input(shape=input_shape, name="Input"))
    model.add(keras.layers.Dense(300, activation="relu", name="Dense1"))
    model.add(keras.layers.Dense(300, activation="relu", name="Dense2"))
    model.add(keras.layers.Dense(300, activation="relu", name="Dense3"))
    model.add(keras.layers.Dense(output_shape, activation="softmax", name="Output"))
    return model

In [7]:
evaluation = list()
usernames = list(np.unique(data["username"]))
results = list()

for i, vectorizer in enumerate(vectorizers):
    vectorizer_str = vectorizer.__str__()
    print(f"Running => {vectorizer_str}")

    for i in range(len(usernames)):
        author1 = usernames.pop()

        for author2 in usernames:
            X_train, X_test, y_train, y_test = temporal_train_test_split(
                data, author1, author2)

            scaler = MinMaxScaler()
            X_train = vectorizer.fit_transform(X_train["comment"]).toarray()
            X_test = vectorizer.transform(X_test["comment"]).toarray()
            X_train = scaler.fit_transform(X_train)
            X_test = scaler.transform(X_test)

            y_classes = pd.get_dummies(y_train).columns
            y_train = pd.get_dummies(y_train).values
            y_test = pd.get_dummies(y_test).values
            
            input_shape_text = X_train.shape[1]
            output_shape = y_train.shape[1]

            callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30, restore_best_weights=True)

            model = build_model(input_shape_text, output_shape)
            model.compile(loss = "categorical_crossentropy",
                        optimizer = keras.optimizers.SGD(learning_rate=0.01),
                        metrics = ["accuracy"])
            history = model.fit(X_train, y_train, epochs=1000, callbacks=[callback], validation_split=0.1, shuffle=True, verbose=True)


            y_pred_proba = model.predict(X_test)
            evaluation.append(evaluate_keras(y_test.argmax(1), y_pred_proba, *y_classes))
            metrics = pd.DataFrame(evaluation)[["f1_macro", "recall_macro", "precision_macro", "accuracy", "auc_score"]].mean()
            metrics["vectorizer"] = [vectorizer_str for i in range(len(metrics))]
            print("==================")
            print("F1:", metrics["f1_macro"])
            print("Acc:", metrics["accuracy"])
            print("AUC:", metrics["auc_score"])
            print("==================")
        break
    break
    results.append(metrics)

Running => CountVectorizer(tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x7f187507c0d0>>)
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/100

In [None]:
results

In [None]:
metrics_df = pd.DataFrame([results[i][1] for i in range(len(results))])
metrics_df["vectorizer"] = [results[i][0] for i in range(len(results))]
metrics_df.to_csv("../../results/neural_network.csv")
metrics_df