In [1]:
import sqlite3
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_validate, train_test_split
from tensorflow import keras

In [2]:
conn = sqlite3.connect('Reddit.db')

brasildob = pd.read_sql_query("""
SELECT Author.username, 
	   Comment.body,
	   Subreddit.name AS subreddit_name, 
	   Comment.created_utc
FROM Comment
INNER JOIN Author ON Author.id = Comment.author_id
INNER JOIN Subreddit ON Comment.subreddit_id = Subreddit.id
WHERE Author.username = 'squiercg'
""", conn)

brasil = pd.read_sql_query("""
SELECT Author.username, 
	   Comment.body,
	   Subreddit.name AS subreddit_name, 
	   Comment.created_utc
FROM Comment
INNER JOIN Author ON Author.id = Comment.author_id
INNER JOIN Subreddit ON Comment.subreddit_id = Subreddit.id
WHERE Author.username = 'AlehCemy'
""", conn)

brasilivre = pd.read_sql_query("""
SELECT Author.username, 
	   Comment.body,
	   Subreddit.name AS subreddit_name, 
	   Comment.created_utc
FROM Comment
INNER JOIN Author ON Author.id = Comment.author_id
INNER JOIN Subreddit ON Comment.subreddit_id = Subreddit.id
WHERE Author.username = 'drfritz2'
""", conn)

In [3]:
test_brasilivre = brasilivre[:200]
test_brasildob = brasildob[:200]
train_brasilivre = brasilivre[200:]
train_brasildob = brasildob[200:]

In [4]:
train = pd.concat([train_brasildob, train_brasilivre], ignore_index=True, sort=False)
test = pd.concat([test_brasilivre, test_brasildob], ignore_index=True, sort=False)

In [5]:
import spacy
from spacy.lang.pt.examples import sentences 

nlp = spacy.load("pt_core_news_lg")

In [6]:
pos_texts_train = [nlp(text) for text in train.body]
pos_texts_test = [nlp(text) for text in test.body]

In [7]:
pos_train = np.empty(len(pos_texts_train), dtype='object')
for i in range(len(pos_texts_train)):
    pos_train[i] = " ".join([token.pos_ for token in pos_texts_train[i]])

pos_test = np.empty(len(pos_texts_test), dtype='object')
for i in range(len(pos_texts_test)):
    pos_test[i] = " ".join([token.pos_ for token in pos_texts_test[i]])

In [8]:
text_tfidf = TfidfVectorizer().fit(train.body)
pos_tfidf = TfidfVectorizer().fit(pos_train)
X_text_train = text_tfidf.transform(train.body)
X_pos_train = pos_tfidf.transform(pos_train)
y_train_full = pd.get_dummies(train.username).values
y_classes = pd.get_dummies(train.username).columns

X_text_test = text_tfidf.transform(test.body)
X_pos_test = pos_tfidf.transform(pos_test)
y_test = pd.get_dummies(test.username).values


In [9]:
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_text_train, y_train_full, test_size=0.25, random_state=42)

X_train_pos, X_val_pos, y_train, y_val = train_test_split(
    X_pos_train, y_train_full, test_size=0.25, random_state=42)

In [10]:
input_shape_pos = X_train_pos.shape[1]
input_shape_text = X_train_text.shape[1]
output_shape = y_train_full.shape[1]

In [11]:
def build_model(input_shape_pos, input_shape_text, output_shape):  
    pos_input = keras.layers.Input(shape=input_shape_pos, name="Input_POS")
    text_input = keras.layers.Input(shape=input_shape_text, name="Input_Text")

    pos_dense1 = keras.layers.Dense(30, activation="relu", name="Dense1_POS")(pos_input)
    text_dense1 = keras.layers.Dense(30, activation="relu", name="Dense1_Text")(text_input)
    pos_dense2 = keras.layers.Dense(30, activation="relu", name="Dense2_POS")(pos_dense1)
    text_dense2 = keras.layers.Dense(30, activation="relu", name="Dense2_Text")(text_dense1)
    pos_dense3 = keras.layers.Dense(30, activation="relu", name="Dense3_POS")(pos_dense2)
    text_dense3 = keras.layers.Dense(30, activation="relu", name="Dense3_Text")(text_dense2)

    concat = keras.layers.concatenate([pos_dense3, text_dense3])
    output = keras.layers.Dense(output_shape, activation="softmax", name="Output")(concat)
    
    model = keras.Model(inputs=[pos_input, text_input], outputs=[output])
    #print(model.summary())
    return model

In [12]:
callback = keras.callbacks.EarlyStopping(monitor='val_loss', patience=30)

model = build_model(input_shape_pos, input_shape_text, output_shape)
model.compile(loss = "categorical_crossentropy",
             optimizer = keras.optimizers.SGD(learning_rate=0.01),
             metrics = ["accuracy"])
history = model.fit((X_train_pos.toarray(), X_train_text.toarray()), y_train, epochs=1000, validation_data=((X_val_pos.toarray(), X_val_text.toarray()), y_val), callbacks=[callback], shuffle=True)

Epoch 1/1000


2022-02-23 19:39:03.521629: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:936] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-02-23 19:39:03.594440: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-02-23 19:39:03.594466: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-02-23 19:39:03.595011: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN

Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Epoch 70/1000
Epoch 71/1000
Epoch 72/1000
Epoch 73/1000


In [13]:
model.evaluate((X_pos_test.toarray(), X_text_test.toarray()), y_test)



[0.33289310336112976, 0.875]