In [3]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import set_random_seed


In [4]:
# Read both variants of the corpus and drop every row that contains a missing
# value in any column, binding each frame exactly once.
df_with_emojis = pd.read_csv('../datasets/df_with_emojis.csv').dropna()
df_without_emojis = pd.read_csv('../datasets/df_without_emojis.csv').dropna()

In [5]:
# Both corpora are split with the same options: 20% held out for testing and a
# fixed seed so the partition is the same on every run.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_emojis, X_test_emojis, y_train_emojis, y_test_emojis = train_test_split(
    df_with_emojis['text'],
    df_with_emojis['sentiment'],
    **split_options,
)

X_train_no_emojis, X_test_no_emojis, y_train_no_emojis, y_test_no_emojis = train_test_split(
    df_without_emojis['text'],
    df_without_emojis['sentiment'],
    **split_options,
)

In [6]:
# Each corpus gets its own TF-IDF vectorizer (vocabulary capped at the 5000 most
# frequent terms). Re-fitting a single shared instance would throw away the
# emoji vocabulary, so any later transform of emoji text would produce columns
# that no longer line up with the features the emoji model is trained on.
vectorizer_emojis = TfidfVectorizer(max_features=5000)
X_train_emojis_vec = vectorizer_emojis.fit_transform(X_train_emojis)
# The test split is only transformed, never fitted, to avoid leaking test vocabulary.
X_test_emojis_vec = vectorizer_emojis.transform(X_test_emojis)

vectorizer_no_emojis = TfidfVectorizer(max_features=5000)
X_train_no_emojis_vec = vectorizer_no_emojis.fit_transform(X_train_no_emojis)
X_test_no_emojis_vec = vectorizer_no_emojis.transform(X_test_no_emojis)

# Backward-compatible alias: `vectorizer` previously ended this cell fitted on
# the no-emoji training text, so it keeps pointing at that vectorizer.
vectorizer = vectorizer_no_emojis

print(f"Formato da matriz TF-IDF (treino) - com emojis: {X_train_emojis_vec.shape}")
print(f"Formato da matriz TF-IDF (treino) - sem emojis: {X_train_no_emojis_vec.shape}")

Formato da matriz TF-IDF (treino) - com emojis: (1274444, 5000)
Formato da matriz TF-IDF (treino) - sem emojis: (1274420, 5000)


In [None]:
def plot_confusion_matrix(y_true, y_pred, title, class_labels):
    """Draw the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, one per entry of ``y_true``.
        title: Text shown above the figure.
        class_labels: Labels to tabulate, in the order they should appear on
            both axes.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Pass the labels to confusion_matrix as well, so the row/column order of
    # the matrix is guaranteed to match the tick labels drawn on the heatmap.
    cm = confusion_matrix(y_true, y_pred, labels=class_labels)

    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        cm,
        annot=True,
        fmt='d',  # cells hold integer counts
        cmap='Blues',
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_title(title)
    ax.set_ylabel('Verdadeiro')
    ax.set_xlabel('Previsto')
    plt.show()

In [None]:
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling start from the same state on every run of this cell.
set_random_seed(42)

# Feed-forward classifier over the TF-IDF features: 128 -> dropout -> 64 -> 1.
# The input width is declared with an explicit Input layer; passing
# `input_shape` to Dense is deprecated and makes Keras emit a warning.
mlp_emojis = Sequential([
    Input(shape=(X_train_emojis_vec.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# NOTE(review): a sigmoid unit with binary_crossentropy assumes `sentiment`
# is encoded as 0/1 — confirm against the dataset.
mlp_emojis.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Cast the sparse matrix to float32 *before* densifying: the dense array is half
# the size of the float64 one. Assuming the default Keras floatx (float32), the
# values the network sees are unchanged.
# NOTE(review): the dense training matrix is still very large; a batch
# generator over the sparse rows would avoid materialising it at all.
history_emojis = mlp_emojis.fit(
    X_train_emojis_vec.astype(np.float32).toarray(), y_train_emojis,
    epochs=10, batch_size=32,
    validation_split=0.2, verbose=1)
# Alias kept for any cell that still reads the generic `history` name.
history = history_emojis

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_emojis = (
    mlp_emojis.predict(X_test_emojis_vec.astype(np.float32).toarray()) > 0.5
).astype(int)

print("Multilayer Perceptron - Com emojis:")
print(classification_report(y_test_emojis, y_pred_emojis))

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Tick labels are the sorted distinct classes present in the held-out targets.
class_labels = np.unique(y_test_emojis)

plot_confusion_matrix(
    y_true=y_test_emojis,
    y_pred=y_pred_emojis,
    title="Matriz de Confusão - MLP com emojis ",
    class_labels=class_labels,
)

In [None]:
# Seed Python, NumPy and the Keras backend so weight initialisation, dropout
# and batch shuffling start from the same state on every run of this cell.
set_random_seed(42)

# Feed-forward classifier over the TF-IDF features: 128 -> dropout -> 64 -> 1.
# The input width is declared with an explicit Input layer; passing
# `input_shape` to Dense is deprecated and makes Keras emit a warning.
mlp_no_emojis = Sequential([
    Input(shape=(X_train_no_emojis_vec.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid'),
])

# NOTE(review): a sigmoid unit with binary_crossentropy assumes `sentiment`
# is encoded as 0/1 — confirm against the dataset.
mlp_no_emojis.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Cast the sparse matrix to float32 *before* densifying: the dense array is half
# the size of the float64 one. Assuming the default Keras floatx (float32), the
# values the network sees are unchanged.
# NOTE(review): the dense training matrix is still very large; a batch
# generator over the sparse rows would avoid materialising it at all.
history_no_emojis = mlp_no_emojis.fit(
    X_train_no_emojis_vec.astype(np.float32).toarray(), y_train_no_emojis,
    epochs=10, batch_size=32,
    validation_split=0.2, verbose=1)
# Alias kept for any cell that still reads the generic `history` name; the
# dedicated name above keeps this training curve distinct from other models'.
history = history_no_emojis

# Threshold the predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_no_emojis = (
    mlp_no_emojis.predict(X_test_no_emojis_vec.astype(np.float32).toarray()) > 0.5
).astype(int)

print("Multilayer Perceptron - Sem emojis:")
print(classification_report(y_test_no_emojis, y_pred_no_emojis))

In [None]:
# This figure is titled "sem emojis", so it must be built from the targets and
# predictions of the model trained on the emoji-free text, not the emoji ones.
class_labels = np.unique(y_test_no_emojis)

plot_confusion_matrix(y_test_no_emojis, y_pred_no_emojis, "Matriz de Confusão - MLP sem emojis ", class_labels)