In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier

In [2]:
# Read each corpus variant and discard rows with any missing value in a single
# expression, so re-running this cell always rebuilds the frames from disk.
df_with_emojis = pd.read_csv('../datasets/df_with_emojis.csv').dropna()
df_without_emojis = pd.read_csv('../datasets/df_without_emojis.csv').dropna()

In [3]:
# Both corpora are split with the same settings: 20% held out for testing and a
# fixed seed so the partition is reproducible across runs.
split_options = {'test_size': 0.2, 'random_state': 42}

X_train_emojis, X_test_emojis, y_train_emojis, y_test_emojis = train_test_split(
    df_with_emojis['text'],
    df_with_emojis['sentiment'],
    **split_options,
)

X_train_no_emojis, X_test_no_emojis, y_train_no_emojis, y_test_no_emojis = train_test_split(
    df_without_emojis['text'],
    df_without_emojis['sentiment'],
    **split_options,
)

In [None]:
# One vectorizer per corpus. Previously a single TfidfVectorizer was fitted on
# the emoji corpus and then re-fitted on the no-emoji corpus, which discarded
# the vocabulary/IDF weights that X_*_emojis_vec were built with. Keeping two
# fitted objects means each feature matrix (and the model trained on it) still
# has a matching vectorizer for later transforms or feature-name lookups.
#
# NOTE(review): with the default token_pattern (r"(?u)\b\w\w+\b") only runs of
# two or more word characters become tokens, so literal emoji characters are
# presumably dropped during tokenization. If the "with emojis" texts contain
# raw emoji (rather than textual aliases), confirm they actually reach the
# feature matrix before interpreting the comparison.
vectorizer_emojis = TfidfVectorizer(max_features=5000)
X_train_emojis_vec = vectorizer_emojis.fit_transform(X_train_emojis)  # learn vocabulary on train only
X_test_emojis_vec = vectorizer_emojis.transform(X_test_emojis)        # reuse the train vocabulary

vectorizer_no_emojis = TfidfVectorizer(max_features=5000)
X_train_no_emojis_vec = vectorizer_no_emojis.fit_transform(X_train_no_emojis)
X_test_no_emojis_vec = vectorizer_no_emojis.transform(X_test_no_emojis)

# Backward-compatible alias: the original cell left `vectorizer` fitted on the
# no-emoji training texts, so keep that binding for any code that still uses it.
vectorizer = vectorizer_no_emojis

print(f"Formato da matriz TF-IDF (treino) - com emojis: {X_train_emojis_vec.shape}")
print(f"Formato da matriz TF-IDF (treino) - sem emojis: {X_train_no_emojis_vec.shape}")

Formato da matriz TF-IDF (treino) - com emojis: (1274444, 5000)
Formato da matriz TF-IDF (treino) - sem emojis: (1274420, 5000)


In [None]:
def plot_confusion_matrix(y_true, y_pred, title, class_labels):
    """Draw an annotated confusion-matrix heatmap for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels, aligned element-wise with ``y_true``.
    title : str
        Title shown above the heatmap.
    class_labels : array-like
        Label values as they occur in ``y_true`` / ``y_pred``, in the order
        they should appear on both axes.

    Notes
    -----
    ``class_labels`` is passed to ``confusion_matrix(labels=...)`` so that
    row/column ``i`` of the matrix always corresponds to tick ``i``. Without
    it the matrix is ordered by the sorted union of the labels found in
    ``y_true`` and ``y_pred``, which can differ in size or order from the tick
    labels and silently mislabel the plot. Samples whose true or predicted
    label is not listed in ``class_labels`` are not counted; scikit-learn
    raises ``ValueError`` if none of the listed labels occurs in ``y_true``.
    """
    labels = list(class_labels)
    cm = confusion_matrix(y_true, y_pred, labels=labels)

    # Explicit figure/axes so the title and axis names are attached to the
    # heatmap axes regardless of which axes pyplot considers "current".
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=labels, yticklabels=labels, ax=ax)
    ax.set_title(title)
    ax.set_ylabel('Verdadeiro')  # rows of the matrix are the true classes
    ax.set_xlabel('Previsto')    # columns are the predicted classes
    plt.show()

In [None]:
# Train a 100-tree random forest on the TF-IDF features of the emoji corpus.
# The seed is fixed for reproducibility and n_jobs=-1 lets scikit-learn
# parallelise the work; `fit` returns the estimator, so it can be chained.
rf_emojis = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_emojis_vec, y_train_emojis)

# Score the held-out split and print the per-class metrics.
y_pred_emojis = rf_emojis.predict(X_test_emojis_vec)

print("Random Forest - com emojis:")
print(classification_report(y_true=y_test_emojis, y_pred=y_pred_emojis))

In [None]:
# Axis ticks: the sorted distinct sentiment values found in the held-out labels.
class_labels = np.unique(y_test_emojis)

plot_confusion_matrix(
    y_true=y_test_emojis,
    y_pred=y_pred_emojis,
    title="Matriz de Confusão - Random Forest com emojis ",
    class_labels=class_labels,
)

In [None]:
# Same random-forest configuration as the emoji run (100 trees, fixed seed,
# parallel execution), trained on the TF-IDF features of the no-emoji corpus.
rf_no_emojis = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
).fit(X_train_no_emojis_vec, y_train_no_emojis)

# Score the held-out split and print the per-class metrics.
y_pred_no_emojis = rf_no_emojis.predict(X_test_no_emojis_vec)

print("Random Forest - sem emojis:")
print(classification_report(y_true=y_test_no_emojis, y_pred=y_pred_no_emojis))

In [None]:
# Axis ticks: the sorted distinct sentiment values found in the held-out labels
# of the no-emoji split (rebinds `class_labels` for this dataset).
class_labels = np.unique(y_test_no_emojis)

plot_confusion_matrix(
    y_true=y_test_no_emojis,
    y_pred=y_pred_no_emojis,
    title="Matriz de Confusão - Random Forest sem emojis ",
    class_labels=class_labels,
)