In [None]:
# =========================================================
# 0. Imports e Configuração
# =========================================================
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datasets import load_dataset
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Global seed: reused below as `random_state` and applied to NumPy's legacy RNG
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Make sure the chart output folder exists (no error if it is already there)
os.makedirs("graficos", exist_ok=True)

# =========================================================
# 1. Dataset
# =========================================================
# Load the "train" split of "sms_spam" into a DataFrame and expose the "sms"
# column as "texto"; the "label" column keeps its name.
dataset = load_dataset("sms_spam", split="train")
df = pd.DataFrame(dataset).rename(columns={"sms": "texto"})

# =========================================================
# 2. Train/test split
# =========================================================
# Hold out 20% of the rows for testing, stratified on the label column and
# seeded with RANDOM_STATE.
X_train, X_test, y_train, y_test = train_test_split(
    df["texto"],
    df["label"],
    stratify=df["label"],
    random_state=RANDOM_STATE,
    test_size=0.2,
)

# =========================================================
# 3. BOW and TF-IDF vectorization
# =========================================================
# Each vectorizer is fitted on the training texts only; the test texts are
# transformed with the already-fitted vectorizer.

# Bag-of-words (raw term counts)
bow = CountVectorizer()
X_train_bow = bow.fit_transform(X_train)
X_test_bow = bow.transform(X_test)

# TF-IDF weights
tfidf = TfidfVectorizer()
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# =========================================================
# 4. Model training
# =========================================================
# Logistic regression fitted on the TF-IDF training features, then evaluated
# on the TF-IDF test features.
model = LogisticRegression(random_state=RANDOM_STATE, max_iter=500)
model.fit(X_train_tfidf, y_train)
y_pred = model.predict(X_test_tfidf)

report = classification_report(y_test, y_pred)
print("[CLASSIFICAÇÃO TABULAR]")
print(report)

# =========================================================
# 5. BOW and TF-IDF charts (deterministic)
# =========================================================

def top_k_indices(arr, k=20):
    """Return the indices of the ``k`` largest values of ``arr``, largest first.

    The ordering is fully deterministic: a stable sort is used and ties are
    broken by position, so among equal values the lower index comes first.

    Args:
        arr: 1-D array-like of sortable values.
        k: Maximum number of indices to return. If ``k`` exceeds the number
            of elements, all indices are returned.

    Returns:
        A 1-D NumPy integer array with at most ``k`` indices into ``arr``.

    Raises:
        ValueError: If ``k`` is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    values = np.asarray(arr)
    # Stable ascending sort of the *reversed* array: equal values end up
    # ordered by descending original position.
    ascending_reversed = np.argsort(values[::-1], kind="stable")
    # Flipping that order yields descending values with ties by ascending
    # original position; then map reversed positions back to original ones.
    # Doing it this way avoids negating the values, which would wrap around
    # for unsigned integer dtypes.
    descending = values.size - 1 - ascending_reversed[::-1]
    return descending[:k]

# BOW: column sums of the training matrix, one total per vocabulary term
bow_sum = np.asarray(X_train_bow.sum(axis=0)).ravel()
indices = top_k_indices(bow_sum, 20)
words = np.asarray(bow.get_feature_names_out())[indices]
freqs = bow_sum[indices]

# Horizontal bar chart of the 20 selected terms, saved to disk and closed
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(words, freqs)
ax.invert_yaxis()  # first (largest) entry at the top
ax.set_title("Top 20 palavras - BOW (determinístico)")
fig.tight_layout()
fig.savefig("graficos/bow_top20.png", dpi=300)
plt.close(fig)

# TF-IDF: column sums of the training matrix, one total per vocabulary term
tfidf_sum = np.asarray(X_train_tfidf.sum(axis=0)).ravel()
indices = top_k_indices(tfidf_sum, 20)
words = np.asarray(tfidf.get_feature_names_out())[indices]
scores = tfidf_sum[indices]

# Horizontal bar chart of the 20 selected terms, saved to disk and closed
fig, ax = plt.subplots(figsize=(8, 6))
ax.barh(words, scores)
ax.invert_yaxis()  # first (largest) entry at the top
ax.set_title("Top 20 palavras - TF-IDF (determinístico)")
fig.tight_layout()
fig.savefig("graficos/tfidf_top20.png", dpi=300)
plt.close(fig)

print("Gráficos BOW e TF-IDF salvos em /graficos")