In [None]:
!pip install transformers scikit-learn

In [None]:
import pandas as pd
import numpy as np
import torch
import tensorflow as tf
import re
import os
import pickle
from transformers import DistilBertTokenizer, DistilBertModel
from sklearn.model_selection import train_test_split
from sklearn.utils import class_weight
from sklearn.metrics import classification_report, f1_score
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Hyperparameters gathered in a single place.
EMBEDDING_DIM = 768  # size of every token embedding vector
MAX_LEN = 512        # number of token positions kept per document
BATCH_SIZE = 32      # documents processed per embedding-extraction batch

In [None]:
# Project artifacts (all paths are relative to the working directory).
EMBEDDINGS_FILE = "bert_embeddings.dat"        # memmap file holding the token embeddings
MODEL_CONFIG_FILE = "model_metadata.pkl"       # pickled metadata
TOKENIZER_PATH = "distilbert_tokenizer"        # tokenizer location
CLASSIFIER_MODEL = "bilstm_classifier.keras"   # classifier model file

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

In [None]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Drive folder that contains the project's CSV files.
BASE_PATH = "/content/drive/MyDrive/NLP Ligia | Luiz Miguel Gonzaga | FAKE NEWS Detection"

train_path = f"{BASE_PATH}/train.csv"
test_path = f"{BASE_PATH}/test.csv"

# Load the training set into the main working frame.
df = pd.read_csv(train_path)

### **EDA**

In [None]:
# Relationship between the news subject and the label: each row of the
# crosstab is normalised, so a cell is the share of that label within the subject.
subject_vs_label = pd.crosstab(df["subject"], df["label"], normalize="index")

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(subject_vs_label, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Correlação Subject vs Fake")
plt.show()

# Working copy without the subject, date and id columns.
df_model = df.drop(columns=["subject", "date", "id"])

In [None]:
# Class balance: bar chart of the label counts, then the relative frequencies.
fig, ax = plt.subplots(figsize=(6, 4))
sns.countplot(data=df_model, x="label", ax=ax)
ax.set_title("Distribuição das Classes")
plt.show()

label_shares = df_model["label"].value_counts(normalize=True)
print(label_shares)

In [None]:
# Word counts of title and body, followed by their summary statistics.
# `.str.len()` is used instead of `.apply(len)` so that a missing title/text
# yields NaN (ignored by describe) instead of raising TypeError on a float.
df_model["title_len"] = df_model["title"].str.split().str.len()
df_model["text_len"] = df_model["text"].str.split().str.len()

print(df_model[["title_len","text_len"]].describe(percentiles=[0.5,0.75,0.9,0.95,0.99]))

In [None]:
# Preview the first rows of the training frame.
df.head()

In [None]:
# Frequency of every whitespace-separated word across all non-missing texts,
# printing the 40 most common ones. Words are streamed into the Counter one
# document at a time, which avoids materialising the whole corpus as a single
# string plus a full word list in memory; the resulting counts are the same.
from collections import Counter

contagem = Counter(
    palavra
    for texto in df["text"].dropna()
    for palavra in texto.split()
)
print(contagem.most_common(40))

### **Pré-processamento**

In [None]:
# Discard the subject and date columns; errors="ignore" tolerates columns
# that are already absent, so re-running this cell does not fail.
df = df.drop(["subject", "date"], axis=1, errors="ignore")

def clean_text_light(text):
    """Lightly clean a news article before tokenisation.

    Removes URLs, HTML-like tags and the "(Reuters) -" agency marker, then
    collapses every run of whitespace into a single space.

    Args:
        text: Raw text. Any non-string value (e.g. NaN or None) is treated
            as missing.

    Returns:
        The cleaned string, or "" when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+|www\S+|https\S+", "", text)
    text = re.sub(r"<.*?>", "", text)
    # The text is not lower-cased before this point, so the match must be
    # case-insensitive; the lowercase-only pattern never removed "(Reuters)".
    text = re.sub(r"\(?reuters\)?\s*-?", "", text, flags=re.IGNORECASE)
    text = re.sub(r"\s+", " ", text).strip()
    return text

# Model input = title + body, cleaned of links, the agency marker and repeated spaces.
# Missing values are replaced by "" first: otherwise a NaN in either column
# turns the whole concatenation into NaN and the sample is reduced to "".
# This matches the handling applied to the test set further down.
df["text_light"] = (df["title"].fillna("") + " " + df["text"].fillna("")).apply(clean_text_light)

X_text = df["text_light"].values
y = df["label"].values
N_SAMPLES = len(X_text)

### **Extraindo Embeddings: BERT e MEMMAP**

In [None]:
# Load the pre-trained DistilBERT tokenizer and encoder.
pretrained_name = "distilbert-base-uncased"
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_name)
bert_model = DistilBertModel.from_pretrained(pretrained_name)
bert_model = bert_model.to(device)
bert_model.eval()  # evaluation mode (disables dropout)

In [None]:
# Delete the embeddings file left over from a previous run, if there is one.
stale_embeddings_present = os.path.exists(EMBEDDINGS_FILE)
if stale_embeddings_present:
    os.unlink(EMBEDDINGS_FILE)

In [None]:
# Reserve the on-disk array: one (MAX_LEN, EMBEDDING_DIM) float32 matrix per sample.
embeddings_shape = (N_SAMPLES, MAX_LEN, EMBEDDING_DIM)
X_memmap = np.memmap(EMBEDDINGS_FILE, mode="w+", dtype=np.float32, shape=embeddings_shape)

In [None]:
def smart_truncate(text, tokenizer, max_len=512, tail_len=128):
    """Encode ``text`` into exactly ``max_len`` token ids, keeping head and tail.

    Texts that fit are wrapped in [CLS]/[SEP] and right-padded. Longer texts
    keep their last ``tail_len`` tokens and fill the remaining room with the
    first tokens. With the defaults this is 382 head + 128 tail tokens.

    Args:
        text: Text to encode.
        tokenizer: Object exposing ``encode(text, add_special_tokens=False)``
            plus ``cls_token_id``, ``sep_token_id`` and ``pad_token_id``.
        max_len: Length of the returned sequence, special tokens included.
        tail_len: Tokens kept from the end of an over-long text. It is capped
            at half of the available room so the head is never squeezed out
            when ``max_len`` is small.

    Returns:
        A list of ``max_len`` token ids.

    Raises:
        ValueError: If ``max_len`` cannot hold the two special tokens.
    """
    limit = max_len - 2  # room left once [CLS] and [SEP] are accounted for
    if limit < 0:
        raise ValueError("max_len must be at least 2 to fit [CLS] and [SEP]")

    # Tokenise once; the original short path encoded the text a second time.
    tokens = tokenizer.encode(text, add_special_tokens=False)
    if len(tokens) > limit:
        tail = max(0, min(tail_len, limit // 2))
        head = limit - tail
        # Explicit start index: tokens[-0:] would return the whole list.
        tokens = tokens[:head] + tokens[len(tokens) - tail:]

    ids = [tokenizer.cls_token_id] + tokens + [tokenizer.sep_token_id]
    return ids + [tokenizer.pad_token_id] * (max_len - len(ids))

In [None]:
pad_id = tokenizer.pad_token_id

for start in tqdm(range(0, N_SAMPLES, BATCH_SIZE)):
    batch_texts = X_text[start : start + BATCH_SIZE]

    # Tokenise every document and right-pad it to MAX_LEN ids.
    input_ids = []
    for text in batch_texts:
        ids = smart_truncate(text, tokenizer, MAX_LEN)
        input_ids.append(ids + [pad_id] * (MAX_LEN - len(ids)))

    input_ids_pt = torch.tensor(input_ids).to(device)
    # Attention mask: 1 for every id that is not the pad id, 0 otherwise.
    attention_masks_pt = (input_ids_pt != pad_id).long()

    with torch.no_grad():
        batch_embeddings = bert_model(
            input_ids_pt, attention_mask=attention_masks_pt
        ).last_hidden_state

        # Zero the vectors that sit on padding positions.
        keep = attention_masks_pt.unsqueeze(-1).to(batch_embeddings.dtype)
        batch_embeddings = batch_embeddings * keep

    # Move the batch off the device and store it in the on-disk array.
    X_memmap[start : start + len(batch_texts)] = batch_embeddings.cpu().numpy()

    # Push the pending writes to the file after every batch.
    X_memmap.flush()

# Release the encoder and the cached GPU memory.
del bert_model
torch.cuda.empty_cache()

### **Gerador e Split**

In [None]:
class MemmapGenerator(tf.keras.utils.Sequence):
    """Keras ``Sequence`` that serves batches of pre-computed embeddings.

    Args:
        memmap_data: Array indexed along axis 0 by sample position.
        indices: Sample positions served by this generator, in the order in
            which they are yielded.
        labels: Labels indexed by the same sample positions as ``memmap_data``.
        batch_size: Number of samples per batch (the last batch may be smaller).
        **kwargs: Forwarded to the Keras base class.
    """

    def __init__(self, memmap_data, indices, labels, batch_size=16, **kwargs):
        # The Keras base class expects its initialiser to run; recent Keras
        # versions warn when a subclass skips this call.
        super().__init__(**kwargs)
        self.memmap_data = memmap_data
        self.indices = indices
        self.labels = labels
        self.batch_size = batch_size

    def __len__(self):
        """Return the number of batches, counting a final partial batch."""
        return int(np.ceil(len(self.indices) / self.batch_size))

    def __getitem__(self, index):
        """Return the ``(X_batch, y_batch)`` pair for batch number ``index``."""
        batch_indices = self.indices[index*self.batch_size:(index+1)*self.batch_size]
        X_batch = self.memmap_data[batch_indices]
        y_batch = self.labels[batch_indices]
        return X_batch, y_batch


indices = np.arange(N_SAMPLES)

In [None]:
# Stratified 80/20 split performed on sample positions rather than on the data.
train_idx, val_idx, y_train_split, y_val_split = train_test_split(
    indices,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# "balanced" class weights computed from the training labels only.
train_classes = np.unique(y_train_split)
weights = class_weight.compute_class_weight(
    "balanced", classes=train_classes, y=y_train_split
)
class_weights = dict(zip(train_classes, weights))

# Re-open the embeddings file in read-only mode.
X_memmap_read = np.memmap(
    EMBEDDINGS_FILE,
    mode="r",
    dtype="float32",
    shape=(N_SAMPLES, MAX_LEN, EMBEDDING_DIM),
)

# Both generators receive the full label array and select rows by position.
train_gen, val_gen = (
    MemmapGenerator(X_memmap_read, split_idx, y, batch_size=16)
    for split_idx in (train_idx, val_idx)
)

### **Treinamento**

In [None]:
def build_lstm_classifier():
    """Build and compile the BiLSTM classifier that consumes token embeddings.

    The input has shape (MAX_LEN, EMBEDDING_DIM). A Masking layer with
    mask_value 0.0 precedes a bidirectional LSTM, a ReLU dense layer,
    dropout and a single sigmoid output unit.

    Returns:
        A compiled ``tf.keras.Model`` (Adam at 1e-3, binary cross-entropy,
        accuracy metric).
    """
    layers = tf.keras.layers

    inputs = tf.keras.Input(shape=(MAX_LEN, EMBEDDING_DIM))
    hidden = inputs
    for layer in (
        layers.Masking(mask_value=0.0),
        layers.Bidirectional(layers.LSTM(128)),
        layers.Dense(128, activation="relu"),
        layers.Dropout(0.2),
    ):
        hidden = layer(hidden)
    outputs = layers.Dense(1, activation="sigmoid")(hidden)

    classifier = tf.keras.Model(inputs, outputs)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=tf.keras.optimizers.Adam(1e-3),
        metrics=["accuracy"],
    )
    return classifier


# Seed Python, NumPy and TensorFlow so that weight initialisation and dropout
# (and therefore the threshold tuned afterwards) are repeatable across runs;
# the data split is already seeded with the same value.
tf.keras.utils.set_random_seed(42)

model = build_lstm_classifier()

# Callbacks: keep the best model on disk and stop when validation loss stalls.
checkpoint = tf.keras.callbacks.ModelCheckpoint(
    "best_lstm.keras",
    monitor="val_loss",
    save_best_only=True
)

early_stop = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=3,
    restore_best_weights=True
)

# Train the classifier on the pre-computed embeddings.
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=10,
    class_weight=class_weights,
    callbacks=[checkpoint, early_stop]
)

### **Avaliação e Limite**

In [None]:
# Validation probabilities and the matching ground-truth labels.
y_true_val = y[val_idx]
y_pred_prob = model.predict(val_gen)

# Scan candidate decision thresholds and keep the one with the highest F1.
thresholds = np.arange(0.1, 0.9, 0.01)

f1_scores = []
for threshold in thresholds:
    hard_predictions = (y_pred_prob > threshold).astype(int)
    f1_scores.append(f1_score(y_true_val, hard_predictions))

best_position = int(np.argmax(f1_scores))
best_thresh = thresholds[best_position]

print("Melhor Threshold:", best_thresh)
print("Melhor F1:", f1_scores[best_position])

report = classification_report(
    y_true_val,
    (y_pred_prob > best_thresh).astype(int),
    digits=4,
)
print(report)

### **Salvar**

In [None]:
# Persist the classifier, the tokenizer and the inference metadata.
# The paths are the constants defined in the configuration cell; the names
# previously used here were never defined and raised NameError.
model.save(CLASSIFIER_MODEL)
tokenizer.save_pretrained(TOKENIZER_PATH)

configuracoes = {
    # Stored as a plain Python float so the pickle does not depend on NumPy.
    "best_threshold": float(best_thresh),
    "max_len": MAX_LEN
}

with open(MODEL_CONFIG_FILE, "wb") as f:
    pickle.dump(configuracoes, f)

### **Interpretação das Métricas**

In [None]:
from sklearn.metrics import matthews_corrcoef, precision_recall_curve

# Matthews correlation coefficient at the selected threshold.
y_pred_final = (y_pred_prob > best_thresh).astype(int)
mcc = matthews_corrcoef(y_true_val, y_pred_final)
print("MCC:", mcc)

# Precision-recall curve computed from the validation probabilities.
precision, recall, _ = precision_recall_curve(y_true_val, y_pred_prob)

fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
ax.set_title("Precision-Recall Curve")
plt.show()

def analyze_sensitivity(sample_index, remove_tokens=50):
    """Print how one prediction changes when the leading tokens are zeroed.

    Args:
        sample_index: Position of the sample inside ``X_memmap_read``.
        remove_tokens: Number of leading token positions whose embeddings
            are set to zero in the perturbed copy.

    Returns:
        None. The original probability, the perturbed probability and their
        absolute difference are printed.
    """
    baseline = X_memmap_read[sample_index:sample_index+1]
    original_prob = model.predict(baseline)[0][0]

    # Work on a copy so the stored embeddings stay untouched.
    ablated = baseline.copy()
    ablated[:, :remove_tokens, :] = 0
    new_prob = model.predict(ablated)[0][0]

    shift = abs(float(original_prob - new_prob))
    for caption, value in (
        ("Prob original:", float(original_prob)),
        ("Prob perturbado:", float(new_prob)),
        ("Variação:", shift),
    ):
        print(caption, value)

### **Pipeline para novo conjunto**

In [None]:
# Reload the saved metadata, tokenizer and classifier. The paths are the
# constants defined in the configuration cell; the names previously used
# here were never defined and raised NameError.
# NOTE: pickle.load can execute arbitrary code; only load a metadata file
# that was written by this notebook.
with open(MODEL_CONFIG_FILE, "rb") as f:
    configuracoes = pickle.load(f)

best_thresh = configuracoes["best_threshold"]
MAX_LEN = configuracoes["max_len"]

tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_PATH)
model = tf.keras.models.load_model(CLASSIFIER_MODEL)

# The encoder was deleted after extraction, so load it again in evaluation mode.
bert_model = DistilBertModel.from_pretrained("distilbert-base-uncased").to(device)
bert_model.eval()

# Read the test set from the Drive path defined next to train_path; a bare
# "test.csv" would be resolved against the working directory instead.
df_test = pd.read_csv(test_path)
# Missing titles/texts become empty strings so the concatenation below is never NaN.
df_test["title"] = df_test["title"].fillna("")
df_test["text"] = df_test["text"].fillna("")

df_test["text_light"] = (df_test["title"] + " " + df_test["text"]).apply(clean_text_light)
X_test_text = df_test["text_light"].values

todas_probabilidades = []
pad_id = tokenizer.pad_token_id

for start in tqdm(range(0, len(X_test_text), BATCH_SIZE)):
    batch_texts = X_test_text[start:start+BATCH_SIZE]

    # Tokenise every document and right-pad it to MAX_LEN ids.
    input_ids = []
    for text in batch_texts:
        ids = smart_truncate(text, tokenizer, MAX_LEN)
        input_ids.append(ids + [pad_id] * (MAX_LEN - len(ids)))

    input_ids_pt = torch.tensor(input_ids).to(device)
    # Attention mask: 1 for every id that is not the pad id, 0 otherwise.
    attention_masks_pt = (input_ids_pt != pad_id).long()

    with torch.no_grad():
        embeddings = bert_model(
            input_ids_pt, attention_mask=attention_masks_pt
        ).last_hidden_state
        # Zero the vectors that sit on padding positions.
        embeddings = embeddings * attention_masks_pt.unsqueeze(-1).to(embeddings.dtype)

    # Classify the batch and collect its probabilities.
    preds = model.predict(embeddings.cpu().numpy(), verbose=0)
    todas_probabilidades.extend(preds.ravel())

# Apply the tuned threshold to obtain hard labels.
probabilities = np.array(todas_probabilidades)
y_pred_test = (probabilities > best_thresh).astype(int)

# Two-column submission file: id and predicted label.
submission = df_test[["id"]].assign(label=y_pred_test)

submission.to_csv("submission.csv", index=False)