In [1]:
from google.colab import drive

drive.mount('/content/drive')


Mounted at /content/drive


In [2]:
# Ruta del proyecto en Google Drive
PROJECT_DIR = "/content/drive/MyDrive/TFG-FakeNewsNet"

# Me muevo a la carpeta principal del proyecto
%cd $PROJECT_DIR

!ls

/content/drive/MyDrive/TFG-FakeNewsNet
data  notebooks  README.md  requirements.txt


In [3]:
!git ls-files notebooks


notebooks/00_setup.ipynb
notebooks/00_setup.ipynb.ipynb
notebooks/01_fnn_import.ipynb
notebooks/02_preprocessing.ipynb
notebooks/99_git_push.ipynb


In [None]:
import re
import numpy as np
import pandas as pd
from tqdm.auto import tqdm  # barra de progreso en operaciones lentas
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# Activo las barras de progreso en pandas
tqdm.pandas()

# Stopwords en inglés
STOPWORDS = set(ENGLISH_STOP_WORDS)
print("Número de stopwords cargadas:", len(STOPWORDS))


Número de stopwords cargadas: 318


In [None]:
# Cargo el dataset generado en el Notebook 01
df = pd.read_csv("data/noticias.csv")

print("Shape inicial (filas, columnas):", df.shape)
display(df.head(3))
display(df.info())

# Distribución de label
print("\nDistribución de la etiqueta 'label' (%):")
print((df["label"].value_counts(normalize=True) * 100).round(2))


Shape inicial (filas, columnas): (44898, 5)


Unnamed: 0,title,text,subject,date,label
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44898 entries, 0 to 44897
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 1.7+ MB


None


Distribución de la etiqueta 'label' (%):
label
1    52.3
0    47.7
Name: proportion, dtype: float64


#Normalización columna "text" y filtración de textos vacíos

In [None]:
# Me aseguro que "text" es string y no contiene NaNs
df["text"] = df["text"].fillna("").astype(str)

# Elimino filas con texto extremadamente corto (esto evita ruido o errores en vectores vacíos)
min_length = 10
mask_valid = df["text"].str.len() >= min_length

print("Filas antes:", len(df))
print("Filas después:", mask_valid.sum())

df = df[mask_valid].reset_index(drop=True)


Filas antes: 44246
Filas después: 44246


#Limpieza y lematización

In [None]:
# Patrón para detectar URLs
URL_PATTERN = re.compile(r"http\S+|www\.\S+")

# Patrón para tokenizar palabras (deja solo letras de a-z y números simples de 0-9)
TOKEN_PATTERN = re.compile(r"\b[a-z0-9]+\b")


In [None]:
def simple_lemma(token: str) -> str:   # Lematizar muy  ligero

    t = token

    #  Pasar sufijos -ies → -y
    if len(t) > 4 and t.endswith("ies"):
        return t[:-3] + "y"

    # Eliminar sufijos "ing" y "ed"
    for suffix in ["ing", "ed"]:
        if len(t) > 4 and t.endswith(suffix):
            t = t[:-len(suffix)]
            break

    # Quitar plurales (-s, -es)
    if len(t) > 3 and t.endswith("es"):
        t = t[:-2]
    elif len(t) > 2 and t.endswith("s"):
        t = t[:-1]

    return t

In [None]:
def clean_and_normalize(text: str) -> str:  # Limpieza necesaria en los modelos

    text = text.lower()  # Convierte el texto en minúsculas
    text = URL_PATTERN.sub(" ", text) # Reemplazar las URLs por un espacio

    tokens = TOKEN_PATTERN.findall(text) # Tokenización simple (quita alfanuméricos)

    clean_tokens = []
    for tok in tokens:
        if len(tok) <= 2:   # Eliminar palabras cortas
            continue
        if tok in STOPWORDS:    # Eliminar stopwords sin significado
            continue
        clean_tokens.append(simple_lemma(tok))   # Aplicar la función de arriba

    return " ".join(clean_tokens)   # Devuelve ya el texto limpio

#Añadir columna "text_clean" (texto limpio y normalizado)

In [None]:
%%time
df["text_clean"] = df["text"].progress_apply(clean_and_normalize)

# Vista comparativa
df[["text", "text_clean"]].head(5)


  0%|          | 0/44246 [00:00<?, ?it/s]

CPU times: user 20.2 s, sys: 152 ms, total: 20.3 s
Wall time: 20.5 s


Unnamed: 0,text,text_clean
0,"21st Century Wire says Ben Stein, reputable pr...",21st century wire say ben stein reputable prof...
1,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuter president donald trump remov...
2,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuter puerto rico governor ricardo rossello s...
3,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarras country accidenta...
4,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuter presidential candidat ...


#Checkpoint

In [None]:
# Guardar un CSV intermedio para no repetir la limpieza por si cambio algo
TMP_PATH = "data/noticias_tmp_lematizado.csv"
df.to_csv(TMP_PATH, index=False)
print("Checkpoint intermedio guardado en:", TMP_PATH)


Checkpoint intermedio guardado en: data/noticias_tmp_lematizado.csv


#Features estructurales de "text"

In [None]:
# Número de palabras en mayúsculas
def count_upper_words(text: str) -> int:
    words = text.split()
    return sum(1 for w in words if w.isupper() and len(w) > 1)

# Longitudes básicas
df["n_chars"] = df["text"].str.len()
df["n_words"] = df["text"].str.split().apply(len)
df["avg_word_len"] = (df["n_chars"] / df["n_words"]).replace([np.inf, np.nan], 0)

# Signos de puntuación
df["n_exclam"] = df["text"].str.count("!")
df["n_question"] = df["text"].str.count(r"\?")
df["n_digits"] = df["text"].str.count(r"\d")

# Palabras en mayúsculas
df["n_upper_words"] = df["text"].apply(count_upper_words)

# URLs dentro del texto original
df["url_count"] = df["text"].str.count(r"http[s]?://|www\.")
df["has_url"] = (df["url_count"] > 0).astype(int)

df.head(5)


Unnamed: 0,title,text,subject,date,label,text_clean,n_chars,n_words,avg_word_len,n_exclam,n_question,n_digits,n_upper_words,url_count,has_url
0,Ben Stein Calls Out 9th Circuit Court: Committ...,"21st Century Wire says Ben Stein, reputable pr...",US_News,"February 13, 2017",1,21st century wire say ben stein reputable prof...,1028,171,6.011696,0,0,7,12,0,0
1,Trump drops Steve Bannon from National Securit...,WASHINGTON (Reuters) - U.S. President Donald T...,politicsNews,"April 5, 2017",0,washington reuter president donald trump remov...,4820,771,6.251621,0,0,6,19,0,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,(Reuters) - Puerto Rico Governor Ricardo Rosse...,politicsNews,"September 27, 2017",0,reuter puerto rico governor ricardo rossello s...,1848,304,6.078947,0,0,0,4,0,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,"On Monday, Donald Trump once again embarrassed...",News,"May 22, 2017",1,monday donald trump embarras country accidenta...,1244,183,6.797814,0,0,10,2,1,1
4,Donald Trump heads for Scotland to reopen a go...,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",politicsNews,"June 24, 2016",0,glasgow scotland reuter presidential candidat ...,3137,529,5.930057,0,0,24,10,0,0


#Verificación

Me aseguro a ver si el "text_clean" tiene sentido, los features estructurales son coherentes y no hay valores extraños

In [None]:
cols_to_show = [
    "title", "subject", "date", "label",
    "text",
    "text_clean",
    "n_chars", "n_words", "avg_word_len",
    "n_exclam", "n_question", "n_digits",
    "n_upper_words", "url_count", "has_url"
]

df[cols_to_show].head(5)


Unnamed: 0,title,subject,date,label,text,text_clean,n_chars,n_words,avg_word_len,n_exclam,n_question,n_digits,n_upper_words,url_count,has_url
0,Ben Stein Calls Out 9th Circuit Court: Committ...,US_News,"February 13, 2017",1,"21st Century Wire says Ben Stein, reputable pr...",21st century wire say ben stein reputable prof...,1028,171,6.011696,0,0,7,12,0,0
1,Trump drops Steve Bannon from National Securit...,politicsNews,"April 5, 2017",0,WASHINGTON (Reuters) - U.S. President Donald T...,washington reuter president donald trump remov...,4820,771,6.251621,0,0,6,19,0,0
2,Puerto Rico expects U.S. to lift Jones Act shi...,politicsNews,"September 27, 2017",0,(Reuters) - Puerto Rico Governor Ricardo Rosse...,reuter puerto rico governor ricardo rossello s...,1848,304,6.078947,0,0,0,4,0,0
3,OOPS: Trump Just Accidentally Confirmed He Le...,News,"May 22, 2017",1,"On Monday, Donald Trump once again embarrassed...",monday donald trump embarras country accidenta...,1244,183,6.797814,0,0,10,2,1,1
4,Donald Trump heads for Scotland to reopen a go...,politicsNews,"June 24, 2016",0,"GLASGOW, Scotland (Reuters) - Most U.S. presid...",glasgow scotland reuter presidential candidat ...,3137,529,5.930057,0,0,24,10,0,0


Mostrar estadísticas (por ejemplo si n_words<5, hay ruido)

In [None]:
df[[
    "n_chars", "n_words", "avg_word_len",
    "n_exclam", "n_question", "n_digits",
    "n_upper_words", "url_count"
]].describe(percentiles=[0.25, 0.5, 0.75])


Unnamed: 0,n_chars,n_words,avg_word_len,n_exclam,n_question,n_digits,n_upper_words,url_count
count,44246.0,44246.0,44246.0,44246.0,44246.0,44246.0,44246.0,44246.0
mean,2505.476495,411.253786,6.1843,0.413597,0.688605,16.387764,6.294942,0.123785
std,2166.642595,350.357216,2.065957,1.465536,1.806103,26.461253,9.233789,0.537634
min,10.0,1.0,3.25,0.0,0.0,0.0,0.0,0.0
25%,1295.0,213.0,5.890411,0.0,0.0,2.0,2.0,0.0
50%,2210.0,366.0,6.098928,0.0,0.0,9.0,4.0,0.0
75%,3126.0,516.0,6.297499,0.0,1.0,21.0,8.0,0.0
max,51794.0,8135.0,149.0,133.0,94.0,1396.0,307.0,22.0


#Guardado del dataset preprocesado

In [None]:
OUTPUT_PATH = "data/noticias_preproc.csv"
df.to_csv(OUTPUT_PATH, index=False)

print("Dataset preprocesado guardado en:", OUTPUT_PATH)
print("Shape final:", df.shape)
print("\nColumnas finales del dataset:")
print(df.columns.tolist())


Dataset preprocesado guardado en: data/noticias_preproc.csv
Shape final: (44246, 15)

Columnas finales del dataset:
['title', 'text', 'subject', 'date', 'label', 'text_clean', 'n_chars', 'n_words', 'avg_word_len', 'n_exclam', 'n_question', 'n_digits', 'n_upper_words', 'url_count', 'has_url']
