In [None]:
# Google Drive connection (specific to Google Colab)
from google.colab import drive  # Provides the helper used to mount Google Drive in the Colab environment
drive.mount('/content/drive')  # Mounts the Google Drive folder at /content/drive

# Download the Spanish language model for spaCy (run only once)
!python -m spacy download es_core_news_sm

# Importaciones de NLTK
import nltk
from nltk.tokenize import TreebankWordTokenizer  # Tokenizador avanzado para palabras
from nltk.tokenize import sent_tokenize  # Tokenizador de oraciones
from nltk.corpus import stopwords  # Lista de palabras vacías (stopwords)
from nltk.stem import WordNetLemmatizer  # Lematizador (reducción a forma canónica)
from nltk.stem import SnowballStemmer  # Stemmer para español (reducción a raíz)
from sklearn.utils import resample # permite realizar remuestreo (bootstrap) de conjuntos de datos
import math                        # importa el módulo math de Python, que proporciona funciones matemáticas básicas.
from collections import defaultdict # subclase del diccionario estándar de Python que proporciona un valor predeterminado para las claves que no existen, evitando así errores KeyError.
from sklearn.metrics import classification_report  #Calcula la precisión de las predicciones,f1-score, recall y support.
import pandas as pd  # Biblioteca para el análisis y manipulación de datos tabulares
from sklearn.metrics import accuracy_score  #Calcula la precisión de las predicciones

# Importación de spaCy
import spacy  # Biblioteca avanzada de NLP

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Collecting es-core-news-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/es_core_news_sm-3.8.0/es_core_news_sm-3.8.0-py3-none-any.whl (12.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.9/12.9 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('es_core_news_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Download every NLTK resource in a single call, each package listed once.
# The call stays the last expression of the cell so its return value is still displayed.
nltk.download([
    'punkt',
    'punkt_tab',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger'
])

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [None]:
# Load the dataset from Google Drive; the file is decoded as Latin-1
df = pd.read_csv(
    "/content/drive/MyDrive/US-Economic-News.csv",
    encoding='latin1',
)

# Preview the first rows
print(df.head())

    _unit_id  _golden _unit_state  _trusted_judgments _last_judgment_at  \
0  842613455    False   finalized                   3     12/5/15 17:48   
1  842613456    False   finalized                   3     12/5/15 16:54   
2  842613457    False   finalized                   3      12/5/15 1:59   
3  842613458    False   finalized                   3      12/5/15 2:19   
4  842613459    False   finalized                   3     12/5/15 17:48   

   positivity  positivity:confidence relevance  relevance:confidence  \
0         3.0                 0.6400       yes                 0.640   
1         NaN                    NaN        no                 1.000   
2         NaN                    NaN        no                 1.000   
3         NaN                 0.0000        no                 0.675   
4         3.0                 0.3257       yes                 0.640   

       articleid      date                                           headline  \
0  wsj_398217788   8/14/91         

In [None]:
# Pre-processing
# Load the spaCy pipeline and build the NLTK stop-word set
# NOTE(review): both resources are Spanish, while the dataset file name and the
# example sentences used elsewhere in this notebook look English — confirm the
# intended language before relying on the lemmas / stop-word filtering.
nlp = spacy.load("es_core_news_sm")
stop_words = set(stopwords.words('spanish'))

def preprocess(text):
    """Tokenize, filter and lemmatize a single text.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. NaN or None coming from a
        DataFrame column with missing entries) is treated as "no text".

    Returns
    -------
    list of str
        Lemmas (as produced by the global ``nlp`` pipeline) of the lower-cased,
        purely alphabetic tokens that are not in the global ``stop_words`` set.
        An empty list is returned for missing / non-string input.
    """
    # Missing values would otherwise crash on `.lower()`; an empty result is the
    # same outcome an empty string produces.
    if not isinstance(text, str):
        return []

    # Tokenization on the lower-cased text
    tokens = nltk.word_tokenize(text.lower())
    # Drop stop words and anything that is not purely alphabetic (punctuation, numbers)
    tokens = [word for word in tokens if word.isalpha() and word not in stop_words]
    # Lemmatization: re-join the surviving tokens and run them through spaCy
    doc = nlp(" ".join(tokens))
    return [token.lemma_ for token in doc]

NameError: name 'spacy' is not defined

In [None]:
# Process all texts in one batched spaCy pass
texts = df['text'].fillna("").tolist()
# 'yes' -> 1, 'no' -> 0; any other or missing value falls back to 0
labels = (
    df['relevance']
    .map({'yes': 1, 'no': 0})
    .fillna(0)
    .astype(int)
    .tolist()
)

# Run the pipeline over every text with the parser and NER components disabled
docs = list(nlp.pipe(texts, disable=["parser", "ner"]))

# Keep the lower-cased lemma of each alphabetic token that is not a stop word
processed_tokens = [
    [tok.lemma_.lower() for tok in parsed if tok.is_alpha and not tok.is_stop]
    for parsed in docs
]

# Store the token lists on the DataFrame
df['tokens'] = processed_tokens

In [None]:
# Show the distinct label values (trimmed and lower-cased, missing values included)
print(
    "Valores únicos en 'relevance':",
    df['relevance'].str.strip().str.lower().value_counts(dropna=False),
    sep="\n",
)

# Map the accepted label spellings to integers
def clean_relevance(value):
    """Convert a raw relevance label into 1, 0 or None.

    The value is stringified, stripped and lower-cased first. 'yes'/'y' map
    to 1, 'no'/'n' map to 0, and anything else yields None so the row can be
    dropped afterwards.
    """
    label_by_text = {'yes': 1, 'y': 1, 'no': 0, 'n': 0}
    normalized = str(value).strip().lower()
    return label_by_text.get(normalized)

# Convert the raw labels; unrecognised values become missing
df['relevance'] = df['relevance'].apply(clean_relevance)
# Drop rows without a usable label. The explicit copy makes the filtered frame
# independent of the original, so the assignment below does not write to a
# slice (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['relevance'].notna()].copy()
df['relevance'] = df['relevance'].astype(int)  # Store the labels as integers

# Check the class distribution after cleaning
print("\n✅ Distribución después de limpiar:")
print(df['relevance'].value_counts())

Valores únicos en 'relevance':
relevance
no          6571
yes         1420
not sure       9
Name: count, dtype: int64

✅ Distribución después de limpiar:
relevance
0    6571
1    1420
Name: count, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['relevance'] = df['relevance'].astype(int)  # Convertir a enteros


In [None]:
# Class distribution of 'relevance' before balancing
print(
    "Distribución de 'relevance' antes del balanceo:",
    df['relevance'].value_counts(),
    sep="\n",
)

Distribución de 'relevance' antes del balanceo:
relevance
0    6571
1    1420
Name: count, dtype: int64


In [None]:
# Split the rows by class
df_relevant = df[df['relevance'] == 1]
df_not_relevant = df[df['relevance'] == 0]

# Both classes need at least one row to be able to balance
if len(df_relevant) == 0 or len(df_not_relevant) == 0:
    print("⚠️ Una de las clases está vacía. No se puede balancear.")
else:
    # Undersample whichever class is larger. Sampling without replacement
    # cannot draw more rows than the class has, so the target size must be the
    # size of the smaller class regardless of which label that is.
    if len(df_relevant) <= len(df_not_relevant):
        df_minority, df_majority = df_relevant, df_not_relevant
    else:
        df_minority, df_majority = df_not_relevant, df_relevant

    df_majority_downsampled = resample(
        df_majority,
        replace=False,
        n_samples=len(df_minority),
        random_state=42,
    )

    # Concatenate, shuffle with a fixed seed and rebuild a clean 0..n-1 index
    df_balanced = (
        pd.concat([df_minority, df_majority_downsampled])
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )

    print("\n✅ Distribución después del balanceo:")
    print(df_balanced['relevance'].value_counts())


✅ Distribución después del balanceo:
relevance
1    1420
0    1420
Name: count, dtype: int64


In [None]:
# Token lists and labels of the balanced dataset
tokens_balanced = df_balanced['tokens'].tolist()
labels_balanced = df_balanced['relevance'].tolist()

# Accumulators: vocabulary, per-class word frequencies, per-class token totals
vocab = set()
word_counts = {0: defaultdict(int), 1: defaultdict(int)}
class_total_words = {0: 0, 1: 0}

# Walk over every document together with its label
for tokens, label in zip(tokens_balanced, labels_balanced):
    counts_for_label = word_counts[label]
    for word in tokens:
        vocab.add(word)
        counts_for_label[word] += 1
    # Each token of the document adds one to the class total
    class_total_words[label] += len(tokens)

vocab_size = len(vocab)
print(f"Tamaño del vocabulario: {vocab_size}")

Tamaño del vocabulario: 22849


In [None]:
# Additive (Laplace) smoothing constant
alpha = 1.0

# Log-probability of every vocabulary word, per class
log_class_word_probs = {0: {}, 1: {}}

for label in [0, 1]:
    total_words_in_class = class_total_words[label]
    # The denominator is the same for every word of a class
    denominator = total_words_in_class + alpha * vocab_size
    counts = word_counts[label]

    # log P(word | class) with smoothing, for each word of the vocabulary
    log_class_word_probs[label] = {
        word: math.log((counts.get(word, 0) + alpha) / denominator)
        for word in vocab
    }

In [None]:
# Prior probability of each class for the binary problem:
# number of documents with that label divided by the total number of documents
total_docs = len(labels_balanced)
class_counts = {cls: labels_balanced.count(cls) for cls in (0, 1)}
class_priors = {cls: class_counts[cls] / total_docs for cls in (0, 1)}

print("Priors:")
print(class_priors)

Priors:
{0: 0.5, 1: 0.5}


In [None]:
def predict(text):
    """Predict the class of a text with the multinomial Naive Bayes model.

    Reads the globals ``nlp``, ``class_priors`` and ``log_class_word_probs``.

    Parameters
    ----------
    text : str
        Input text to classify. Non-string values (e.g. NaN / None from a
        DataFrame column) are treated as an empty text.

    Returns
    -------
    int
        1 if class 1 has the strictly larger log-probability, otherwise 0
        (ties resolve to 0).
    """
    # Missing values are handled like an empty string, mirroring the
    # fillna("") applied to the texts when the training tokens were built.
    if not isinstance(text, str):
        text = ""

    # Use the same pipeline configuration as the batched training pass
    # (parser and NER disabled); only lemma / stop-word / alpha attributes are read.
    doc = nlp(text, disable=["parser", "ner"])
    tokens = [
        token.lemma_.lower() for token in doc
        if not token.is_stop and token.is_alpha
    ]

    # Start each class score from its log prior
    log_prob = {
        0: math.log(class_priors[0]),
        1: math.log(class_priors[1])
    }

    # Fallback for words absent from the training vocabulary. Both per-class
    # tables are built over the same vocabulary, so an unseen word adds this
    # same amount to both scores.
    MIN_LOG_PROB = math.log(1e-10)

    for label in [0, 1]:
        word_log_probs = log_class_word_probs[label]
        for word in tokens:
            log_prob[label] += word_log_probs.get(word, MIN_LOG_PROB)

    return 1 if log_prob[1] > log_prob[0] else 0

In [None]:
# Predictions over the balanced dataset.
# Missing texts are replaced by "" first, the same treatment the texts received
# before tokenisation, so a NaN entry cannot reach the spaCy pipeline.
# NOTE(review): these are the same rows whose tokens were used to build the word
# counts, so the resulting metrics describe fit on the training data, not
# generalisation — consider holding out a test split.
y_pred = [predict(text) for text in df_balanced['text'].fillna("")]
y_true = df_balanced['relevance'].tolist()

In [None]:
# Display the classification metrics, four decimals per value
class_names = ['No Relevante', 'Relevante']
print(classification_report(y_true, y_pred, target_names=class_names, digits=4))

NameError: name 'classification_report' is not defined

In [None]:
# Sanity-check the model on a few hand-written examples
test_texts = [
    "The Fed raised interest rates again, signaling tighter monetary policy.",
    "A new movie broke box office records this weekend.",
    "Oil prices fell sharply due to oversupply and weak demand.",
    "How to cook the perfect lasagna at home."
]

for t in test_texts:
    # Show at most 50 characters; the ellipsis is added only when the text was
    # actually cut, so short sentences are not shown as if they were truncated.
    preview = t if len(t) <= 50 else t[:50] + "..."
    print("Texto:", preview)
    print("Predicción:", "Relevante" if predict(t) == 1 else "No relevante")
    print("-" * 60)

Texto: The Fed raised interest rates again, signaling tig...


NameError: name 'predict' is not defined