In [None]:

import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import nltk
import numpy as np

# Descargar recursos necesarios de nltk
nltk.download('stopwords')
nltk.download('wordnet')

# ruta relativa al archivo CSV
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))  # Subir un nivel desde 'notebooks'
dataset_path = os.path.join(base_dir, 'datos', 'dataset.csv')

# Cargar el dataset
dataset = pd.read_csv(dataset_path, delimiter=';', encoding='utf-8-sig')
# Cargar el dataset
#dataset = pd.read_csv('dataset.csv', delimiter=';', encoding='utf-8-sig')

# Modelo 1
texts = dataset['CONTENIDO A ANALIZAR']
labels = dataset['INTENSIDAD']

# Limpiar texto
def clean_text(text):
    text = text.lower()
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d', '', text)
    return text

texts = texts.apply(clean_text)

# Codificar etiquetas
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# Tokenizar textos
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
padded_sequences = pad_sequences(sequences, maxlen=100, padding='post', truncating='post')

# Dividir datos
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# Construir y entrenar el primer modelo
model_1 = Sequential([
    Embedding(input_dim=10000, output_dim=128, input_length=100),
    Bidirectional(LSTM(64, return_sequences=True)),
    Dropout(0.3),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')
])

model_1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_1.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Evaluar el primer modelo
loss_1, accuracy_1 = model_1.evaluate(X_test, y_test, verbose=1)
print(f"Modelo 1 - Loss: {loss_1}, Accuracy: {accuracy_1}")

# Preprocesar para el segundo modelo
stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)

dataset['mensaje'] = dataset['CONTENIDO A ANALIZAR'].map(preprocess_text)
X = dataset['mensaje'].values
y = dataset['INTENSIDAD'].values - 3
y = np.clip(y, 0, 1)

tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=200, padding='post', truncating='post')

X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Construir y entrenar el segundo modelo
model_2 = Sequential([
    Embedding(input_dim=10000, output_dim=64, input_length=200),
    Bidirectional(LSTM(64)),
    Dense(64, activation='relu'),
    Dense(2, activation='softmax')
])

model_2.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model_2.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)

# Evaluar el segundo modelo
loss_2, accuracy_2 = model_2.evaluate(X_test, y_test, verbose=1)
print(f"Modelo 2 - Loss: {loss_2}, Accuracy: {accuracy_2}")


[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     SUPPORT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HP
[nltk_data]     SUPPORT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Epoch 1/5




[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 60ms/step - accuracy: 0.0128 - loss: -214.5198 - val_accuracy: 0.0102 - val_loss: -1593.5710
Epoch 2/5
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 57ms/step - accuracy: 0.0139 - loss: -2475.9272 - val_accuracy: 0.0102 - val_loss: -6019.6113
Epoch 3/5
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 56ms/step - accuracy: 0.0143 - loss: -7575.5752 - val_accuracy: 0.0102 - val_loss: -13086.5566
Epoch 4/5
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 57ms/step - accuracy: 0.0140 - loss: -15270.4561 - val_accuracy: 0.0102 - val_loss: -22453.6230
Epoch 5/5
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 59ms/step - accuracy: 0.0119 - loss: -25482.9121 - val_accuracy: 0.0102 - val_loss: -34057.8516
[1m77/77[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 15ms/step - accuracy: 0.0129 - loss: -33532.9961
Modelo 1 - Loss: -34057.8515625, Accur



[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 47ms/step - accuracy: 0.8624 - loss: 0.4060 - val_accuracy: 0.8864 - val_loss: 0.2758
Epoch 2/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step - accuracy: 0.9230 - loss: 0.2034 - val_accuracy: 0.8836 - val_loss: 0.2860
Epoch 3/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 46ms/step - accuracy: 0.9554 - loss: 0.1235 - val_accuracy: 0.8783 - val_loss: 0.3225
Epoch 4/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step - accuracy: 0.9734 - loss: 0.0844 - val_accuracy: 0.8742 - val_loss: 0.4119
Epoch 5/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 44ms/step - accuracy: 0.9825 - loss: 0.0541 - val_accuracy: 0.8807 - val_loss: 0.4328
Epoch 6/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 45ms/step - accuracy: 0.9843 - loss: 0.0479 - val_accuracy: 0.8689 - val_loss: 0.4996
Epoch 7/10
[1m307/307[0m 

In [2]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import string
import nltk
import numpy as np

# Descargar recursos necesarios de nltk
nltk.download('stopwords')
nltk.download('wordnet')

# Cargar el dataset
dataset = pd.read_csv('dataset.csv', delimiter=';', encoding='utf-8-sig')

# Preprocesamiento de textos
stop_words = set(stopwords.words('spanish'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)
    words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    return " ".join(words)

dataset['mensaje'] = dataset['CONTENIDO A ANALIZAR'].map(preprocess_text)
X = dataset['mensaje'].values
y = dataset['INTENSIDAD'].values - 3  # Normalización de etiquetas
y = np.clip(y, 0, 1)

# Tokenizar y rellenar secuencias
tokenizer = Tokenizer(num_words=10000, oov_token="<OOV>")
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(X_seq, maxlen=200, padding='post', truncating='post')

# Dividir los datos
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Modelo Optimizado basado en LSTM
def create_lstm_model():
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128, input_length=200),
        Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        Bidirectional(LSTM(128, dropout=0.3, recurrent_dropout=0.3)),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(2, activation='softmax')  # Clasificación binaria
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Modelo Alternativo basado en GRU
def create_gru_model():
    model = Sequential([
        Embedding(input_dim=10000, output_dim=128, input_length=200),
        Bidirectional(GRU(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)),
        Bidirectional(GRU(128, dropout=0.3, recurrent_dropout=0.3)),
        Dense(64, activation='relu'),
        Dropout(0.4),
        Dense(2, activation='softmax')  # Clasificación binaria
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
                  loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Entrenamiento y evaluación
def train_and_evaluate(model, X_train, y_train, X_test, y_test, model_name):
    print(f"\nEntrenando {model_name}...")
    history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test), verbose=1)
    loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
    print(f"{model_name} - Loss: {loss}, Accuracy: {accuracy}")
    return history, accuracy

# Crear y entrenar los modelos
lstm_model = create_lstm_model()
lstm_history, lstm_accuracy = train_and_evaluate(lstm_model, X_train, y_train, X_test, y_test, "Modelo LSTM Optimizado")

gru_model = create_gru_model()
gru_history, gru_accuracy = train_and_evaluate(gru_model, X_train, y_train, X_test, y_test, "Modelo GRU Optimizado")

# Comparar los resultados
print(f"\nResultados finales:\nModelo LSTM Optimizado - Accuracy: {lstm_accuracy}\nModelo GRU Optimizado - Accuracy: {gru_accuracy}")


[nltk_data] Downloading package stopwords to C:\Users\HP
[nltk_data]     SUPPORT\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\HP
[nltk_data]     SUPPORT\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!



Entrenando Modelo LSTM Optimizado...
Epoch 1/10




[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 303ms/step - accuracy: 0.8451 - loss: 0.4590 - val_accuracy: 0.8640 - val_loss: 0.3943
Epoch 2/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 291ms/step - accuracy: 0.8567 - loss: 0.4166 - val_accuracy: 0.8640 - val_loss: 0.3869
Epoch 3/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m91s[0m 297ms/step - accuracy: 0.8675 - loss: 0.3781 - val_accuracy: 0.8783 - val_loss: 0.3235
Epoch 4/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m87s[0m 283ms/step - accuracy: 0.9007 - loss: 0.2413 - val_accuracy: 0.8884 - val_loss: 0.2960
Epoch 5/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 288ms/step - accuracy: 0.9285 - loss: 0.1900 - val_accuracy: 0.8909 - val_loss: 0.2952
Epoch 6/10
[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 288ms/step - accuracy: 0.9467 - loss: 0.1523 - val_accuracy: 0.8933 - val_loss: 0.3044
Epoch 7/10
[1m307/30