In [None]:
# Mount Google Drive (Colab only) so the files under /content/drive are readable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import codecs
import os
import pandas as pd
import numpy as np
import string
import random
import pickle

import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.probability import FreqDist

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split,GridSearchCV # Modelado
from sklearn.pipeline import Pipeline # Modelado
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer # Modelado
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.linear_model import LogisticRegression # Reporte
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, roc_curve, precision_recall_curve

path = '/content/drive/MyDrive/NLP_practica/datasets/' # directory where the dataset files are stored

In [None]:
class TextPipelineLogistic():
  """Cross-validated text-classification pipeline: vectorizer -> SelectKBest(chi2) -> classifier.

  The vectorizer is selected with ``method_extraction`` ('tfidf' or
  'countVectorizer') and the classifier with ``model`` ('regresion' for
  LogisticRegression, 'boosting' for GradientBoostingClassifier).  ``run``
  grid-searches the hyper-parameters, predicts on train and test with the
  fitted search and prints progress messages and metrics along the way.

  NOTE: a later cell of this notebook defines another class with this same
  name; running that cell replaces this definition.
  """

  # Accepted values of the two constructor switches.
  _EXTRACTION_METHODS = ('tfidf', 'countVectorizer')
  _MODELS = ('regresion', 'boosting')

  def __init__(self, method_extraction:str = 'tfidf', model:str= 'regresion', **hyperparams) -> None:
    """Store the configuration and build the parameter grid and the pipeline.

      : param method_extraction: 'tfidf' (default) or 'countVectorizer'
      : param model: 'regresion' (default) or 'boosting'
      : param hyperparams: optional grid overrides forwarded to
        createDictHyperparams, e.g. ``chi2__k=[50, 200]``
      raises ValueError: if method_extraction or model is not a supported value"""
    self.method_extraction = method_extraction
    self.model = model
    self.params = self.createDictHyperparams(**hyperparams)
    self.pipe = self.pipelineMethod()

  def _validateOptions(self) -> None:
    """Raise ValueError when method_extraction or model is not a supported value."""
    if self.method_extraction not in self._EXTRACTION_METHODS:
      raise ValueError(
          f'method_extraction no válido: {self.method_extraction!r}. '
          f'Valores admitidos: {self._EXTRACTION_METHODS}')
    if self.model not in self._MODELS:
      raise ValueError(
          f'model no válido: {self.model!r}. Valores admitidos: {self._MODELS}')

  def run(self,X_train: pd.Series, X_test: pd.Series, y_train: pd.Series, y_test: pd.Series) -> tuple:
    """Run the whole workflow, printing progress messages along the way.
      : param X_train: pd.Series  training documents
      : param X_test: pd.Series  test documents
      : param y_train: pd.Series  training labels
      : param y_test: pd.Series  test labels
      return : tuple -- (model.best_params_, train_predict, test_predict)"""
    pipeline_steps = list(self.pipe.named_steps.keys())
    print(f'Iniciamos el cross-validation del pipeline: {pipeline_steps[0]} ->  {pipeline_steps[1]} -> {pipeline_steps[2]}.\n')
    print('Hiperparámetros a validar en la cross-validation')
    self.printDict(self.params)
    self.best_model = self.gridSearch(pipeline = self.pipe, params = self.params)
    model = self.fitGridModel(best_model = self.best_model, X_train= X_train, y_train= y_train)
    print('\nObtenemos los valores optimos para los hiperparámtros')
    print('<<<<<<<<<<<<<<<<<Best Hyperparamns>>>>>>>>>>>>>>>>>>>')
    self.printDict(model.best_params_)
    train_predict, test_predict = self.predictModel(model,X_train= X_train,X_test= X_test)
    print('\nMétricas obtenidas en el conjunto de train')
    self.evaluateModel(y_train,train_predict)
    print('\nMétricas obtenidas en el conjunto de test')
    self.evaluateModel(y_test,test_predict)
    return model.best_params_,train_predict, test_predict

  def createDictHyperparams(self,tfidf__ngram_range: list= [(1, 1), (1, 2)], tfidf__min_df: list = [1, 2],
               tfidf__max_df:list = [0.9, 0.95],tfidf__max_features:list = [1000,2500],
               chi2__k:list = [10, 100], logistic__C:list = [0.1, 1.0],
               n_estimators:list = [50, 100], max_depth:list= [3, 5], learning_rate:list = [0.01,0.1]) -> dict:
    """Build the GridSearchCV parameter grid for the configured vectorizer and classifier.

    Every key is ``<pipeline step>__<parameter>``, which is what a Pipeline
    wrapped in GridSearchCV expects; the boosting parameters are therefore
    emitted under the 'boosting' step name.
      : param tfidf__*: candidate values for the TfidfVectorizer (only used with 'tfidf')
      : param chi2__k: candidate numbers of features kept by SelectKBest
      : param logistic__C: candidate C values (only used with 'regresion')
      : param n_estimators, max_depth, learning_rate: candidate values for the
        GradientBoostingClassifier (only used with 'boosting')
      return : dict mapping parameter name -> list of candidate values
      raises ValueError: if method_extraction or model is not a supported value"""
    self._validateOptions()
    # The candidate lists are copied so the shared default lists are never
    # handed out by reference.
    param_grid = {}
    if self.method_extraction == 'tfidf':
      param_grid['tfidf__ngram_range'] = list(tfidf__ngram_range)
      param_grid['tfidf__min_df'] = list(tfidf__min_df)
      param_grid['tfidf__max_df'] = list(tfidf__max_df)
      param_grid['tfidf__max_features'] = list(tfidf__max_features)
    # The chi2 selector is part of every pipeline variant.
    param_grid['chi2__k'] = list(chi2__k)

    if self.model == 'regresion':
      param_grid['logistic__C'] = list(logistic__C)
    else:
      param_grid['boosting__n_estimators'] = list(n_estimators)
      param_grid['boosting__max_depth'] = list(max_depth)
      param_grid['boosting__learning_rate'] = list(learning_rate)
    return param_grid

  def pipelineMethod(self) -> "Pipeline":
    """Build the pipeline for the configured vectorizer and classifier.
      return : Pipeline with steps (vectorizer, 'chi2', classifier)
      raises ValueError: if method_extraction or model is not a supported value"""
    self._validateOptions()
    print('[INFO] Generando el pipeline...')
    if self.method_extraction == 'tfidf':
      vectorizer_step = ('tfidf', TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=1.0, max_features = 2500))
    else:
      vectorizer_step = ('countvectorizer', CountVectorizer())

    if self.model == 'regresion':
      # Same iteration cap for both vectorizers, so the solver has room to converge.
      estimator_step = ('logistic', LogisticRegression(max_iter=1000))
    else:
      estimator_step = ('boosting', GradientBoostingClassifier())

    return Pipeline([vectorizer_step, ('chi2', SelectKBest(chi2)), estimator_step])

  def gridSearch(self, pipeline: "Pipeline", params: dict, cv: int = 5) -> "GridSearchCV":
    """Create the (unfitted) GridSearchCV object.
     : param pipeline: Pipeline to tune
     : param params: dict with the hyper-parameters and the candidate values to try
     : param cv: int, default 5. Number of cross-validation folds
     return : GridSearchCV instance"""
    print('\n[INFO] Realizando el gridSeach ...')
    grid_search = GridSearchCV(pipeline, params, cv=cv, n_jobs=-1, verbose=1)
    return grid_search

  def fitGridModel(self, best_model: "GridSearchCV", X_train: pd.Series, y_train: pd.Series) -> "GridSearchCV":
    """Fit the given GridSearchCV on the training data.
      : param best_model: GridSearchCV to fit
      : param X_train: pd.Series  training documents
      : param y_train: pd.Series  training labels
      return : the same GridSearchCV object, fitted"""
    best_model.fit(X_train, y_train)
    return best_model

  def predictModel(self, model, X_train: pd.Series,X_test: pd.Series) -> tuple:
    """Predict the labels of the train and test sets.
      : param model: fitted GridSearchCV
      : param X_train: pd.Series  training documents
      : param X_test: pd.Series  test documents
      return : tuple (train_predict, test_predict)"""
    print('\n[INFO] Realizando las predicciones del conjunto de train ...')
    train_predict = model.predict(X_train)
    print('[INFO] Realizando las predicciones del conjunto de test ...')
    test_predict = model.predict(X_test)
    return train_predict, test_predict

  @staticmethod
  def evaluateModel(y_true: pd.Series, y_pred: pd.Series) -> None:
    """Compute and print accuracy, precision, recall, F1 and the confusion matrix.

    The scikit-learn metric functions are called with their default arguments.
      : param y_true: true labels
      : param y_pred: predicted labels
      return : None"""
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    print('Accuracy: {:.4f}'.format(acc))
    print('Precision: {:.4f}'.format(precision))
    print('Recall: {:.4f}'.format(recall))
    print('F1-Score: {:.4f}'.format(f1))
    print('Confusion Matrix:\n{}'.format(cm))

  def printDict(self,dic: dict) -> None:
    """Print every item of the dictionary as '{key}:    {value}'.
     : param dic: dict
     return : None"""
    for key, value in dic.items():
      print(f'{key}:    {value}')




In [None]:
class TextPipelineLogistic():
  """Cross-validated pipeline: vectorizer -> SelectKBest(chi2) -> LogisticRegression.

  ``run`` builds the pipeline for the requested vectorizer ('tfidf' or
  'countVectorizer'), grid-searches the hyper-parameters, predicts on train and
  test with the fitted search and prints progress messages and metrics.

  NOTE: an earlier cell of this notebook defines a class with this same name;
  running this cell replaces that definition.
  """

  def __init__(self,tfidf__ngram_range: list= [(1, 1), (1, 2)], tfidf__min_df: list = [1, 2], tfidf__max_df:list = [0.9, 0.95],tfidf__max_features:list = [1000,2500], chi2__k:list = [10, 100], logistic__C:list = [0.1, 1.0]) -> None:
    """Store the hyper-parameter grids to validate.

    Each argument is the list of candidate values for the pipeline parameter of
    the same name; pass a different list to change the searched range.
    """
    # Copy every list: the defaults are shared mutable objects, so storing
    # them by reference would let one instance's edits leak into the next.
    self.params_tfidf = {
      'tfidf__ngram_range':  list(tfidf__ngram_range),
      'tfidf__min_df': list(tfidf__min_df),
      'tfidf__max_df': list(tfidf__max_df),
      'tfidf__max_features': list(tfidf__max_features),
      'chi2__k': list(chi2__k),
      'logistic__C': list(logistic__C)
    }
    self.params_counV = {
      'chi2__k': list(chi2__k),
      'logistic__C': list(logistic__C)
    }

  def run(self,X_train: pd.Series, X_test: pd.Series, y_train: pd.Series, y_test: pd.Series, method_extraction:str = 'tfidf') -> tuple:
    """Run the whole workflow, printing progress messages along the way.
      : param X_train: pd.Series  training documents
      : param X_test: pd.Series  test documents
      : param y_train: pd.Series  training labels
      : param y_test: pd.Series  test labels
      : param method_extraction: 'tfidf' (default, TfidfVectorizer) or 'countVectorizer'
      return : tuple -- (model.best_params_, train_predict, test_predict)
      raises TypeError: if method_extraction is not one of the two supported values"""
    # The two branches print their header lines in a slightly different order
    # and the second one ends the metric headings with ':'; both are kept.
    if method_extraction == 'tfidf':
      print('Iniciamos el cross-validation del pipeline: TfidfVectorizer -> SelectKBest(chi2) -> LogisticRegression\n')
      print('Hiperparámetros a validar en la cross-validation')
      self.printDict(self.params_tfidf)
      self.pipe = self.pipelineMethod(method_extraction)
      return self._searchFitAndReport(self.params_tfidf, X_train, X_test, y_train, y_test, heading_end='')

    if method_extraction == 'countVectorizer':
      print('Iniciamos el cross-validation del pipeline: CountVectorizer -> SelectKBest(chi2) -> LogisticRegression\n')
      self.pipe = self.pipelineMethod(method_extraction)
      print('Hiperparámetros a validar en la cross-validation')
      self.printDict(self.params_counV)
      return self._searchFitAndReport(self.params_counV, X_train, X_test, y_train, y_test, heading_end=':')

    raise TypeError ('El método indicado no se puede ejecutar')

  def _searchFitAndReport(self, params: dict, X_train: pd.Series, X_test: pd.Series, y_train: pd.Series, y_test: pd.Series, heading_end: str) -> tuple:
    """Grid-search self.pipe over params, predict on both sets and print the metrics.

    heading_end is appended to the two metric headings.
    """
    self.best_model = self.gridSearch(pipeline = self.pipe, params = params)
    model = self.fitGridModel(best_model = self.best_model, X_train= X_train, y_train= y_train)
    print('\nObtenemos los valores optimos para los hiperparámtros')
    print('<<<<<<<<<<<<<<<<<Best Hyperparamns>>>>>>>>>>>>>>>>>>>')
    self.printDict(model.best_params_)
    train_predict, test_predict = self.predictModel(model,X_train= X_train,X_test= X_test)
    print(f'\nMétricas obtenidas en el conjunto de train{heading_end}')
    self.evaluateModel(y_train,train_predict)
    print(f'\nMétricas obtenidas en el conjunto de test{heading_end}')
    self.evaluateModel(y_test,test_predict)
    return model.best_params_,train_predict, test_predict

  def pipelineMethod(self, method_extraction:str) -> "Pipeline":
    """Build the pipeline for the given extraction method.
      : param method_extraction: 'tfidf' or 'countVectorizer'
      return : Pipeline
      raises ValueError: for any other value (instead of silently returning None)"""
    if method_extraction not in ('tfidf', 'countVectorizer'):
      raise ValueError(f'method_extraction no válido: {method_extraction!r}')
    print('[INFO] Generando el pipeline...')
    if method_extraction == 'tfidf':
      vectorizer_step = ('tfidf', TfidfVectorizer(ngram_range=(1, 1), min_df=1, max_df=1.0, max_features = 2500))
    else:
      vectorizer_step = ('countvectorizer',CountVectorizer())
    return Pipeline([
        vectorizer_step,
        ('chi2', SelectKBest(chi2)),
        # Same iteration cap for both vectorizers, so the solver has room to converge.
        ('logistic', LogisticRegression(max_iter=1000))
    ])

  def gridSearch(self, pipeline: "Pipeline", params: dict, cv: int = 5) -> "GridSearchCV":
    """Create the (unfitted) GridSearchCV object.
     : param pipeline: Pipeline to tune
     : param params: dict with the hyper-parameters and the candidate values to try
     : param cv: int, default 5. Number of cross-validation folds
     return : GridSearchCV instance"""
    print('\n[INFO] Realizando el gridSeach ...')
    grid_search = GridSearchCV(pipeline, params, cv=cv, n_jobs=-1, verbose=1)
    return grid_search

  def fitGridModel(self, best_model: "GridSearchCV", X_train: pd.Series, y_train: pd.Series) -> "GridSearchCV":
    """Fit the given GridSearchCV on the training data.
      : param best_model: GridSearchCV to fit
      : param X_train: pd.Series  training documents
      : param y_train: pd.Series  training labels
      return : the same GridSearchCV object, fitted"""
    best_model.fit(X_train, y_train)
    return best_model

  def predictModel(self, model, X_train: pd.Series,X_test: pd.Series) -> tuple:
    """Predict the labels of the train and test sets.
      : param model: fitted GridSearchCV
      : param X_train: pd.Series  training documents
      : param X_test: pd.Series  test documents
      return : tuple (train_predict, test_predict)"""
    print('\n[INFO] Realizando las predicciones del conjunto de train ...')
    train_predict = model.predict(X_train)
    print('[INFO] Realizando las predicciones del conjunto de test ...')
    test_predict = model.predict(X_test)
    return train_predict, test_predict

  @staticmethod
  def evaluateModel(y_true: pd.Series, y_pred: pd.Series) -> None:
    """Compute and print accuracy, precision, recall, F1 and the confusion matrix.

    The scikit-learn metric functions are called with their default arguments.
      : param y_true: true labels
      : param y_pred: predicted labels
      return : None"""
    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred)

    print('Accuracy: {:.4f}'.format(acc))
    print('Precision: {:.4f}'.format(precision))
    print('Recall: {:.4f}'.format(recall))
    print('F1-Score: {:.4f}'.format(f1))
    print('Confusion Matrix:\n{}'.format(cm))

  def printDict(self,dic: dict) -> None:
    """Print every item of the dictionary as '{key}:    {value}'.
     : param dic: dict
     return : None"""
    for key, value in dic.items():
      print(f'{key}:    {value}')




In [None]:
# Scratch pipeline (CountVectorizer -> chi2 feature selection -> gradient boosting);
# in the cells visible here it is only read by the next cell to list its step names.
pipeline = Pipeline([
            ('countvectorizer', CountVectorizer()),
            ('chi2', SelectKBest(chi2)),
            ('boosting', GradientBoostingClassifier())
          ])


In [None]:
# Names of the pipeline steps. NOTE(review): `a` is reassigned to a TextPreprocessorCNN further down.
a = list(pipeline.named_steps.keys())

In [None]:
# Mount Google Drive (Colab only). NOTE(review): repeats the mount already done in the first cell.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import gensim
import multiprocessing as mp

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
)
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the balanced dataset from the Drive dataset directory.
# NOTE(review): `df` is loaded again from processed_df.csv in a later cell, which replaces this frame.
path = '/content/drive/MyDrive/NLP_practica/datasets/' # dataset directory
df = pd.read_csv(f'{path}df_balanced.csv')

In [None]:
# Word2Vec parameters
W2V_SIZE = 300 # dimensionality of the word vectors
W2V_WINDOW = 7 # number of surrounding words used as context
# 32
W2V_EPOCH = 5 # number of training epochs
W2V_MIN_COUNT = 2 # minimum word frequency

# KERAS
SEQUENCE_LENGTH = 500 # sequence length given to the Keras Embedding layer (input_length)

In [None]:
class PreprocessingCNN():
  """Helpers that prepare review text for a Keras model with Word2Vec embeddings.

  Provides: fitting a Keras Tokenizer, training a gensim Word2Vec model,
  building a frozen Embedding layer from the Word2Vec vectors, and dropping
  rows whose ``text_length`` is 0.
  """

  def __init__(self, vocab_size, max_len, embedding_matrix=None):
    """Store the configuration.

    : param vocab_size: passed to the Keras Tokenizer as ``num_words``
    : param max_len: stored as ``self.max_len``
    : param embedding_matrix: optional pre-built matrix, stored as is
    """
    self.tokenizer = Tokenizer(num_words=vocab_size)
    self.max_len = max_len
    self.embedding_matrix = embedding_matrix

  def generateTokenizer(self, train_df):
    """Fit a new Tokenizer on the given texts.

    NOTE(review): this fits a fresh, unbounded Tokenizer and leaves
    ``self.tokenizer`` untouched — confirm that is intended.
    : param train_df: texts passed to ``Tokenizer.fit_on_texts``
    return : tuple (tokenizer, vocab_size) where vocab_size = len(word_index) + 1
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_df)
    vocab_size = len(tokenizer.word_index) + 1
    print(f"Total words: {vocab_size}")
    return tokenizer, vocab_size

  def generateWord2vec(self, train_df):
    """Train a Word2Vec model on the whitespace-split ``review`` column.

    Works with both gensim 3.x and gensim >= 4, which renamed ``size`` to
    ``vector_size`` and replaced ``wv.vocab`` with ``wv.key_to_index``.
    : param train_df: object with a ``review`` attribute holding the texts
    return : the trained Word2Vec model
    """
    documents = [_text.split() for _text in train_df.review]
    w2v_options = dict(
        window=W2V_WINDOW,
        min_count=W2V_MIN_COUNT,
        workers=mp.cpu_count(),
    )
    try:
      w2v_model = gensim.models.word2vec.Word2Vec(vector_size=W2V_SIZE, **w2v_options)
    except TypeError:
      # gensim < 4 only knows the old keyword name.
      w2v_model = gensim.models.word2vec.Word2Vec(size=W2V_SIZE, **w2v_options)
    w2v_model.build_vocab(documents)

    keyed_vectors = w2v_model.wv
    # gensim >= 4 removed `wv.vocab`; fall back to it only on older versions.
    words = keyed_vectors.key_to_index if hasattr(keyed_vectors, 'key_to_index') else keyed_vectors.vocab
    vocab_size = len(words)
    print(f"Vocab size: {vocab_size}")
    w2v_model.train(documents, total_examples=len(documents), epochs=W2V_EPOCH)

    return w2v_model

  def generateEmbedding(self, word2vec_model, vocab_size, tokenizer):
    """Build a frozen Keras Embedding layer initialised with the Word2Vec vectors.

    Rows of words missing from the Word2Vec model stay all-zero.
    : param word2vec_model: trained Word2Vec model
    : param vocab_size: number of rows of the embedding matrix
    : param tokenizer: fitted Tokenizer whose ``word_index`` maps word -> row
    return : Embedding layer (trainable=False, input_length=SEQUENCE_LENGTH)
    """
    embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
    for word, i in tokenizer.word_index.items():
      if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]
    return Embedding(
        vocab_size,
        W2V_SIZE,
        weights=[embedding_matrix],
        input_length=SEQUENCE_LENGTH,
        trainable=False,
    )

  @staticmethod
  def deleteNan(df: pd.DataFrame) -> pd.DataFrame:
    """Return the rows whose ``text_length`` is not 0 (documents that have a review)."""
    mask = df['text_length'] != 0  # boolean mask
    return df.loc[mask]

  


In [None]:
# Load the processed dataset. NOTE(review): `path` repeats the value set in earlier cells and `df`
# replaces the frame read from df_balanced.csv above.
path = '/content/drive/MyDrive/NLP_practica/datasets/' # dataset directory
df = pd.read_csv(f'{path}processed_df.csv')

In [None]:
# Display the loaded DataFrame.
df

In [None]:
def _build_embedding_matrix(self):
    """Return an all-zero embedding matrix sized to the tokenizer vocabulary.

    ``self`` is any object exposing ``tokenizer.word_index``.  The matrix has
    ``len(word_index) + 1`` rows and 100 columns.
    """
    embedding_dim = 100  # Dimensionality of the word embeddings
    word_index = self.tokenizer.word_index
    vocab_size = len(word_index) + 1  # Adding 1 because of reserved 0 index
    return np.zeros((vocab_size, embedding_dim))


def _create_embedding_layer(self):
    """Build a frozen Keras Embedding layer from ``self.embedding_matrix``.

    ``self`` is any object exposing ``embedding_matrix``, ``max_len`` and
    ``tokenizer``.  When no matrix was supplied, the zero matrix from
    ``_build_embedding_matrix`` is used.  The layer dimensions are read from
    the matrix itself so they always agree with the weights.
    """
    embedding_matrix = self.embedding_matrix
    if embedding_matrix is None:
        # Called as a plain function: in this cell the helper is a module-level
        # function, not a method of the object passed as `self`.
        embedding_matrix = _build_embedding_matrix(self)
    embedding_matrix = np.asarray(embedding_matrix)
    vocab_size, embedding_dim = embedding_matrix.shape
    embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=self.max_len,
                                weights=[embedding_matrix], trainable=False)
    return embedding_layer

In [None]:
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np
import multiprocessing as mp

class TextPreprocessorCNN:
    """Prepare review text for a Keras sequence model.

    On construction the frame is split into train/test and length statistics
    of the training reviews are printed.  ``preprocess`` then tokenizes, pads,
    trains a Word2Vec model and builds a frozen Embedding layer from it.
    """

    # Size of the Word2Vec vectors and therefore of the embedding matrix columns.
    EMBEDDING_DIM = 300

    def __init__(self, df:pd.DataFrame,num_words:int = 1000):
        """Split ``df``, print the review statistics and create the Tokenizer.

        : param df: frame with the columns 'processed_tokens' and 'sentiment_label'
        : param num_words: passed to the Keras Tokenizer as ``num_words``
        """
        self.X_train, self.X_test, self.y_train, self.y_test = self.trainTestSplit(df)
        self.reviewMetrics()
        self.tokenizer_object = Tokenizer(num_words= num_words)

    def preprocess(self, maxlen: int):
        """Run tokenization, padding, Word2Vec training and embedding creation.

        : param maxlen: length every sequence is padded/truncated to; also used
          as the ``input_length`` of the embedding layer
        return : tuple (padded_train, padded_test, embedding_layer, y_train, y_test)
        """
        sequences_train, sequences_test, vocab_size = self.tokenizer()
        padded_sequences_train = self.paddingSequences(sequences_train, maxlen)
        padded_sequences_test = self.paddingSequences(sequences_test, maxlen)
        w2v_model = self.generate_word2vec(sequences_train)
        embedding_layer = self.generate_embedding(
            word2vec_model=w2v_model, vocab_size=vocab_size, input_length=maxlen)
        return padded_sequences_train, padded_sequences_test, embedding_layer, self.y_train, self.y_test

    def trainTestSplit(self, df:pd.DataFrame, train_size: float=0.75, test_size: float=0.25, random_state:int =42, shuffle:bool =True):
        """Split 'processed_tokens' / 'sentiment_label' into train and test parts.

        The split honours the arguments (they used to be ignored in favour of
        hard-coded copies of the defaults).
        return : tuple (X_train, X_test, y_train, y_test)
        """
        X_train, X_test, y_train, y_test = train_test_split(
            df['processed_tokens'],
            df['sentiment_label'],
            train_size=train_size,
            test_size=test_size,
            random_state=random_state,
            shuffle=shuffle,
        )
        return X_train, X_test, y_train, y_test

    def reviewMetrics(self):
        """Print the longest and the mean training-review length, measured in words.

        String reviews are split on whitespace (``len`` of a string would count
        characters); any other sequence is measured with ``len``.  An empty
        training set reports 0 instead of dividing by zero.
        """
        lengths = [
            len(example.split()) if isinstance(example, str) else len(example)
            for example in self.X_train
        ]
        max_length = max(lengths, default=0)
        mean_length = sum(lengths) / len(lengths) if lengths else 0.0

        print('---------------------------------------')
        print(f'El tamaño máximo de review: {max_length}\n')
        print('---------------------------------------')
        print(f'La media de palabras por review: {mean_length}\n')

    def tokenizer(self):
        """Fit the Tokenizer and convert both splits to integer sequences.

        NOTE(review): the Tokenizer is fitted on train AND test texts together,
        so the vocabulary is influenced by the test set — confirm this is acceptable.
        return : tuple (sequences_train, sequences_test, vocab_size) where
          vocab_size = len(word_index) + 1
        """
        sequences = pd.concat([self.X_train, self.X_test],axis=0)

        self.tokenizer_object.fit_on_texts(sequences)
        vocab_size = len(self.tokenizer_object.word_index) + 1
        sequences_train = self.tokenizer_object.texts_to_sequences(self.X_train)
        sequences_test = self.tokenizer_object.texts_to_sequences(self.X_test)
        return sequences_train, sequences_test, vocab_size

    def paddingSequences(self, sequences, maxlen: int):
        """Pad/truncate the sequences to ``maxlen`` with Keras ``pad_sequences``."""
        padded_sequences = pad_sequences(sequences, maxlen=maxlen)
        return padded_sequences

    def generate_word2vec(self, sequences_train):
        """Train a Word2Vec model on the words behind the training sequences.

        The integer ids are mapped back to words through the fitted Tokenizer,
        so the Word2Vec vocabulary uses the same tokens that
        ``generate_embedding`` looks up.  (Training on ``str(sequence)`` produced
        tokens such as '[12,' that never matched a real word.)  Works with both
        gensim 3.x and gensim >= 4.
        : param sequences_train: list of integer sequences from ``tokenizer()``
        return : the trained Word2Vec model
        """
        index_word = self.tokenizer_object.index_word
        documents = [
            [index_word[idx] for idx in sequence if idx in index_word]
            for sequence in sequences_train
        ]
        w2v_options = dict(window=5, min_count=2, workers=mp.cpu_count())
        try:
            w2v_model = Word2Vec(vector_size=self.EMBEDDING_DIM, **w2v_options)
        except TypeError:
            # gensim < 4 only knows the old keyword name.
            w2v_model = Word2Vec(size=self.EMBEDDING_DIM, **w2v_options)
        w2v_model.build_vocab(documents)

        keyed_vectors = w2v_model.wv
        # gensim >= 4 removed `wv.vocab`; fall back to it only on older versions.
        words = keyed_vectors.key_to_index if hasattr(keyed_vectors, 'key_to_index') else keyed_vectors.vocab
        vocab_size = len(words)
        print(f"Vocab size: {vocab_size}")
        w2v_model.train(documents, total_examples=len(documents), epochs=2)

        return w2v_model

    def generate_embedding(self, word2vec_model, vocab_size, input_length: int = 500):
        """Build a frozen Keras Embedding layer initialised with the Word2Vec vectors.

        Rows of words missing from the Word2Vec model stay all-zero.
        : param word2vec_model: trained Word2Vec model
        : param vocab_size: number of rows of the embedding matrix
        : param input_length: sequence length of the layer (default 500)
        return : Embedding layer with trainable=False
        """
        embedding_matrix = np.zeros((vocab_size, self.EMBEDDING_DIM))
        for word, i in self.tokenizer_object.word_index.items():
            if word in word2vec_model.wv:
                embedding_matrix[i] = word2vec_model.wv[word]
        return Embedding(
            vocab_size,
            self.EMBEDDING_DIM,
            weights=[embedding_matrix],
            input_length=input_length,
            trainable=False,
        )
    
    

  

    


In [None]:
from keras.preprocessing.text import Tokenizer
from gensim.models import Word2Vec
from tensorflow.keras.layers import (
    Dense,
    Dropout,
    Embedding,
    LSTM,
)

In [None]:
# Split and preprocess the loaded frame: padded train/test sequences (maxlen 1200),
# the embedding layer and the train/test labels.
a = TextPreprocessorCNN(df)
padded_sequences_train,padded_sequences_test ,embedding_layer,y_train, y_test = a.preprocess(maxlen = 1200)

In [None]:
# NOTE(review): neither `padded_sequences` nor `embedding` is assigned in any cell visible in
# this notebook (preprocess() returns padded_sequences_train / padded_sequences_test /
# embedding_layer), so this cell presumably relies on stale kernel state — confirm or remove.
df['padded_sequences'] = padded_sequences
df['embedding'] = embedding

In [None]:
# NOTE(review): `padded_sequences` is not assigned in any cell visible here — confirm or remove.
padded_sequences.shape

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Dropout, GRUV2, SimpleRNN
from tqdm import tqdm
class TrainingCNN:
  """Train and compare three recurrent classifiers (LSTM, GRU, SimpleRNN).

  Each model is Embedding -> recurrent layer -> Dense(1, sigmoid).  ``run``
  trains one model per layer type and returns a table with the train and test
  metrics of each.

  NOTE(review): ``embedding_layer`` is stored but not used by any method of
  this class; every model builds its own trainable Embedding.
  """

  def __init__(self,padded_sequences_train,padded_sequences_test, y_train, y_test ,embedding_layer):
    """Store the padded train/test sequences, their labels and the embedding layer."""
    self.padded_sequences_train= padded_sequences_train
    self.padded_sequences_test= padded_sequences_test
    self.y_train = y_train
    self.y_test = y_test
    self.embedding_layer = embedding_layer

  def run(self, input_dim: int=10882, output_dim: int=100, input_length: int=1200,n_neural: int = 100, epochs: int = 1, batch_size: int = 64):
    """Train one model per recurrent layer type and collect the metrics.

      : param input_dim: vocabulary size of the Embedding layer
      : param output_dim: dimensionality of the embedding vectors
      : param input_length: length of the input sequences
      : param n_neural: number of units of the recurrent layer
      : param epochs: training epochs per model (default 1)
      : param batch_size: training batch size (default 64)
      return : pd.DataFrame with one row per layer type and the columns
        layer, loss/accuracy/precision/recall/f1 for train and for test
    """
    type_layers = ['LSTM','GRU','SimpleRNN']
    columns = ['layer', 'loss_train', 'accuracy_train', 'precision_train', 'recall_train', 'f1_train', 'loss_test', 'accuracy_test', 'precision_test', 'recall_test', 'f1_test']
    # Rows are collected in a list and turned into a frame once:
    # DataFrame.append was removed in pandas 2.0.
    records = []
    for layer in type_layers:
      self.model_cnn_embedding = self.modelCnnEmbedding(rnn_type = layer, input_dim=input_dim, output_dim=output_dim, input_length=input_length,n_neural= n_neural)
      print(f'\n[INFO]Entrenando modelo CNN con capa {layer}')
      self.model_cnn_embedding.fit(self.padded_sequences_train,self.y_train, validation_split=0.2,batch_size=batch_size, epochs=epochs)
      print('\nGeneramos las métricas de train:')
      metrics_train = self.evaluate_model(self.model_cnn_embedding,self.padded_sequences_train,self.y_train)
      print('\nGeneramos las métricas de test:')
      metrics_test = self.evaluate_model(self.model_cnn_embedding,self.padded_sequences_test,self.y_test)
      records.append({
          'layer': layer,
          'loss_train': metrics_train['loss'],
          'accuracy_train': metrics_train['accuracy'],
          'precision_train': metrics_train['precision'],
          'recall_train': metrics_train['recall'],
          'f1_train': metrics_train['f1'],
          'loss_test': metrics_test['loss'],
          'accuracy_test': metrics_test['accuracy'],
          'precision_test': metrics_test['precision'],
          'recall_test': metrics_test['recall'],
          'f1_test': metrics_test['f1'],
      })
    return pd.DataFrame(records, columns=columns)

  def modelCnnEmbedding(self, rnn_type: str, input_dim: int,output_dim:int , n_neural: int, input_length: int):
    """Build and compile Embedding -> recurrent layer -> Dense(1, sigmoid).

      : param rnn_type: 'LSTM', 'GRU' or 'SimpleRNN'
      : param input_dim: vocabulary size of the Embedding layer
      : param output_dim: dimensionality of the embedding vectors
      : param n_neural: number of units of the recurrent layer
      : param input_length: length of the input sequences
      return : compiled Sequential model (binary cross-entropy, adam, accuracy)
      raises ValueError: if rnn_type is not one of the three supported names
    """
    print(f'\n#######################Modelo con capa {rnn_type} ####################################\n')
    print(f'[INFO] Generando el modelo...')
    model = Sequential()
    model.add(Embedding(input_dim=input_dim, output_dim=output_dim, input_length=input_length))
    if rnn_type == 'LSTM':
      model.add(LSTM(n_neural))
    elif rnn_type == 'GRU':
      # NOTE(review): GRUV2 is the name imported by this cell; the documented
      # Keras layer name is GRU — confirm GRUV2 exists in the installed version.
      model.add(GRUV2(n_neural))
    elif rnn_type == 'SimpleRNN':
      model.add(SimpleRNN(n_neural))
    else:
      raise ValueError('Invalid RNN type specified. Must be "LSTM", "GRU" or "SimpleRNN".')
    model.add(Dense(1, activation='sigmoid'))
    # Compiled once (the same compile call used to be issued twice).
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints by itself and returns None, so it is not wrapped in print().
    model.summary()
    return model

  def evaluate_model(self, model, X, y):
    """Evaluate the model on (X, y), print the metrics and return them.

    Predictions are thresholded at 0.5 and flattened to 1-D before the
    scikit-learn metrics are computed.
      return : dict with the keys loss, accuracy, precision, recall, f1
    """
    loss, accuracy = model.evaluate(X, y, verbose=0)
    y_pred = model.predict(X)
    y_pred_binary = (np.asarray(y_pred).ravel() >= 0.5).astype(int)
    precision = precision_score(y, y_pred_binary)
    recall = recall_score(y, y_pred_binary)
    f1 = f1_score(y, y_pred_binary)
    metrics = {'loss': loss, 'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1': f1}

    # Print the metrics in a human-readable form
    print(f"Loss: {metrics['loss']:.4f}")
    print(f"Accuracy: {metrics['accuracy']*100:.2f}%")
    print(f"Precision: {metrics['precision']*100:.2f}%")
    print(f"Recall: {metrics['recall']*100:.2f}%")
    print(f"F1-score: {metrics['f1']*100:.2f}%")
    return metrics

  # def modelCnnW2VEmbedding(self, )
    
# Train the three recurrent models with the default settings and keep the metrics table.
prueba = TrainingCNN(padded_sequences_train,padded_sequences_test, y_train, y_test ,embedding_layer)
metrics = prueba.run()

In [None]:
# Display the metrics table returned by TrainingCNN.run().
metrics

In [None]:
# Stand-alone experiment: Embedding -> LSTM(100) -> Dense(1, sigmoid), trained for one epoch.
# NOTE(review): input_dim=9208 differs from the 10882 default of TrainingCNN.run — confirm
# which vocabulary size is the right one.
model = Sequential()
model.add(Embedding(input_dim=9208, output_dim=32, input_length=1200))
model.add(LSTM(100))
model.add(Dense(1, activation='sigmoid'))
# summary() is called before compile(), and its return value is what gets printed here.
print(model.summary())
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(padded_sequences_train,y_train, validation_split=0.2,
          batch_size=64, epochs=1)

1. Unificar las métricas de los modelos normales en un df
2. Redactar las conclusiones y graficar los resultados
3. Comentar las clases
4. Escribir el README