#### Trabajo Práctico 2 : Críticas Cinematográficas

### Grupo 01 - Integrantes:
        Cabrera Rodríguez, Mateo 
        Gonzalez Alejo, Camila
        Bocanegra, Eduardo Martín

# Setup

In [3]:
#!pip install -r ./torch-cuda-118.txt
!pip install -r ./requirements.txt
#!python -m spacy download es_dep_news_trf



### Cargar bibliotecas

In [4]:
import sklearn
import joblib
import pandas as pd
import tensorflow as tf
import plotly as py
import numpy as np
import langdetect
#import spacy     # Disable because incompatibilities with tensorflow and CuDNN
import unicodedata
import nltk
import string
import re

from sklearn.model_selection import GridSearchCV, train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import BernoulliNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn import metrics
from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

#from pysentimiento import create_analyzer
#import transformers

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Show availables GPUs
physical_devices = tf.config.list_physical_devices('GPU')
print("Available GPUs:", len(physical_devices))
print(tf.__version__)

Available GPUs: 1
2.12.0


[nltk_data] Downloading package punkt to /home/jovyan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jovyan/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### Cargar dataset

In [5]:
ds = pd.read_csv("datasets/train.csv")
print(ds.head())

   ID                                          review_es sentimiento
0   0  Uno de los otros críticos ha mencionado que de...    positivo
1   1  Una pequeña pequeña producción.La técnica de f...    positivo
2   2  Pensé que esta era una manera maravillosa de p...    positivo
3   3  Básicamente, hay una familia donde un niño peq...    negativo
4   4  El "amor en el tiempo" de Petter Mattei es una...    positivo


# Visualización de los datos

In [7]:
x = ds['sentimiento'].value_counts()
fig = py.graph_objs.Figure([py.graph_objs.Bar(x=x.index, y=x.values)])
fig.update_layout(title_text='Distribución de tweets reales y falsos')
fig.show()

Observamos el gráfico y confirmamos que es un data set balanceado. No hará falta hacer over o undersampling.

In [5]:
# Cantidad de reseñas que no estan en español
def count_non_spanish(reviews):
    non_spanish_count = 0
    for review in reviews:
        if langdetect.detect(review) != 'es': non_spanish_count += 1
    return non_spanish_count

non_spanish_count = count_non_spanish(ds['review_es'])
spanish_count = len(ds['review_es']) - non_spanish_count
non_spanish_bar = py.graph_objs.Bar(x=["Reseñas en otro idioma"], y=[non_spanish_count])
spanish_bar = py.graph_objs.Bar(x=["Reseñas en español"], y=[spanish_count])
fig = py.graph_objs.Figure([spanish_bar, non_spanish_bar])
fig.update_layout(title_text='Distribución de reseñas en español y no español')
fig.show()

# Preprocesamiento de datos

In [59]:
ds = pd.read_csv("datasets/train.csv")

ascii_chars = set(string.printable)
stop_words = set(stopwords.words("spanish"))
#nlp = spacy.load('es_dep_news_trf')

# Compile regex
url_regex = re.compile(r'http:?\S+')
non_ascii_regex = re.compile('[^a-z0-9 \n\.]')
double_space_regex = re.compile(' +')

def _remove_punctuation(review):
    punctuation_chars = ".:,;\n!|?¿¡"
    translator = str.maketrans('', '', punctuation_chars)
    review = review.translate(translator)
    return review

def _remove_single_char_word(review):
    words = review.split()
    words = [word for word in words if len(word) > 1]
    return ' '.join(words)

#def _lemmatize(review, print_msg=False):
#    if print_msg: print("Finished 100 reviews")
#    lemmatized = nlp(review)
#    lemas = [token.lemma_ for token in lemmatized]
#    return ' '.join(lemas)    

def _remove_urls(review):
    return url_regex.sub('', review)


def _remove_non_ascii(review):
    review = unicodedata.normalize('NFD', review)
    review = review.encode('ascii', 'ignore')
    review = review.decode("utf-8")
    review = non_ascii_regex.sub('', review)
    review = double_space_regex.sub(' ', review) # Removes double space
    return review

def _remove_stopwords(review):
    review_without_stopwords = []
    review_split = review.split()
    for word in review_split:
        if word in stop_words: continue
        review_without_stopwords.append(word)
    
    review = ' '.join(review_without_stopwords)
    return double_space_regex.sub(' ', review) # Removes double space

def clean_reviews(ds):
    ds_copy = ds.copy()
    ds_copy['review_es'] = ds_copy['review_es'].str.lower()
    print("Finished lowering")
    ds_copy['review_es'] = ds_copy['review_es'].apply(_remove_punctuation)
    print("Finished removing punctuation")
    ds_copy['review_es'] = ds_copy['review_es'].apply(_remove_urls)
    print("Finished removing urls")
    ds_copy['review_es'] = ds_copy['review_es'].apply(_remove_non_ascii)
    print("Finished removing non-ascii chars")
    ds_copy['review_es'] = ds_copy['review_es'].apply(_remove_stopwords)
    print("Finished removing stopwords")
    ds_copy['review_es'] = ds_copy['review_es'].apply(_remove_single_char_word)
    print("Finished removing one-char words")
    #ds_copy['review_es'] = ds_copy['review_es'].applt(_lemmatize)
    #print("Finished lemmatizing")
    return ds_copy

In [62]:
ds_cleaned = clean_reviews(ds)
ds_cleaned = ds_cleaned[ds_cleaned['review_es'].map(langdetect.detect) == 'es']
ds_cleaned['sentimiento'] = [1 if sentiment == "positivo" else 0 for sentiment in ds_cleaned['sentimiento']]
ds_cleaned.to_csv('datasets/train_preprocessed_stopwords.csv', index=False)

Finished lowering
Finished removing punctuation
Finished removing urls
Finished removing non-ascii chars
Finished removing stopwords
Finished removing one-char words


In [71]:
#ds = pd.read_csv('datasets/train_preprocessed_stopwords.csv')
ds = pd.read_csv('datasets/train_preprocessed_stopwords.csv')
reviews = ds['review_es']
sentiments = ds['sentimiento']

vectorizer = CountVectorizer(stop_words=stopwords.words('spanish'))
reviews_vectorized = vectorizer.fit_transform(reviews)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(reviews_vectorized, sentiments, test_size=0.2, random_state=42)

In [None]:
print(sum(y_train) / len(y_train))

In [None]:
X_train.shape

# Entrenamiento de modelos

### Carga de Dataset preprocesado

In [6]:
ds = pd.read_csv("datasets/train_preprocessed_stopwords.csv")
reviews = ds['review_es']
sentiments = ds['sentimiento']
print(ds.head())

   ID                                          review_es  sentimiento
0   0  criticos mencionado despues ver solo oz episod...            1
1   1  pequena pequena produccionla tecnica filmacion...            1
2   2  pense manera maravillosa pasar tiempo fin sema...            1
3   3  basicamente familia nino pequeno jake piensa z...            0
4   4  amor tiempo petter mattei pelicula visualmente...            1


## Bernoulli

In [38]:
nb_model = BernoulliNB()
nb_model.fit(X_train, y_train)
joblib.dump(svm_classifier, f"models/bernoulli.pkl")

['models/bernoulli.pkl']

In [39]:
y_pred = nb_model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.8466
Precision: 0.8670157068062827
Recall: 0.8215915856320699
F1 Score: 0.8436926839209293


In [10]:
test_ds = pd.read_csv("datasets/test.csv")
test_ds["sentimiento"] = nb_model.predict(vectorizador.transform(test_ds["review_es"]))
# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID","sentimiento"]].to_csv("submissions/binomialNB.csv", index=False)

In [17]:
param_grid = {
    "alpha": [0.1, 0.5, 1.0, 2.0, 5.0, 10.0],
    "binarize": [0.0, 0.5, 1.0, 2.0, 5.0, 10.0],
    "fit_prior": [True, False]
}

nb_model = BernoulliNB()
grid_search = GridSearchCV(estimator=nb_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, sentiments)

print(grid_search.best_params_)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
{'alpha': 0.5, 'binarize': 0.0, 'fit_prior': True}


In [15]:
best_params = {'alpha': 0.5, 'binarize': 0.0, 'fit_prior': True}

In [16]:
nb_model = BernoulliNB(**best_params)
nb_model.fit(X_train, y_train)

y_pred = nb_model.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.8458
Precision: 0.8641949593834618
Recall: 0.8233776542964873
F1 Score: 0.8432926829268294


In [17]:
test_ds = pd.read_csv("datasets/test.csv")
test_ds["sentimiento"] = nb_model.predict(vectorizador.transform(test_ds["review_es"]))
# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID","sentimiento"]].to_csv("submissions/binomialNB_best.csv", index=False)

## Pysentimiento

In [22]:
transformers.logging.set_verbosity(transformers.logging.ERROR)
analyzer = create_analyzer(task="sentiment", lang="es")

In [23]:
prediction = analyzer.predict(reviews[0])

print("positivo" if prediction.probas["POS"] > prediction.probas["NEG"] else "negativo")

negativo


In [24]:
test_ds = pd.read_csv("datasets/test.csv")
reviews = test_ds["review_es"].tolist()
for i in range(len(reviews)):
    prediction = analyzer.predict(reviews[i])
    test_ds.loc[i, "sentimiento"] = "positivo" if prediction.probas["POS"] > prediction.probas["NEG"] else "negativo"

test_ds[["ID","sentimiento"]].to_csv("submissions/pysentimiento.csv", index=False)


## Random Forest

In [17]:
#Random forest

FOLDS = 6
kfold = StratifiedKFold(n_splits=FOLDS)
scorer_fn = make_scorer(f1_score, zero_division=1)

hiperparametros = { 
    'n_estimators': [10, 100, 200, 300, 500],
    'max_features': ['sqrt', 'log2'],
    'max_depth' : [4, 5, 6, 7, 8],
    'criterion' :['gini', 'entropy'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

rfc = RandomForestClassifier(random_state=42)

gs = RandomizedSearchCV(rfc, hiperparametros, scoring=scorer_fn, cv=kfold, n_jobs=-1, verbose=2, n_iter=100)
gs.fit(X_train, sentiments)

Fitting 6 folds for each of 100 candidates, totalling 600 fits
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has alrea

In [19]:
print(gs.best_params_)

{'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 8, 'criterion': 'gini'}


In [36]:
best_params = {'n_estimators': 500, 'min_samples_split': 2, 'min_samples_leaf': 4, 'max_features': 'sqrt', 'max_depth': 8, 'criterion': 'gini'}

rfc = RandomForestClassifier(**best_params)
rfc.fit(X_train, y_train)
joblib.dump(svm_classifier, f"models/randomforest.pkl")

['models/randomforest.pkl']

In [28]:
y_pred = rfc.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.8261
Precision: 0.810267017675818
Recall: 0.8551299861083548
F1 Score: 0.8320942357825625


In [30]:
test_ds = pd.read_csv("datasets/test.csv")
test_ds["sentimiento"] = rfc.predict(vectorizador.transform(test_ds["review_es"]))
# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID","sentimiento"]].to_csv("submissions/randomforest.csv", index=False)

## XGBoost

In [15]:
# XGBoost
from xgboost import XGBClassifier

FOLDS = 6
kfold = StratifiedKFold(n_splits=FOLDS)
scorer_fn = make_scorer(f1_score, zero_division=1)

hiperparametros = {
    'n_estimators': [10, 100, 200, 300, 500],
    'max_depth': [4, 5, 6, 7, 8],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3],
    'gamma': [0, 0.1, 0.2, 0.3, 0.4],
    'min_child_weight': [1, 2, 3, 4, 5],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9],
    'colsample_bytree': [0.5, 0.6, 0.7, 0.8, 0.9]
}

xgb = XGBClassifier(random_state=42)

gs = RandomizedSearchCV(xgb, hiperparametros, scoring=scorer_fn, cv=kfold, n_jobs=-1, verbose=2, n_iter=100)
gs.fit(X_train, sentiments)

print(gs.best_params_)

Fitting 6 folds for each of 100 candidates, totalling 600 fits
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.1, max_depth=7, min_child_weight=1, n_estimators=300, subsample=0.9; total time= 8.8min
[CV] END colsample_bytree=0.5, gamma=0.4, learning_rate=0.01, max_depth=7, min_child_weight=3, n_estimators=300, subsample=0.8; total time= 7.6min
[CV] END colsample_bytree=0.8, gamma=0.3, learning_rate=0.3, max_depth=8, min_child_weight=4, n_estimators=500, subsample=0.6; total time=15.4min
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05, max_depth=6, min_child_weight=5, n_estimators=300, subsample=0.9; total time= 7.5min
[CV] END colsample_bytree=0.8, gamma=0, learning_rate=0.05, max_depth=5, min_child_weight=5, n_estimators=500, subsample=0.6; total time=13.0min
[CV] END colsample_bytree=0.9, gamma=0.2, learning_rate=0.01, max_depth=7, min_child_weight=2, n_estimators=500, subsample=0.6; total time=16.0min
[CV] END colsample_bytree=0.6, gamma=0.2, learning_rate=0.05,

In [17]:
from xgboost import XGBClassifier

best_params = {'subsample': 0.9, 'n_estimators': 500, 'min_child_weight': 4, 'max_depth': 8, 'learning_rate': 0.2, 'gamma': 0.2, 'colsample_bytree': 0.7}
xgb = XGBClassifier(**best_params)
xgb.fit(X_train, y_train)

# Save best XGBoost
joblib.dump(xgb, f"models/xgboost.pkl")

y_pred = xgb.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.8722
Precision: 0.8643673706645999
Recall: 0.8852947013296288
F1 Score: 0.8747058823529412


In [18]:
test_ds = pd.read_csv("datasets/test.csv")
test_ds["sentimiento"] = xgb.predict(vectorizador.transform(test_ds["review_es"]))
# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID","sentimiento"]].to_csv("submissions/xgboost.csv", index=False)

## SVM

In [14]:
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
svm_classifiers = {}

for kernel in kernels:
    # Create an SVM classifier with the current kernel
    svm_classifier = svm.SVC(kernel=kernel)

    # Train the classifier on the training data
    svm_classifier.fit(X_train, y_train)

    # Make predictions on the test data
    y_pred = svm_classifier.predict(X_test)

    # Evaluate the model
    print(sklearn.metrics.classification_report(y_test, y_pred))
    svm_classifiers[kernel] = svm_classifier

              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.89      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

              precision    recall  f1-score   support

           0       0.89      0.80      0.84      4961
           1       0.82      0.90      0.86      5039

    accuracy                           0.85     10000
   macro avg       0.85      0.85      0.85     10000
weighted avg       0.85      0.85      0.85     10000

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      4961
           1       0.88      0.91      0.90      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000

              preci

In [17]:
# Guardamos los modelos entrenados

for kernel, svm_classifier in svm_classifiers.items():
    joblib.dump(svm_classifier, f"models/svm_classifier_{kernel}.pkl")

In [None]:
test_ds = pd.read_csv("datasets/test.csv")
for classfier in svm_classifiers:
    test_ds["sentimiento"] = svm_classifiers[classfier].predict(vectorizador.transform(test_ds["review_es"]))
    # convert "sentimiento" to "positivo" or "negativo"
    test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
    test_ds[["ID","sentimiento"]].to_csv("submissions/svm_" + classfier + ".csv", index=False)

Observamos que el mejor fue el Radial Basis Function kernel, por lo que vamos a buscar hiperparámetros para ver si podemos mejorar ese.

In [19]:
# Observamos que el mejor fue el Radial Basis Function kernel, por lo que vamos a buscar hiperparámetros para ver si podemos mejorar ese.

FOLDS = 6
kfold = StratifiedKFold(n_splits=FOLDS)
scorer_fn = make_scorer(f1_score)

hiperparametros = {
    'C': [0.01, 0.1, 1, 2],
    'gamma': ["auto", "scale"],
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'degree': [1, 2, 3, 4, 5]
}

svm_classifier = svm.SVC()

gs = RandomizedSearchCV(svm_classifier, hiperparametros, scoring=scorer_fn, cv=kfold, n_jobs=-1, verbose=2, n_iter=10)
gs.fit(X_train, sentiments)

print(gs.best_params_)

ValueError: Found input variables with inconsistent numbers of samples: [40000, 50000]

In [31]:
best_params = {'kernel': 'rbf', 'gamma': 'scale', 'degree': 3, 'C': 1}
# Create the best SVM classfier
svm_classifier = svm.SVC(**best_params)

# Train the classifier on the training data
svm_classifier.fit(X_train, y_train)

# Save best SVM
joblib.dump(svm_classifier, f"models/svm_classifier_{best_params['kernel']}_best.pkl")

# Make predictions on the test data
y_pred = svm_classifier.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
print("F1 Score:", metrics.f1_score(y_test, y_pred))

Accuracy: 0.8641
Precision: 0.8494113178883403
Recall: 0.887676126215519
F1 Score: 0.8681222707423581


In [33]:
# Make Kaggle predictions
test_ds = pd.read_csv("datasets/test.csv")
test_ds["sentimiento"] = svm_classifier.predict(vectorizador.transform(test_ds["review_es"]))
# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID","sentimiento"]].to_csv("submissions/svm_rbf_best.csv", index=False)

## Redes Neuronales

In [8]:
max_words = None # Numero de las palabras mas frecuentes a considerar
MAX_LEN = 500 # Longitud máxima de las secuencias

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(reviews)
sequences = tokenizer.texts_to_sequences(reviews)
# Rellena o corta las secuencias al mismo tamaño
data = pad_sequences(sequences, maxlen=MAX_LEN)

# Divide los datos en conjuntos de entrenamiento y prueba
X_train_nn, X_test_nn, y_train_nn, y_test_nn = train_test_split(data, sentiments, test_size=0.2, random_state=42)
y_train_nn = np.array(y_train_nn)
y_test_nn = np.array(y_test_nn)

# Tamaño del vocabulario
vocabulary_size = len(tokenizer.word_index) + 1

In [10]:
import tensorflow as tf
from tensorflow import keras
from keras import regularizers
from keras.callbacks import EarlyStopping
import numpy as np

# Definimos la arquitectura de la red neuronal
modelo = keras.Sequential([
    keras.layers.Embedding(input_dim=vocabulary_size, output_dim=256, input_length=MAX_LEN),
    keras.layers.LSTM(50, activation='tanh', return_sequences=True, kernel_regularizer=regularizers.l2(0.01)),
    keras.layers.Dropout(0.5),
    keras.layers.Flatten(),
    keras.layers.Dense(1, activation='sigmoid')  # Por ser problema de clasificación binaria 
])

optimizer = keras.optimizers.Adam(learning_rate=1e-6)
modelo.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])

# Usamos Early Stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Entramos al modelo
modelo.fit(X_train_nn, y_train_nn, epochs=50, batch_size=32, validation_data=(X_test_nn, y_test_nn), callbacks=[early_stopping])

# Evaluar el modelo
loss, accuracy = modelo.evaluate(X_test_nn, y_test_nn)
print("Loss:", loss)
print("Accuracy:", accuracy)


Epoch 1/50


2023-06-28 02:34:05.872422: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-28 02:34:05.873633: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-28 02:34:05.874571: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

UnknownError: Graph execution error:

Fail to find the dnn implementation.
	 [[{{node CudnnRNN}}]]
	 [[sequential_1/lstm_1/PartitionedCall]] [Op:__inference_train_function_6905]

In [72]:
# Entramos al modelo
modelo.fit(X_train_nn, y_train_nn, epochs=10, batch_size=32, validation_data=(X_test_nn, y_test_nn), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f0eb2dace10>

In [75]:
# Save NN
MODEL_NAME = f"./models/nn_LSTM_50_preprocessed_output_dim_256"
modelo.save(MODEL_NAME)

2023-06-27 21:40:47.004618: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_2_grad/concat/split_2/split_dim' with dtype int32
	 [[{{node gradients/split_2_grad/concat/split_2/split_dim}}]]
2023-06-27 21:40:47.005918: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients/split_grad/concat/split/split_dim' with dtype int32
	 [[{{node gradients/split_grad/concat/split/split_dim}}]]
2023-06-27 21:40:47.006897: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

INFO:tensorflow:Assets written to: ./models/nn_LSTM_50_no_stopwords_only_spanish_output_dim_256/assets


INFO:tensorflow:Assets written to: ./models/nn_LSTM_50_no_stopwords_only_spanish_output_dim_256/assets


In [38]:
MODEL_NAME = f"./models/nn_LSTM_50_no_stopwords_only_spanish_dim_256"
modelo = keras.models.load_model(MODEL_NAME)

2023-06-27 19:51:52.340505: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_split_2_grad_concat_split_2_split_dim' with dtype int32
	 [[{{node gradients_split_2_grad_concat_split_2_split_dim}}]]
2023-06-27 19:51:52.340585: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You must feed a value for placeholder tensor 'gradients_split_grad_concat_split_split_dim' with dtype int32
	 [[{{node gradients_split_grad_concat_split_split_dim}}]]
2023-06-27 19:51:52.340643: I tensorflow/core/common_runtime/executor.cc:1197] [/device:CPU:0] (DEBUG INFO) Executor start aborting (this does not indicate an error and you can ignore this message): INVALID_ARGUMENT: You mus

In [74]:
from sklearn.metrics import confusion_matrix, classification_report

# Obtener las predicciones del modelo en el conjunto de prueba
y_pred = modelo.predict(X_test_nn)
y_pred_rounded = np.where(y_pred >= 0.5, 1, 0)

# Calcular la matriz de confusión
confusion_mat = confusion_matrix(y_test_nn, y_pred_rounded)

# Imprimir la matriz de confusión
print("Matriz de Confusión:")
print(confusion_mat)

# Obtener otras métricas de evaluación
classification_rep = classification_report(y_test_nn, y_pred_rounded)

# Imprimir el informe de clasificación con métricas como precisión, recall y f1-score
print("Informe de Clasificación:")
print(classification_rep)


Matriz de Confusión:
[[4115  749]
 [ 791 3982]]
Informe de Clasificación:
              precision    recall  f1-score   support

           0       0.84      0.85      0.84      4864
           1       0.84      0.83      0.84      4773

    accuracy                           0.84      9637
   macro avg       0.84      0.84      0.84      9637
weighted avg       0.84      0.84      0.84      9637



In [76]:
test_ds = pd.read_csv("datasets/test.csv")

reviews_to_predict = test_ds["review_es"]
filtered_reviews_to_predict = filter_stopwords(reviews_to_predict, stopwords)
#filtered_reviews_to_predict = reviews_to_predict
sequences = tokenizer.texts_to_sequences(filtered_reviews_to_predict)
data = pad_sequences(sequences, maxlen=MAX_LEN)


test_ds["sentimiento"] = modelo.predict(data)
print(test_ds["sentimiento"][:100])
# Redondear los valores de sentimiento a 0 o 1
test_ds["sentimiento"] = round(test_ds["sentimiento"])
print(test_ds["sentimiento"][:100])

# convert "sentimiento" to "positivo" or "negativo"
test_ds["sentimiento"] = test_ds["sentimiento"].apply(lambda x: "positivo" if x == 1 else "negativo")
test_ds[["ID", "sentimiento"]].to_csv("submissions/nn_LSTM_50_no_stopwords_only_spanish_output_dim_256.csv", index=False)

0     0.011234
1     0.622728
2     0.447009
3     0.647730
4     0.131043
        ...   
95    0.144608
96    0.328653
97    0.011422
98    0.170269
99    0.647323
Name: sentimiento, Length: 100, dtype: float32
0     0.0
1     1.0
2     0.0
3     1.0
4     0.0
     ... 
95    0.0
96    0.0
97    0.0
98    0.0
99    1.0
Name: sentimiento, Length: 100, dtype: float32
