# Modelamiento de Datos

In [91]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from imblearn.over_sampling import SMOTE

In [92]:
# cargar datos train y test

train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [93]:
train.head()

Unnamed: 0,bank,rating,review_title_by_user,review,rating_title_by_user,useful_count,review_length,positive_words,negative_words
0,review,5.0,"""Best account""",transaction indian bank branch near residence ...,Blown Away!,13,23,1,0
1,HDFC Bank,3.5,"""Need to improve the service""",holding salary account hdfc bank past year som...,Pretty good,0,38,0,0
2,HDFC Bank,3.5,"""Good bank""",year salary account hdfc made transaction acco...,Pretty good,0,23,1,0
3,SBI,5.0,"""Good account""",sbi service good glad account opened account y...,Blown Away!,0,27,2,0
4,HDFC Bank,4.5,"""Good """,month opened salary account hdfc bank hidden c...,Excellent!,2,27,0,0


In [94]:
test.head()

Unnamed: 0,bank,rating,review_title_by_user,review,rating_title_by_user,useful_count,review_length,positive_words,negative_words
0,HDFC Bank,5.0,"""Best salary account""",holding salary account hdfc bank past month cu...,Blown Away!,0,24,1,0
1,HDFC Bank,5.0,"""Excellent service """,holding salary account hdfc bank year zero bal...,Blown Away!,0,36,0,0
2,Axis Bank,5.0,"""Very Good Service""",axis bank saving account past year using net b...,Blown Away!,0,31,0,0
3,review,3.0,"""Good """,holding salary account indian overseas bank ye...,Satisfactory,2,28,1,0
4,Kotak,4.0,"""Good service""",saving account kotak bank year zero balance ac...,Great!,1,23,1,0


In [95]:
# Vectorización TF-IDF
vectorizer = TfidfVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train["review"])
X_test = vectorizer.transform(test["review"])

In [96]:
y_train = train["rating"]
y_test = test["rating"]

In [97]:
# Convertir etiquetas continuas a etiquetas discretas
y_train_discrete = y_train.round().astype(int)
y_test_discrete = y_test.round().astype(int)

In [98]:
# Aplicar SMOTE para balancear las clases
smote = SMOTE(random_state=42)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train, y_train_discrete)

# Verificar la distribución de clases después de aplicar SMOTE
print("Distribución de clases antes de SMOTE:", y_train_discrete.value_counts())
print("Distribución de clases después de SMOTE:", y_train_balanced.value_counts())

Distribución de clases antes de SMOTE: 5    440
4    259
3     61
2     25
1      8
0      7
Name: rating, dtype: int64
Distribución de clases después de SMOTE: 5    440
4    440
3    440
0    440
2    440
1    440
Name: rating, dtype: int64


In [99]:
# Modelos a evaluar
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
    "SVM": SVC(probability=True)
}



In [100]:
# Evaluación de modelos
results = []

for name, model in models.items():
    model.fit(X_train, y_train_discrete)
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test) if hasattr(model, "predict_proba") else None

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test_discrete, y_pred),
        "Precision": precision_score(y_test_discrete, y_pred, average="weighted"),
        "Recall": recall_score(y_test_discrete, y_pred, average="weighted"),
        "F1-score": f1_score(y_test_discrete, y_pred, average="weighted"),
        "AUC-ROC": roc_auc_score(y_test_discrete, y_proba, multi_class="ovr") if y_proba is not None else np.nan
    }

    results.append(metrics)

# Convertir resultados en DataFrame y ordenarlos por F1-score
df_results = pd.DataFrame(results).sort_values(by="F1-score", ascending=False)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [101]:
print(df_results)

                 Model  Accuracy  Precision  Recall  F1-score   AUC-ROC
2              XGBoost      0.47   0.397045    0.47  0.427571  0.598183
0  Logistic Regression      0.50   0.386212    0.50  0.421457  0.664698
3                  SVM      0.55   0.399201    0.55  0.410107  0.668923
1        Random Forest      0.46   0.311543    0.46  0.366077  0.615350


Los valores de precisión son bajos, lo que indica que el modelo no está identificando bien las clases.

## Con Word2Vec/GloVe en lugar de TF-IDF.

In [102]:
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

# Entrenar el modelo Word2Vec
sentences = [review.split() for review in train["review"]]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Crear una función para obtener el vector promedio de las palabras en una reseña
def get_average_word2vec(review, model, vector_size):
    words = review.split()
    word_vecs = [model.wv[word] for word in words if word in model.wv]
    if len(word_vecs) == 0:
        return np.zeros(vector_size)
    return np.mean(word_vecs, axis=0)

# Transformar las reseñas en vectores
X_train_word2vec = np.array([get_average_word2vec(review, word2vec_model, 100) for review in train["review"]])
X_test_word2vec = np.array([get_average_word2vec(review, word2vec_model, 100) for review in test["review"]])

# Escalar los datos
scaler = StandardScaler()
X_train_word2vec = scaler.fit_transform(X_train_word2vec)
X_test_word2vec = scaler.transform(X_test_word2vec)

In [103]:
# Evaluación de modelos con Word2Vec
results_word2vec = []

for name, model in models.items():
    model.fit(X_train_word2vec, y_train_discrete)
    y_pred = model.predict(X_test_word2vec)
    y_proba = model.predict_proba(X_test_word2vec) if hasattr(model, "predict_proba") else None

    metrics = {
        "Model": name,
        "Accuracy": accuracy_score(y_test_discrete, y_pred),
        "Precision": precision_score(y_test_discrete, y_pred, average="weighted"),
        "Recall": recall_score(y_test_discrete, y_pred, average="weighted"),
        "F1-score": f1_score(y_test_discrete, y_pred, average="weighted"),
        "AUC-ROC": roc_auc_score(y_test_discrete, y_proba, multi_class="ovr") if y_proba is not None else np.nan
    }

    results_word2vec.append(metrics)

# Convertir resultados en DataFrame y ordenarlos por F1-score
df_results_word2vec = pd.DataFrame(results_word2vec).sort_values(by="F1-score", ascending=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [104]:
print(df_results_word2vec)

                 Model  Accuracy  Precision  Recall  F1-score   AUC-ROC
2              XGBoost     0.515   0.450155   0.515  0.479028  0.587076
0  Logistic Regression     0.500   0.399938   0.500  0.430326  0.640214
1        Random Forest     0.470   0.374888   0.470  0.411148  0.607596
3                  SVM     0.550   0.302500   0.550  0.390323  0.541446


Los resultados obtenidos no son ideales, lo que sugiere que predecir el rating a partir de los reviews no es una tarea sencilla. Para mejorar los resultados, sería necesario agregar más variables al dataset que puedan contribuir al proceso de prediccion.