<a href="https://colab.research.google.com/github/mg5mariano/MiPrimerApp/blob/main/Rese%C3%B1aPelicula.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk
from nltk.corpus import movie_reviews, stopwords
from nltk.tokenize import word_tokenize
import random
import re

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix

import matplotlib.pyplot as plt
import numpy as np

# Fetch the NLTK resources the rest of the notebook reads.
# Tokenizer models; presumably what word_tokenize needs — TODO confirm for the installed NLTK version.
nltk.download('punkt')
# Stop-word lists (the 'english' list is loaded later via stopwords.words).
nltk.download('stopwords')
# The labelled movie review corpus used as the dataset.
nltk.download('movie_reviews')
# NOTE(review): looks like the tokenizer tables newer NLTK releases look up instead of 'punkt' — confirm.
nltk.download('punkt_tab')

In [None]:
# Build two parallel lists from the movie_reviews corpus: the raw text of
# each file and a binary label (1 when its first category is 'pos', else 0).
fileids = movie_reviews.fileids()
texts, labels = [], []

for fid in fileids:
    texts.append(movie_reviews.raw(fid))
    # The comparison yields a bool; int() turns it into the 1/0 label.
    labels.append(int(movie_reviews.categories(fid)[0] == 'pos'))

# Quick sanity check: corpus size, first file's category, start of its text.
print("Total documentos:", len(texts))
print("Ejemplo (primer doc, categoría):", movie_reviews.categories(fileids[0]))
print(texts[0][:400], "...\n")


In [None]:
# English stop words as a set, so the membership test below is a hash lookup.
stop_words = set(stopwords.words('english'))

def preprocess_text_eng(text):
    """Normalize an English text into a space-joined string of content tokens.

    Steps: lowercase the text, replace every character that is not a-z or
    whitespace with a space, tokenize with ``word_tokenize``, then drop
    stop words and single-character tokens.

    Args:
        text: The raw document as a string.

    Returns:
        The surviving tokens joined by single spaces.
    """
    lowered = text.lower()
    letters_only = re.sub(r'[^a-z\s]', ' ', lowered)

    kept = []
    for token in word_tokenize(letters_only):
        if len(token) > 1 and token not in stop_words:
            kept.append(token)
    return " ".join(kept)

# Preprocess every document once; the list keeps the same order as `texts`.
texts_proc = list(map(preprocess_text_eng, texts))

# Show the first document before and after preprocessing.
print("ORIGINAL (recortado):\n", texts[0][:250], "...\n")
print("PROCESADO (recortado):\n", texts_proc[0][:250], "...\n")

In [None]:
# Hold out 20% of the documents for testing. The split is stratified on the
# labels and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    texts_proc,
    labels,
    test_size=0.2,
    random_state=42,
    stratify=labels,
)

# Labels are 0/1, so summing them counts the positive documents.
print("Entrenamiento:", len(X_train), " Prueba:", len(X_test))
print("Distribución train:", sum(y_train), "positivos de", len(y_train))
print("Distribución test:", sum(y_test), "positivos de", len(y_test))


In [None]:
# First model: token counts fed to a multinomial Naive Bayes classifier.
pipe_nb = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('clf', MultinomialNB()),
])

# fit() returns the fitted pipeline, so training and test prediction chain.
y_pred_nb = pipe_nb.fit(X_train, y_train).predict(X_test)


In [None]:
# Second model: TF-IDF features fed to logistic regression, with the
# iteration cap set to 1000.
pipe_lr = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LogisticRegression(max_iter=1000)),
])

# fit() returns the fitted pipeline, so training and test prediction chain.
y_pred_lr = pipe_lr.fit(X_train, y_train).predict(X_test)


In [None]:
def evaluar_modelo(y_true, y_pred, nombre="Modelo"):
    """Print classification metrics for one model and return its confusion matrix.

    Prints accuracy, precision, recall and F1, followed by the full
    classification report and the confusion matrix.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        nombre: Name shown in the header line of the printed output.

    Returns:
        The matrix produced by ``confusion_matrix(y_true, y_pred)``.
    """
    print(f"--- {nombre} ---")

    # (printed label, metric function, extra keyword arguments). Each metric
    # is computed and printed in turn, in the order listed.
    resumen = (
        ("Accuracy :", accuracy_score, {}),
        ("Precision:", precision_score, {"zero_division": 0}),
        ("Recall   :", recall_score, {"zero_division": 0}),
        ("F1 score :", f1_score, {"zero_division": 0}),
    )
    for etiqueta, metrica, opciones in resumen:
        print(etiqueta, metrica(y_true, y_pred, **opciones))

    reporte = classification_report(y_true, y_pred, zero_division=0)
    print("\nClassification report:\n", reporte)

    matriz = confusion_matrix(y_true, y_pred)
    print("Confusion matrix:\n", matriz)
    return matriz

# Evaluate both models on the same held-out labels; keep each confusion matrix.
cm_nb = evaluar_modelo(y_test, y_pred_nb, nombre="CountVectorizer + MultinomialNB")
cm_lr = evaluar_modelo(y_test, y_pred_lr, nombre="TfidfVectorizer + LogisticRegression")


In [None]:
def plot_confusion(cm, title, labels=('Neg', 'Pos')):
    """Draw a confusion matrix as an annotated heatmap.

    Args:
        cm: 2-D confusion matrix (rows = real class, columns = predicted
            class). Any array-like is accepted; it is converted with
            ``np.asarray``.
        title: Title shown above the plot.
        labels: Tick labels for both axes, in class order. Defaults to the
            binary ``('Neg', 'Pos')`` labelling.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)
    labels = list(labels)

    fig, ax = plt.subplots(figsize=(4, 4))
    im = ax.imshow(cm, interpolation='nearest', cmap='Blues')
    ax.set_title(title)
    ax.set_xlabel('Predicho')
    ax.set_ylabel('Real')

    # One tick per label instead of a hard-coded two.
    ticks = np.arange(len(labels))
    ax.set_xticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticks(ticks)
    ax.set_yticklabels(labels)

    # Cells above the midpoint of the value range are dark in 'Blues', so
    # their count is drawn in white to stay readable.
    threshold = (cm.max() + cm.min()) / 2.0 if cm.size else 0.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            color = 'white' if cm[i, j] > threshold else 'black'
            ax.text(j, i, cm[i, j], ha='center', va='center', color=color)

    fig.colorbar(im, ax=ax)
    plt.show()

# One heatmap per model, from the matrices returned by evaluar_modelo.
plot_confusion(cm_nb, title="Confusion - NB")
plot_confusion(cm_lr, title="Confusion - LR")


In [None]:
def mostrar_errores(X_test_raw, X_test_proc, y_true, y_pred, n=6):
    """Print the misclassified test examples.

    Args:
        X_test_raw: Optional sequence of original (unprocessed) texts aligned
            with ``X_test_proc``. When it is not ``None``, the raw text of
            each error is printed as well. Pass ``None`` to skip it.
        X_test_proc: Sequence of preprocessed texts, indexable by position.
        y_true: Ground-truth labels, aligned with ``X_test_proc``.
        y_pred: Predicted labels, aligned with ``y_true``.
        n: Maximum number of errors to print.

    Returns:
        None. Everything is written to standard output.
    """
    # Keep the compared values with their position so they are not looked up
    # again by index. This also works for label-indexed containers.
    errores = [
        (i, real, pred)
        for i, (real, pred) in enumerate(zip(y_true, y_pred))
        if real != pred
    ]
    print(f"Total errores: {len(errores)}. Mostrando hasta {n} ejemplos:\n")
    for i, real, pred in errores[:n]:
        print("Índice:", i)
        print("Etiqueta real:", real, "Predicha:", pred)
        if X_test_raw is not None:
            print("Texto original (recortado):", X_test_raw[i][:400])
        print("Texto procesado (recortado):", X_test_proc[i][:400])
        print("------\n")

# Inspect the logistic-regression mistakes. No raw texts are passed because
# only the preprocessed texts went through the train/test split.
mostrar_errores(
    X_test_raw=None,
    X_test_proc=X_test,
    y_true=y_test,
    y_pred=y_pred_lr,
    n=6,
)
