In [None]:
import gc

import joblib
import numpy as np
import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import (
    StratifiedGroupKFold,
    StratifiedKFold,
    cross_val_predict,
    train_test_split,
)
from transformers import BertModel, BertTokenizer

In [None]:
# Load the pretrained multilingual BERT tokenizer and model.
# In the cells shown here, `modelo` is only used by `crear_features`, which
# reads rows of its input word-embedding table.

modelo_bert  = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(modelo_bert)
modelo  = BertModel.from_pretrained(modelo_bert)
modelo .eval()  # evaluation mode: the BERT model is not trained in this notebook

In [None]:
# Rebuild words from WordPiece subtokens

def reconstruir_palabras(df):
    """
    Add a ``palabra_id`` column telling which word each token belongs to.

    Within every ``instancia_id`` the words are numbered 1, 2, 3, ... in row
    order. A token that starts with ``"##"`` is a continuation piece and gets
    the id of the word opened before it; any other token opens a new word.
    A leading ``"##"`` token (no word opened yet) gets id 0.

    The ids are computed with an index-aligned grouped cumulative sum, so the
    result is correct even when the rows are not sorted by ``instancia_id``
    or when the rows of different instances are interleaved.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``instancia_id`` and ``token`` (strings).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, with ``palabra_id`` added
        (the frame is modified in place).
    """
    # A token opens a new word unless it is a "##" continuation piece.
    inicia_palabra = ~df["token"].str.startswith("##")

    # Running count of word starts inside each instance. `cumsum` keeps the
    # original row order, so `.to_numpy()` lines up with `df` row by row.
    df["palabra_id"] = (
        inicia_palabra.astype(int).groupby(df["instancia_id"]).cumsum().to_numpy()
    )
    return df

In [None]:
# Build word-level features

def crear_features(df, nombres_propios=None, matriz_embeddings=None):
    """
    Compute word-level features and copy them onto every token of the word.

    Columns added (all tokens of a word share the same value):

    - ``palabra``: the word rebuilt by joining its subtokens without "##".
    - ``posicion_palabra``: dense rank of ``palabra_id`` inside the instance.
    - ``total_palabras``: maximum ``palabra_id`` of the instance.
    - ``dist_al_final``: ``total_palabras - posicion_palabra``.
    - ``longitud_palabra``: number of characters of ``palabra``.
    - ``frecuencia_en_instancia``: how many *words* of the instance have the
      same text (counted once per word, not once per subtoken).
    - ``es_nombre``: 1 if ``palabra`` is in ``nombres_propios``, else 0.
    - ``similitud_con_anterior`` / ``similitud_con_siguiente``: cosine
      similarity between the word embedding (mean of its subtoken embeddings)
      and the embedding of the previous / next *word* of the same instance;
      0 when there is no such word.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``instancia_id``, ``palabra_id``, ``token`` and
        ``token_id``. It is not modified.
    nombres_propios : set, optional
        Words considered proper names. Defaults to an empty set.
    matriz_embeddings : array-like of shape (vocab, dim), optional
        Static embedding table indexed by ``token_id``. When omitted, the
        input word-embedding table of the notebook-level ``modelo`` is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with a fresh 0..n-1 index, the original columns and the
        feature columns listed above.
    """
    if nombres_propios is None:
        nombres_propios = set()

    if matriz_embeddings is None:
        # Static (non-contextual) embedding table of the global BERT model.
        # `detach()` drops the autograd link so it can be viewed as numpy.
        matriz_embeddings = (
            modelo.embeddings.word_embeddings.weight.detach().cpu().numpy()
        )
    else:
        matriz_embeddings = np.asarray(matriz_embeddings)

    claves = ["instancia_id", "palabra_id"]

    # Work on a new frame with a positional index; the caller's frame is untouched.
    df = df.reset_index(drop=True)

    # --- word table: one row per (instancia_id, palabra_id) -----------------
    # Integer code of the word each token row belongs to, numbered in order
    # of first appearance.
    codigo_palabra = df.groupby(claves, sort=False).ngroup().to_numpy()
    # First row of every code, sorted by code: row k of `palabras` <-> code k.
    _, primera_fila = np.unique(codigo_palabra, return_index=True)
    palabras = df.iloc[primera_fila][claves].reset_index(drop=True)
    n_palabras = len(palabras)

    # Word text: subtokens joined in row order, without the "##" prefix.
    texto_token = df["token"].str.replace(r"^##", "", regex=True)
    palabras["palabra"] = texto_token.groupby(codigo_palabra).agg("".join).to_numpy()

    # Occurrences of the same text inside the instance, counted per word.
    palabras["frecuencia_en_instancia"] = (
        palabras.groupby(["instancia_id", "palabra"])["palabra"].transform("count")
    )

    # --- word embeddings: mean of the subtoken embeddings --------------------
    emb_tokens = matriz_embeddings[df["token_id"].to_numpy(dtype=np.int64)]
    suma = np.zeros((n_palabras, emb_tokens.shape[1]), dtype=np.float64)
    np.add.at(suma, codigo_palabra, emb_tokens)  # scatter-add rows per word
    n_subtokens = np.bincount(codigo_palabra, minlength=n_palabras)
    emb_palabras = suma / n_subtokens[:, None]

    # --- cosine similarity with the neighbouring word ------------------------
    indice_por_instancia = pd.Series(np.arange(n_palabras)).groupby(
        palabras["instancia_id"]
    )

    def similitud_con_vecina(desplazamiento):
        """Cosine similarity of each word with the word `desplazamiento` rows away
        in the same instance (0.0 where that neighbour does not exist)."""
        vecina = indice_por_instancia.shift(desplazamiento)
        tiene_vecina = vecina.notna().to_numpy()
        similitud = np.zeros(n_palabras, dtype=np.float64)
        a = emb_palabras[tiene_vecina]
        b = emb_palabras[vecina[tiene_vecina].to_numpy(dtype=np.int64)]
        normas = np.linalg.norm(a, axis=1) * np.linalg.norm(b, axis=1)
        # Floor on the denominator avoids division by zero for null vectors.
        similitud[tiene_vecina] = (a * b).sum(axis=1) / np.maximum(normas, 1e-8)
        return similitud

    sim_anterior = similitud_con_vecina(1)
    sim_siguiente = similitud_con_vecina(-1)

    # --- copy the word-level values onto every token -------------------------
    df["palabra"] = palabras["palabra"].to_numpy()[codigo_palabra]

    # Word position inside the text
    df["posicion_palabra"] = (
        df.groupby("instancia_id")["palabra_id"].rank(method="dense").astype(int)
    )

    # Number of words minus position
    df["total_palabras"] = df.groupby("instancia_id")["palabra_id"].transform("max")
    df["dist_al_final"] = df["total_palabras"] - df["posicion_palabra"]

    df["longitud_palabra"] = df["palabra"].str.len()
    df["frecuencia_en_instancia"] = (
        palabras["frecuencia_en_instancia"].to_numpy()[codigo_palabra]
    )
    df["es_nombre"] = df["palabra"].isin(nombres_propios).astype(int)
    df["similitud_con_anterior"] = sim_anterior[codigo_palabra]
    df["similitud_con_siguiente"] = sim_siguiente[codigo_palabra]

    # Release the per-chunk embedding buffers before returning.
    del emb_tokens, suma, emb_palabras
    gc.collect()
    return df

In [None]:
#  Build token-level features (on top of the word-level ones)

def crear_features_token(df):
    """
    Add token-level features to a frame that already has word-level features.

    Reads the columns ``instancia_id``, ``palabra_id``, ``token`` and
    ``es_nombre`` and adds:

    - ``is_subtoken``: 1 if the token starts with "##".
    - ``token_idx_en_palabra``: 1-based position of the token inside its word.
    - ``longitud_token``: token length without the "##" prefix.
    - ``es_primer_subtoken`` / ``es_ultimo_subtoken``: first / last piece flags.
    - ``palabra_anterior_es_nombre`` / ``palabra_siguiente_es_nombre``:
      ``es_nombre`` of the previous / next *word* of the same instance
      (0 when there is none). Every subtoken of a word gets the same value.
    - ``token_pos_relativa``: ``token_idx_en_palabra`` divided by the number
      of subtokens of the word.

    Parameters
    ----------
    df : pandas.DataFrame
        Token rows with the columns listed above. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` (same index) with the new columns.
    """
    claves = ["instancia_id", "palabra_id"]

    # Copy so the caller's frame is not mutated.
    df_tok = df.copy()

    # Continuation pieces are marked with "##".
    df_tok["is_subtoken"] = df_tok["token"].str.startswith("##").astype(int)

    # Index of the token inside its word (1 = first subtoken).
    df_tok["token_idx_en_palabra"] = df_tok.groupby(claves).cumcount() + 1

    # Length of the token text without the "##" prefix.
    df_tok["longitud_token"] = (
        df_tok["token"].str.replace("^##", "", regex=True).str.len()
    )

    # First / last subtoken of the word.
    df_tok["es_primer_subtoken"] = (df_tok["token_idx_en_palabra"] == 1).astype(int)
    subtoks_por_palabra = df_tok.groupby(claves)["token"].transform("count")
    df_tok["es_ultimo_subtoken"] = (
        df_tok["token_idx_en_palabra"] == subtoks_por_palabra
    ).astype(int)

    # Is the previous / next WORD a proper name? The shift is done on a
    # one-row-per-word table; shifting token rows would make the subtokens of
    # a multi-piece word look at their own word instead of the neighbour.
    codigo_palabra = df_tok.groupby(claves, sort=False).ngroup().to_numpy()
    _, primera_fila = np.unique(codigo_palabra, return_index=True)
    palabras = (
        df_tok.iloc[primera_fila][["instancia_id", "es_nombre"]]
        .reset_index(drop=True)
    )  # row k <-> word code k
    nombre_por_instancia = palabras.groupby("instancia_id")["es_nombre"]
    anterior = nombre_por_instancia.shift(1).fillna(0).astype(int).to_numpy()
    siguiente = nombre_por_instancia.shift(-1).fillna(0).astype(int).to_numpy()
    df_tok["palabra_anterior_es_nombre"] = anterior[codigo_palabra]
    df_tok["palabra_siguiente_es_nombre"] = siguiente[codigo_palabra]

    # Relative position of the subtoken inside the word (1.0 = last piece).
    df_tok["token_pos_relativa"] = df_tok["token_idx_en_palabra"] / subtoks_por_palabra

    return df_tok

In [None]:
# Set of first names, surnames and character names (all lowercase).
# NOTE: lookups are done per single word (`palabra`), so the multi-word
# entries below ("di maria", "van helsing", ...) can only match if a whole
# word has exactly that text.

nombres_y_apellidos = {

    # Male first names
    "juan","jose","josé","javier","jorge","julian","julián","julio","joel",
    "joaquin","joaquín","miguel","martin","martín","marco","marcos","mateo",
    "matias","matías","maximiliano","manuel","mariano","mauricio","mirko",
    "nahuel","nicolas","nicolás","nazareno","pablo","pedro","patricio",
    "ramiro","ricardo","roberto","rodrigo","román","santiago","sergio",
    "sebastian","sebastián","samuel","tomás","tomas","thiago","tiago",
    "ulises","victor","víctor","valentin","valentín","william","walter",
    "xavier","yago","yamil","zaid","zair","zahir",

    # Female first names
    "ana","andrea","antonella","antonela","agustina","belén","belen","brenda",
    "brisa","bárbara","barbara","camila","celeste","carolina","candela",
    "delfina","daniela","daiana","elena","eliana","emilia","emily","florencia",
    "fernanda","gabriela","graciela","guadalupe","gimena","ximena","helena",
    "ivana","ivonne","jennifer","julieta","jazmín","jazmin","karina","keila",
    "karen","lucia","lucía","luana","luna","luisa","ludmila","maría","maria",
    "mariana","morena","marta","melina","milena","nadia","noelia","natalia",
    "nerina","paola","pamela","patricia","pía","pia","romina","rocío","rocio",
    "sofia","sofía","sol","serena","tamara","tatiana","ursula","vanesa",
    "vanessa","valeria","valentina","violeta","wendy","xiomara","yesica",
    "yésica","yanina","zaira","zoe","zoé",

    # Frequent Argentine surnames
    "gonzalez","gonzález","rodriguez","rodríguez","fernandez","fernández",
    "lopez","lópez","martinez","martínez","garcia","garcía","perez","pérez",
    "sanchez","sánchez","romero","diaz","díaz","pereyra","pereira","ruiz",
    "torres","flores","acosta","benitez","benítez","medina","herrera",
    "castro","nuñez","núñez","ramos","dominguez","domínguez","ortiz",
    "gimenez","giménez","molina","silva","rios","ríos","suarez","suárez",
    "alvarez","álvarez","aguirre","mendoza","paz","vera","juarez","juárez",
    "rivas","gonzaga","montoya","castillo","campos","morales","vargas",
    "lujan","luján","arias","frias","frías","toledo","solis","solís","moyano",
    "correa","pineda","cabrera","vazquez","vázquez","navarro","rosales",
    "espinoza","ospina","manrique","salazar",

    # Very frequent Hispanic surnames
    "moreno","rubio","blanco","marquez","márquez","ibarra","salinas","mejia",
    "ortega","valdez","valdés","caballero","mercedes","ferrer","costas",
    "robles","delgado","rios","montes","cortez","cortes","carvajal","solano",
    "pacheco","maldonado","araujo","padilla","velazquez","velázquez",
    "contreras","sandoval","cordero","miranda","carmona","vidal","rendon",
    "rendón","villalba","villalobos","arrieta",

    # Useful special cases
    "messi","maradona","riquelme","tevez","di maria","dimaria",
    "alberto","cristina","milei","macri",  # trailing comma required: without it
                                           # "macri" fuses with the next literal

    # Classic literature
    "sherlock", "holmes", "watson", "gatsby", "frankenstein", "dracula",
    "harker", "van helsing", "hyde", "jekyll", "albus", "dumbledore",
    "frodo", "samwise", "sam", "gandalf", "aragorn", "boromir", "legolas",
    "bilbo",

    # Harry Potter
    "harry", "potter", "hermione", "granger", "ron", "weasley", "malfoy",
    "draco", "snape", "voldemort", "sirius", "black", "hagrid", "minerva",
    "mcgonagall", "luna", "lovegood", "neville", "longbottom",

    # Star Wars
    "luke", "skywalker", "anakin", "vader", "darth", "leia", "organa",
    "han", "solo", "yoda", "kenobi", "obi-wan", "palpatine", "rey",
    "finn", "poe", "dameron", "chewbacca",

    # Marvel
    "tony", "stark", "ironman", "steve", "rogers", "thor", "loki",
    "natasha", "romanoff", "clint", "barton", "hulk", "banner",
    "peter", "parker", "spiderman", "wanda", "maximoff", "vision",
    "tchalla", "pantera", "negra",

    # DC Comics
    "bruce", "wayne", "batman", "alfred", "pennyworth", "clark", "kent",
    "superman", "lois", "lane", "diana", "prince", "wonderwoman",
    "flash", "barry", "allen", "joker", "harley", "quinn",

    # Video games
    "mario", "luigi", "peach", "bowser", "link", "zelda", "ganondorf",
    "samus", "master", "chief", "kratos", "atreus", "cloud", "sephiroth",
    "sonic", "tails",

    # Series / TV
    "walter", "white", "heisenberg", "jesse", "pinkman", "saul", "goodman",
    "rick", "grimes", "daryl", "dixon", "eleven", "hopper", "mike",
    "dustin", "lucas", "nancy", "jonathan", "vecna",

    # Simpsons
    "homero", "homer", "marge", "bart", "lisa", "maggie", "moe",
    "burns", "smithers", "flanders", "ned", "milhouse",

    # Futurama
    "fry", "bender", "leela", "zoidberg", "hermes", "professor",
    "farnsworth",

    # Toy Story
    "woody", "buzz", "lightyear", "bo", "peep", "jesse",

    # Pixar/Disney
    "nemo", "dory", "marlin", "sully", "mike wazowski", "boo",
    "mr incredible", "elastigirl", "dash", "violet",

    # Cars
    "rayo", "mcqueen", "mate", "sally", "doc", "hudson",

    # Shrek
    "shrek", "fiona", "burro", "asno", "lord farquaad", "jengi",

    # Other animated characters
    "goku", "vegeta", "gohan", "piccolo", "bulma", "trunks",
    "naruto", "sasuke", "sakura", "kakashi",

    # General anime
    "light", "yagami", "l", "lawliet", "eren", "yeager", "mikasa",
    "armin", "levi",

    # General pop culture
    "indiana", "jones", "terminator", "neo", "trinity", "morpheus",
    "john", "wick", "yennefer", "geralt", "ciri", "bateman"
}


In [None]:
# Process the DataFrame in chunks (avoids MemoryError)

def procesar_en_chunks(df, tamaño_chunk_instancias=300, nombres_propios=None):
    """
    Run `crear_features` over `df` in batches of instances and concatenate
    the results.

    The distinct `instancia_id` values are split, in order of appearance,
    into consecutive groups of `tamaño_chunk_instancias`; the rows of each
    group are copied, passed to `crear_features` together with
    `nombres_propios`, and the per-batch outputs are stacked with a fresh
    index.
    """
    ids_unicos = df["instancia_id"].unique()
    inicios = range(0, len(ids_unicos), tamaño_chunk_instancias)
    chunks = [ids_unicos[ini:ini + tamaño_chunk_instancias] for ini in inicios]

    dfs_resultados = []
    for numero, subset in enumerate(chunks, start=1):
        print(f"Procesando chunk {numero}/{len(chunks)} ({len(subset)} instancias)...")

        # Rows of the instances that belong to this batch
        pertenece = df["instancia_id"].isin(subset)
        dfs_resultados.append(
            crear_features(df[pertenece].copy(), nombres_propios=nombres_propios)
        )

        # Reclaim the batch temporaries before the next iteration
        gc.collect()

    return pd.concat(dfs_resultados, ignore_index=True)

# Entrenamiento de los Random Forests

In [None]:
# Load the training dataset and keep an untouched copy as a fallback
df_original = pd.read_parquet("datos_entrenamiento_RF_60000.parquet")
df = df_original.copy(deep=True)

In [None]:
# Rebuild words: adds the `palabra_id` column to `df`
df = reconstruir_palabras(df)

In [None]:
# Compute the word-level features in chunks of instances to avoid MemoryError
df = procesar_en_chunks(df, tamaño_chunk_instancias=300, nombres_propios= nombres_y_apellidos)

In [None]:
# Compute the token-level features on the whole frame
df = crear_features_token(df)
df.reset_index(drop=True, inplace=True)
# Keep the row label as a column so that subsets of `df` can later write
# their predictions back into `df` with `df.loc[subset["indice"], ...]`.
df["indice"] = df.index

In [None]:
# Train / eval split done by instance id, so every token of an instance
# ends up on the same side of the split

# list of distinct instances
ids = df["instancia_id"].unique()

train_ids, eval_ids = train_test_split(ids, test_size=0.2, random_state=42)

df_train = df[df["instancia_id"].isin(train_ids)].copy()
df_eval  = df[df["instancia_id"].isin(eval_ids)].copy()

In [None]:
# Feature columns fed to the random forests.
# NOTE: a later cell appends "pred_cap" to this same list in place, so
# re-running earlier cells after that point uses the extended list.
FEATURES_TOKEN = [
    "posicion_palabra",
    "dist_al_final",
    "longitud_palabra",
    "frecuencia_en_instancia",
    "es_nombre",
    "similitud_con_anterior",
    "similitud_con_siguiente",
    "is_subtoken",
    "token_idx_en_palabra",
    "token_pos_relativa",
    "longitud_token",
    "es_primer_subtoken",
    "es_ultimo_subtoken",
    "palabra_anterior_es_nombre",
    "palabra_siguiente_es_nombre"
]

In [None]:
# Feature matrices for the capitalization model (missing values replaced by 0)
X_train_cap = df_train[FEATURES_TOKEN].fillna(0)
X_eval_cap  = df_eval[FEATURES_TOKEN].fillna(0)

# Targets: the `capitalizacion` label cast to int
y_train_cap = df_train["capitalizacion"].astype(int)
y_eval_cap = df_eval["capitalizacion"].astype(int)

In [None]:
# Train the capitalization random forest
rf_cap = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=None)

# Folds are built by instance (groups=instancia_id): every token of an instance
# is predicted by a model that never saw any other token of that instance.
# Splitting by token row would put pieces of the same sentence on both sides
# of a fold and make the out-of-fold predictions optimistic.
cv = StratifiedGroupKFold(n_splits=5, shuffle=True, random_state=42)

oof_preds = cross_val_predict(
    rf_cap,
    X_train_cap,
    y_train_cap,
    groups=df_train["instancia_id"],
    cv=cv,
    method="predict",
)
# Out-of-fold predictions for the training rows. They are used as a feature by
# the punctuation forests; in-sample predictions would hand those models
# information the first model could not produce on unseen data (leakage).
df_train["pred_cap"] = oof_preds

# Final model, fitted on the whole training split
rf_cap.fit(X_train_cap, y_train_cap)

df_eval["pred_cap"] = rf_cap.predict(X_eval_cap)

# Write both sets of predictions back into the full frame by row label
df.loc[df_train["indice"], "pred_cap"] = df_train["pred_cap"].values
df.loc[df_eval["indice"], "pred_cap"] = df_eval["pred_cap"].values

print("Modelo RF entrenado!")

In [None]:
# Evaluation of the capitalization model on the eval split
print("Capitalizar:")
print(classification_report(y_eval_cap, rf_cap.predict(X_eval_cap)))

In [None]:
# Save the trained capitalization model to disk

joblib.dump(rf_cap, "rf_cap_model.pkl")

In [None]:
# Use the predictions of the capitalization RF as a feature for the punctuation RFs

# append pred_cap to FEATURES_TOKEN if it is not there yet (mutates the list in place)
if "pred_cap" not in FEATURES_TOKEN:
    FEATURES_TOKEN.append("pred_cap")

In [None]:
# Split into train and eval again, with the same test_size and random_state
# as the earlier split, so that the new frames include the `pred_cap` column
ids = df["instancia_id"].unique()

train_ids, eval_ids = train_test_split(ids, test_size=0.2, random_state=42)

df_train = df[df["instancia_id"].isin(train_ids)].copy()
df_eval  = df[df["instancia_id"].isin(eval_ids)].copy()

In [None]:
# Random forest for initial punctuation
X_train_pini = df_train[FEATURES_TOKEN].fillna(0)
y_train_pini = df_train["punt_inicial"].astype(int)


X_eval_pini  = df_eval[FEATURES_TOKEN].fillna(0)
y_eval_pini = df_eval["punt_inicial"].astype(int)

print("Entrenadno el arborl")
rf_pini = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=None)
rf_pini.fit(X_train_pini, y_train_pini)

# Predict on the whole frame with the same missing-value handling (fillna(0))
# that was applied to the matrices the model was trained and evaluated on.
df["pred_pini"] = rf_pini.predict(df[FEATURES_TOKEN].fillna(0))

# Evaluation on the eval split
print("Puntuacion Inicial:")
print(classification_report(y_eval_pini, rf_pini.predict(X_eval_pini)))

In [None]:
# Save the trained initial-punctuation model to disk
joblib.dump(rf_pini, "rf_pini_model.pkl")

In [None]:
# Random forest for final punctuation
X_train_pfin = df_train[FEATURES_TOKEN].fillna(0)
y_train_pfin = df_train["pfinal"].astype(int)

X_eval_pfin  = df_eval[FEATURES_TOKEN].fillna(0)
y_eval_pfin = df_eval["pfinal"].astype(int)


rf_pfin = RandomForestClassifier(n_estimators=200, random_state=42, max_depth=None)
rf_pfin.fit(X_train_pfin, y_train_pfin)

# Predict on the whole frame with the same missing-value handling (fillna(0))
# that was applied to the matrices the model was trained and evaluated on.
df["pred_pfin"] = rf_pfin.predict(df[FEATURES_TOKEN].fillna(0))

# Evaluation on the eval split
print("Puntuacion Final:")
print(classification_report(y_eval_pfin, rf_pfin.predict(X_eval_pfin)))

In [None]:
# Save the trained final-punctuation model to disk
joblib.dump(rf_pfin, "rf_pfin_model.pkl")

# Datos de Control

In [None]:
# Load the already-trained random forests from the files written by the
# training section. joblib.load unpickles the file, so only load files
# from a trusted source.
rf_cap = joblib.load("rf_cap_model.pkl")
rf_pini = joblib.load("rf_pini_model.pkl")
rf_pfin = joblib.load("rf_pfin_model.pkl")

In [None]:
# Load the control dataset and keep an untouched copy as a fallback
# (this rebinds `df_original` and `df`, replacing the training frames)
df_original = pd.read_parquet("datos_control_100000.parquet")
df = df_original.copy(deep=True)

In [None]:
# Rebuild words: adds the `palabra_id` column to `df`
df = reconstruir_palabras(df)

In [None]:
# Compute the word-level features in chunks of instances to avoid MemoryError
df = procesar_en_chunks(df, tamaño_chunk_instancias=300, nombres_propios= nombres_y_apellidos)

In [None]:
# Compute the token-level features and renumber the rows 0..n-1
df = crear_features_token(df)
df.reset_index(drop=True, inplace=True)

In [None]:
# Display the control frame with its computed features
df

In [None]:
# Predict capitalization on the control data

# The base feature list is defined again here, without "pred_cap",
# which is what the prediction below feeds to the capitalization model.
FEATURES_TOKEN = [
    "posicion_palabra",
    "dist_al_final",
    "longitud_palabra",
    "frecuencia_en_instancia",
    "es_nombre",
    "similitud_con_anterior",
    "similitud_con_siguiente",
    "is_subtoken",
    "token_idx_en_palabra",
    "token_pos_relativa",
    "longitud_token",
    "es_primer_subtoken",
    "es_ultimo_subtoken",
    "palabra_anterior_es_nombre",
    "palabra_siguiente_es_nombre"
]

df["pred_cap"] = rf_cap.predict(df[FEATURES_TOKEN].fillna(0))

In [None]:
# The punctuation models also take the capitalization prediction as a feature
if "pred_cap" not in FEATURES_TOKEN:
    FEATURES_TOKEN.append("pred_cap")

In [None]:
# Predict initial punctuation on the control data
df["pred_pini"] = rf_pini.predict(df[FEATURES_TOKEN].fillna(0))

In [None]:
# Predict final punctuation on the control data
df["pred_pfin"] = rf_pfin.predict(df[FEATURES_TOKEN].fillna(0))

In [None]:
# Evaluation: capitalization on the control data

print(classification_report(df["capitalizacion"], df["pred_cap"]))

In [None]:
# Evaluation: initial punctuation on the control data

print(classification_report(df["punt_inicial"], df["pred_pini"]))

In [None]:
# Evaluation: final punctuation on the control data

print(classification_report(df["pfinal"], df["pred_pfin"]))