In [24]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
import torch
import re
import os
from transformers import BertTokenizer, BertModel
from datasets import load_dataset
import pandas as pd
import joblib


##Carga del dataset
Dataset: https://huggingface.co/datasets/Emilianohack6950/Wikipedia-es

In [2]:
#Cargamos el dataset
ds_w = load_dataset("Emilianohack6950/Wikipedia-es")
ds_b = load_dataset("csv", data_files="content/la_dama_boba_sentences.csv")
ds_h = load_dataset("csv", data_files="content/hamlet_sentences.csv")
df_final = pd.read_csv("content/dataset_final.csv")

In [3]:
train_contenido = [item["contenido"] for item in ds_w["train"]]
test_contenido = [item["contenido"] for item in ds_w["test"]]
hamlet = [item['sentence'] for item in ds_h['train']]
dama = [item['sentence'] for item in ds_b['train']]

todo_contenido = train_contenido + test_contenido + hamlet + dama

# Unir todo en una sola cadena
texto_total = " ".join(todo_contenido)

# Contar signos de puntuación
cant_comas = texto_total.count(',')
cant_puntos = texto_total.count('.')
cant_preg_abre = texto_total.count('¿')
cant_preg_cierra = texto_total.count('?')

# Contar letras mayúsculas (A-Z, incluyendo acentos)
cant_mayus = len(re.findall(r'[A-ZÁÉÍÓÚÜÑ]', texto_total))

print("Estadísticas del dataset:")
print(f"Comas (,): {cant_comas:,}")
print(f"Puntos (.): {cant_puntos:,}")
print(f"Signos de pregunta abiertos (¿): {cant_preg_abre:,}")
print(f"Signos de pregunta cerrados (?): {cant_preg_cierra:,}")
print(f"Letras mayúsculas: {cant_mayus:,}")

Estadísticas del dataset:
Comas (,): 26,823
Puntos (.): 20,591
Signos de pregunta abiertos (¿): 887
Signos de pregunta cerrados (?): 919
Letras mayúsculas: 81,631


## Funciones de preprocesamiento

In [None]:
model_name = "bert-base-multilingual-cased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
model.eval()

# Funcion provista por la catedra
def get_multilingual_token_embedding(token: str):
  """     Devuelve el embedding (estático) para el token.     """
  if not isinstance(token, str):
        #print(f" ⚠️ Token recibido no es string: {token} — convertido a string.")
        token = str(token)

    # Obtener ID desde tokenizer
  token_id = tokenizer.convert_tokens_to_ids(token)

  # Caso OOV: token no existe → usamos UNK pero mantenemos el print original
  if token_id is None or token_id == tokenizer.unk_token_id:
      #print(f" ❌ El token '{token}' no pertenece al vocabulario de multilingual BERT. Usando [UNK].")
      token_id = tokenizer.unk_token_id
  else:
      pass
      #print(f" ✅ Token: '{token}' | ID: {token_id}")

  # Seguridad extra: forzar ID dentro del rango del vocab
  vocab_size = model.embeddings.word_embeddings.weight.shape[0]
  if not (0 <= token_id < vocab_size):
      #print(f" ⚠️ token_id fuera de rango ({token_id}) → usando [UNK]")
      token_id = tokenizer.unk_token_id

  # Devuelve SIEMPRE un tensor válido
  embedding_vector = model.embeddings.word_embeddings.weight[token_id]

  #print(f"Embedding shape: {embedding_vector.shape}")
  return embedding_vector


def tipo_capitalizacion(palabra):
    conteo_mayusculas = 0
    for caracter in palabra:
        if caracter.isupper():
            conteo_mayusculas += 1

    if palabra.isupper():
        return 3
    else:
        if conteo_mayusculas >= 2:
            return 2
        else:
            if conteo_mayusculas == 1:
                if palabra[0].isupper():
                    return 1
                else:
                    return 2
            else:
                return 0

#Capaz conviene tokenizar todo de una en vez de palabra por palabra

def tokenizar_y_etiquetar(texto_original, instancia_id = 1):
  SIGNOS_ESPANOL = "¿¡?.,!\"'’“”()[]:;—-"

  #Separamos por palabra y signo
  partes = re.findall(r"\w+['’]?\w*|[¿?.,]", texto_original)
  resultado = []

  #Para cada Palabra y Signo, aplanamos y tokenizamos
  #Extraemos Capitalización y puntuación de la palabra y signos adyacentes, y se la asignamos a los tokens
  for i in range(len(partes)):
    parte = partes[i]

    if(parte not in  SIGNOS_ESPANOL):
      #tokens = tokenizer.tokenize(parte.lower().strip("¿?.,¡!"))
      tokens = tokenizer.tokenize(parte.lower()) # Sacamos mayúsculas y la tokenizamos
      #Obtenemos la capitalización
      cap = tipo_capitalizacion(parte)

      #Obtenemos la puntuación por token
      punt_ini = '¿' if partes[i-1]=='¿' else ''
      punt_fin = ''
      if(i<len(partes)-1):
        if partes[i+1] in '.?,':
          punt_fin = partes[i+1]

      for j in range(len(tokens)):
        tok = tokens[j]
        punt_ini_token = ''
        punt_fin_token = ''
        if(j==0):#Si es el primer token se le agrega puntuación inicial ¿
            punt_ini_token = punt_ini
        if (j==len(tokens)-1): #Si el último token se le agrega puntuación final
            punt_fin_token = punt_fin

        token_id = tokenizer.convert_tokens_to_ids(tok)
        resultado.append({
                    "instancia_id": instancia_id,
                    "token_id": token_id,
                    "token": tok,
                    "punt_inicial": punt_ini_token,
                    "punt_final": punt_fin_token,
                    "capitalizacion": cap
                })

  return resultado

## Testing para tokenizar y etiquetar

In [5]:
import unittest

class TestFunciones(unittest.TestCase):

    def simplificar_salida(self, salida):
        return [
            [item["token"], item["punt_inicial"], item["punt_final"], item["capitalizacion"]]
            for item in salida
        ]

    def test01_reconoce_Capitalizacion_Token_inicial_mas_de_una_palabra(self):
        texto_1 = "Los árboles altos"
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['los','','',1], ['árboles','','',0], ['altos','','',0]]
        )

    def test02_reconoce_Punto_final_mas_de_una_palabra(self):
        texto_1 = "Los árboles altos."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['los','','',1], ['árboles','','',0], ['altos','','.',0]]
        )

    def test03_reconoce_Capitalizacion_Todos_los_tokens(self):
        texto_1 = "Hermosos son los árboles altos."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['her','','',1], ['##mos','','',1], ['##os','','',1],
             ['son','','',0], ['los','','',0], ['árboles','','',0], ['altos','','.',0]]
        )

    def test04_reconoce_Capitalizacion_todos_los_tokens_May_Min(self):
        texto_1 = "Hermosos son los árboles hermosos."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['her','','',1], ['##mos','','',1], ['##os','','',1],
             ['son','','',0], ['los','','',0], ['árboles','','',0],
             ['her','','',0], ['##mos','','',0], ['##os','','.',0]]
        )

    def test05_reconoce_capitalizacion_Signo_pregunta_punto(self):
        texto_1 = "¿Cuándo vamos a McDonald's? Ellos no vienen hoy. ¿Dónde están ahora?"
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['cu','¿','',1], ['##ánd','','',1], ['##o','','',1], ['va','','',0],
             ['##mos','','',0], ['a','','',0], ['m','','',2], ['##c','','',2],
             ['##dona','','',2], ['##ld','','',2], ["'",'','',2], ['s','','?',2],
             ['ellos','','',1], ['no','','',0], ['viene','','',0], ['##n','','',0],
             ['hoy','','.',0], ['dó','¿','',1], ['##nde','','',1],
             ['están','','',0], ['ahora','','?',0]]
        )

    def test06_reconoce_comas(self):
        texto_1 = "Si, es correcto."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto_1))
        self.assertEqual(
            salida,
            [['si','',',',1], ['es','','',0], ['correct','','',0], ['##o','','.',0]]
        )

    def test07_reconoce_varias_comas(self):
        texto = "Sí, claro, entiendo."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto))
        self.assertEqual(
            salida,
            [['sí','',',',1], ['claro','',',',0], ['ent','','',0], ['##iendo','','.',0]]
        )

    def test08_reconoce_todas_mayusculas(self):
        texto = "Me gustaría ir a la NASA."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto))
        self.assertEqual(
            salida,
            [['me','','',1], ['gu','','',0], ['##star','','',0], ['##ía','','',0],
             ['ir','','',0], ['a','','',0], ['la','','',0], ['nasa','','.',3]]
        )

    def test09_reconoce_algunas_mayusculas(self):
        texto = "El iPhone es caro."
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto))
        self.assertEqual(
            salida,
            [['el','','',1], ['i','','',2], ['##phone','','',2], ['es','','',0],
             ['car','','',0], ['##o','','.',0]]
        )

    def test10_otros_signos_no_afectan(self):
        texto = "¡Hola Mundo!(a)[A];a:A-a"
        salida = self.simplificar_salida(tokenizar_y_etiquetar(texto))
        self.assertEqual(
            salida,
            [['hol','','',1], ['##a','','',1], ['mundo','','',1],
             ['a','','',0], ['a','','',3], ['a','','',0],
             ['a','','',3], ['a','','',0]]
        )


if __name__ == "__main__":
    unittest.main(argv=[''], verbosity=2, exit=False)

test01_reconoce_Capitalizacion_Token_inicial_mas_de_una_palabra (__main__.TestFunciones.test01_reconoce_Capitalizacion_Token_inicial_mas_de_una_palabra) ... ok
test02_reconoce_Punto_final_mas_de_una_palabra (__main__.TestFunciones.test02_reconoce_Punto_final_mas_de_una_palabra) ... ok
test03_reconoce_Capitalizacion_Todos_los_tokens (__main__.TestFunciones.test03_reconoce_Capitalizacion_Todos_los_tokens) ... ok
test04_reconoce_Capitalizacion_todos_los_tokens_May_Min (__main__.TestFunciones.test04_reconoce_Capitalizacion_todos_los_tokens_May_Min) ... ok
test05_reconoce_capitalizacion_Signo_pregunta_punto (__main__.TestFunciones.test05_reconoce_capitalizacion_Signo_pregunta_punto) ... ok
test06_reconoce_comas (__main__.TestFunciones.test06_reconoce_comas) ... ok
test07_reconoce_varias_comas (__main__.TestFunciones.test07_reconoce_varias_comas) ... ok
test08_reconoce_todas_mayusculas (__main__.TestFunciones.test08_reconoce_todas_mayusculas) ... ok
test09_reconoce_algunas_mayusculas (__main

In [6]:
texto_1 = "¿Cuándo vamos a McDonald's? Ellos no vienen hoy. ¿Dónde están ahora?"
texto_2 = "cuándo vamos a mcdonald's ellos no vienen hoy dónde están ahora"
ej1_tokens = tokenizer.tokenize(texto_1)
ej2_tokens = tokenizer.tokenize(texto_2)
print(f"Tokens original: {ej1_tokens}")
print(f"Tokens plano: {ej2_tokens}")



Tokens original: ['¿', 'Cu', '##ánd', '##o', 'va', '##mos', 'a', 'McDonald', "'", 's', '?', 'El', '##los', 'no', 'viene', '##n', 'hoy', '.', '¿', 'D', '##ón', '##de', 'están', 'ahora', '?']
Tokens plano: ['cu', '##ánd', '##o', 'va', '##mos', 'a', 'm', '##c', '##dona', '##ld', "'", 's', 'ellos', 'no', 'viene', '##n', 'hoy', 'dó', '##nde', 'están', 'ahora']


## Conversion en embeddings y tensores.

In [7]:
MAP_PUNT_INI = {"": 0, "¿": 1}
MAP_PUNT_FIN = {"": 0, ",": 1, ".": 2, "?": 3}

# Faltaría usar dataset y data loader de pytorch

import torch
import numpy as np
import re

# Regex robusta para español (palabras con acentos + puntuación)
TOKEN_REGEX = r"[A-Za-zÁÉÍÓÚÜÑáéíóúüñ]+|[.,;:!?¿¡…]+|\S"


def procesar_texto_a_tensores(texto, instancia_id=1):
    etiquetas = tokenizar_y_etiquetar(texto, instancia_id)

    # -----------------------------------------
    # 1) Si el tokenizador falla, re-tokenizamos acá
    # -----------------------------------------
    if etiquetas is None or len(etiquetas) == 0:
        tokens = re.findall(TOKEN_REGEX, texto)

        if len(tokens) == 0:
            return {
                "instancia_id": instancia_id,
                "tokens": [],
                "X_emb": torch.empty(0),
                "y_cap": torch.empty(0, dtype=torch.long),
                "y_punt_ini": torch.empty(0, dtype=torch.long),
                "y_punt_fin": torch.empty(0, dtype=torch.long),
                "etiquetas": []
            }

        # construimos etiquetas vacías pero compatibles
        etiquetas = []
        for tok in tokens:
            etiquetas.append({
                "token": tok,
                "capitalizacion": int(tok[0].isupper()),
                "punt_inicial": "",
                "punt_final": "" if tok not in [".", ",", "?", "¿"] else tok
            })

    # -----------------------------------------
    # 2) Extraer los tokens
    # -----------------------------------------
    tokens = [fila["token"] for fila in etiquetas]

    if len(tokens) == 0:
        return {
            "instancia_id": instancia_id,
            "tokens": [],
            "X_emb": torch.empty(0),
            "y_cap": torch.empty(0, dtype=torch.long),
            "y_punt_ini": torch.empty(0, dtype=torch.long),
            "y_punt_fin": torch.empty(0, dtype=torch.long),
            "etiquetas": []
        }

    # -----------------------------------------
    # 3) Generar embeddings de tokens
    # -----------------------------------------
    lista_embeddings = []
    for tok in tokens:
        emb = get_multilingual_token_embedding(tok)

        if emb is None or emb.numel() == 0:
            emb = torch.zeros(768)

        lista_embeddings.append(emb)

    # Torch stack seguro
    embeddings_tensor = torch.stack(lista_embeddings)

    # -----------------------------------------
    # 4) Convertir etiquetas a tensores
    # -----------------------------------------

    MAP_PUNT_INI = {"": 0, "¿": 1}
    MAP_PUNT_FIN = {"": 0, ",": 1, ".": 2, "?": 3}

    y_cap = torch.tensor([fila["capitalizacion"] for fila in etiquetas], dtype=torch.long)
    y_punt_ini = torch.tensor([MAP_PUNT_INI.get(fila["punt_inicial"], 0) for fila in etiquetas], dtype=torch.long)
    y_punt_fin = torch.tensor([MAP_PUNT_FIN.get(fila["punt_final"], 0) for fila in etiquetas], dtype=torch.long)

    return {
        "instancia_id": instancia_id,
        "tokens": tokens,
        "X_emb": embeddings_tensor,
        "y_cap": y_cap,
        "y_punt_ini": y_punt_ini,
        "y_punt_fin": y_punt_fin,
        "etiquetas": etiquetas
    }


# Random Forest

Los features que me parecen útiles para este problema son: la posición de la palabra en la oración, la distancia entre el embedding previo y el posterior.

###Extraer posición

In [8]:
def agregar_position(lista_tensores):
    """
    Modifico los tokens, agrego un campo 'position' con respecto a toda la instancia.
    """
    for idx, fila in enumerate(lista_tensores["etiquetas"]):
        fila["position"] = idx
    return lista_tensores

###Obtener distancia previo

In [9]:
def agregar_dist_prev(lista_tokens):
    """
    Modifico los tokens, añado la distancia con el token previo.
    """
    emb = lista_tokens["X_emb"]
    num_tokens = emb.size(0)

    dist_prev = [0.0]  # El primer token NO tiene previo

    for i in range(1, num_tokens):
        dist = torch.norm(emb[i] - emb[i-1]).item()
        dist_prev.append(dist)

    # Guardamos las distancias en etiquetas:
    for i, fila in enumerate(lista_tokens["etiquetas"]):
        fila["dist_prev"] = dist_prev[i]

    return lista_tokens

### Obtener distancia siguiente

In [10]:
def agregar_dist_next(lista_tokens):
    emb = lista_tokens["X_emb"]
    num_tokens = emb.size(0)

    dist_next = []

    for i in range(num_tokens-1):
        dist = torch.norm(emb[i] - emb[i+1]).item()
        dist_next.append(dist)

    dist_next.append(0.0)  #El último NO tiene siguiente

    for i, fila in enumerate(lista_tokens["etiquetas"]):
        fila["dist_next"] = dist_next[i]

    return lista_tokens

### Unir features

In [11]:
def construir_features(lista_tokens):
    """
    devuelve dict por procesar_texto_a_tensores + funciones de features
    """
    lista_features = []

    for fila in lista_tokens["etiquetas"]:
        f = [
            fila["position"],
            fila["dist_prev"],
            fila["dist_next"]
        ]
        lista_features.append(f)

    X = torch.tensor(lista_features, dtype=torch.float32)
    return X

In [12]:
def generar_features_posicionales(lista_tokens):
  agregar_position(lista_tokens)
  agregar_dist_prev(lista_tokens)
  agregar_dist_next(lista_tokens)
  return construir_features(lista_tokens)

### Construir DataFrame


In [13]:
#PARA "DATASET FINAL"
todo_contenido = df_final["sentence"].tolist()
sum(len(sentence) for sentence in todo_contenido)

19915147

In [14]:
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score
import pandas as pd
from imblearn.under_sampling import RandomUnderSampler

In [15]:
def construir_dataset_RF(lista_textos):
    filas = []
    instancia_global = 0

    for texto in lista_textos:
        proc = procesar_texto_a_tensores(texto, instancia_id=instancia_global)

        if proc["tokens"] == []:
            print("⚠️ Texto sin tokens, saltando:", repr(texto))
            instancia_global += 1
            continue

        instancia_global += 1

        tokens = proc["tokens"]
        X_emb = proc["X_emb"]
        y_cap = proc["y_cap"]
        y_punt_ini = proc["y_punt_ini"]
        y_punt_fin = proc["y_punt_fin"]

        num_tokens = len(tokens)

        # Convertir de tensor → numpy
        X_np = X_emb.detach().cpu().numpy()

        for i in range(num_tokens):
            fila = {}

            fila["instancia_id"] = proc["instancia_id"]
            fila["token"] = tokens[i]
            fila["position"] = i
            fila["token_len"] = len(tokens[i])

            # Norma del embedding
            emb_i = X_np[i]
            fila["emb_norm"] = float(np.linalg.norm(emb_i))

            # Distancia al anterior
            if i > 0:
                fila["dist_prev"] = float(np.linalg.norm(emb_i - X_np[i-1]))
            else:
                fila["dist_prev"] = np.nan

            # Distancia al siguiente
            if i < num_tokens - 1:
                fila["dist_next"] = float(np.linalg.norm(emb_i - X_np[i+1]))
            else:
                fila["dist_next"] = np.nan

            # Etiquetas
            fila["y_cap"] = int(y_cap[i].item())
            fila["y_punt_ini"] = int(y_punt_ini[i].item())
            fila["y_punt_fin"] = int(y_punt_fin[i].item())

            filas.append(fila)

    return pd.DataFrame(filas)

In [None]:
df_rf_train = construir_dataset_RF(todo_contenido)

##Capitalización

In [17]:
df = df_rf_train.copy()

In [18]:
df["is_subword"] = df["token"].apply(lambda x: 1 if x.startswith("##") else 0)
df["token_clean"] = df["token"].apply(lambda x: x.lstrip("#"))

df["token_first_char"] = df["token_clean"].apply(
    lambda x: ord(x[0].lower()) if len(x) > 0 else 0
)

# Rellenar NaNs de dist_prev/dist_next
#df["dist_prev"] = df["dist_prev"].fillna(0)
#df["dist_next"] = df["dist_next"].fillna(0)

atributos = [
    "position",
    "token_len",
    "emb_norm",
    "dist_prev",
    "dist_next",
    "is_subword",
    "token_first_char",
]

X = df[atributos]
y = df["y_cap"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [19]:
classes, counts = np.unique(y_train, return_counts=True)

In [20]:
rus = RandomUnderSampler(
    sampling_strategy={  # cantidad deseada por clase
        0: int(counts[0] * 0.8),  # clase mayoritaria
        1: counts[1],             # otras clases sin tocar
        2: counts[2],
        3: counts[3]
    },
    random_state=42
)

X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)


In [22]:
rf_cap = RandomForestClassifier(
    n_estimators=100, #Probe con 200, tarda 5 minutos y no mejora mucho.
    max_depth=None,
    class_weight="balanced",
    random_state=0
)

rf_cap.fit(X_train_bal, y_train_bal)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [28]:
joblib.dump(rf_cap, "cap_rf.pkl")

['cap_rf.pkl']

In [25]:
y_pred = rf_cap.predict(X_test)
y_proba = rf_cap.predict_proba(X_test)


df_test_csv = X_test.copy()
df_test_csv["y_cap"] = y_test
df_test_csv["y_pred"] = y_pred
df_test_csv[["y_proba_0", "y_proba_1", "y_proba_2", "y_proba_3"]] = y_proba


df_test_csv.to_csv('test_cap_rf.csv', index=False)
#df_train.to_csv('train_cap_rf.csv', index=False)


In [26]:
print("F1:", f1_score(y_test, y_pred, labels=[0,1,2,3] ,average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred))

F1: 0.8135947954875102
Accuracy: 0.9271292974647755


##Puntuación Inicial

In [27]:
df = df_rf_train.copy()

In [29]:
df["is_subword"] = df["token"].apply(lambda x: 1 if x.startswith("##") else 0)
df["token_clean"] = df["token"].apply(lambda x: x.lstrip("#"))

df["token_first_char"] = df["token_clean"].apply(
    lambda x: ord(x[0].lower()) if len(x) > 0 else 0
)

# Rellenar NaNs de dist_prev/dist_next
df["dist_prev"] = df["dist_prev"].fillna(0)
df["dist_next"] = df["dist_next"].fillna(0)

atributos = [
    "position",
    "token_len",
    "emb_norm",
    "dist_prev",
    "dist_next",
    "is_subword",
    "token_first_char"
]

X = df[atributos]
y = df["y_punt_ini"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [30]:
classes, counts = np.unique(y_train, return_counts=True)

In [31]:
rus = RandomUnderSampler(
    sampling_strategy={  # cantidad deseada por clase
        0: int(counts[0] * 0.8),  # clase mayoritaria
        1: counts[1],             # otras clases sin tocar
    },
    random_state=42
)

X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)


In [32]:
rf_punt_ini = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight="balanced",
    random_state=0
)

rf_punt_ini.fit(X_train_bal, y_train_bal)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [33]:
joblib.dump(rf_punt_ini, "pi_rf.pkl")

['pi_rf.pkl']

In [34]:
y_pred = rf_punt_ini.predict(X_test)
y_proba = rf_punt_ini.predict_proba(X_test)


df_test_csv = X_test.copy()
df_test_csv["y_punt_ini"] = y_test
df_test_csv["y_pred"] = y_pred
df_test_csv[["y_proba_0", "y_proba_1"]] = y_proba


df_test_csv.to_csv('test_ini_rf.csv', index=False)
#df_train.to_csv('train_ini_rf.csv', index=False)

In [35]:
print("F1:", f1_score(y_test, y_pred, labels=[0, 1] ,average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred))

F1: 0.918249967057159
Accuracy: 0.9982585859358049


##Puntuación Final

In [36]:
df = df_rf_train.copy()

In [37]:
df["is_subword"] = df["token"].apply(lambda x: 1 if x.startswith("##") else 0)
df["token_clean"] = df["token"].apply(lambda x: x.lstrip("#"))

df["token_first_char"] = df["token_clean"].apply(
    lambda x: ord(x[0].lower()) if len(x) > 0 else 0
)

# Rellenar NaNs de dist_prev/dist_next
df["dist_prev"] = df["dist_prev"].fillna(0)
df["dist_next"] = df["dist_next"].fillna(0)

atributos = [
    "position",
    "token_len",
    "emb_norm",
    "dist_prev",
    "dist_next",
    "is_subword",
    "token_first_char"
]

X = df[atributos]
y = df["y_punt_fin"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [38]:
classes, counts = np.unique(y_train, return_counts=True)

In [39]:
rus = RandomUnderSampler(
    sampling_strategy={  # cantidad deseada por clase
        0: int(counts[0] * 0.8),  # clase mayoritaria
        1: counts[1],             # otras clases sin tocar
        2: counts[2],
        3: counts[3]
    },
    random_state=42
)

X_train_bal, y_train_bal = rus.fit_resample(X_train, y_train)

In [40]:
rf_punt_fin = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    class_weight="balanced",
    random_state=0
)

rf_punt_fin.fit(X_train_bal, y_train_bal)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [41]:
joblib.dump(rf_punt_fin, "pf_rf.pkl")

['pf_rf.pkl']

In [42]:
y_pred = rf_punt_fin.predict(X_test)
y_proba = rf_punt_fin.predict_proba(X_test)


df_test_csv = X_test.copy()
df_test_csv["y_punt_fin"] = y_test
df_test_csv["y_pred"] = y_pred
df_test_csv[["y_proba_0", "y_proba_1", "y_proba_2", "y_proba_3"]] = y_proba


df_test_csv.to_csv('test_fin_rf.csv', index=False)
#df_train.to_csv('train_fin_rf.csv', index=False)

In [None]:
print("F1:", f1_score(y_test, y_pred, labels=[0, 1, 2, 3] ,average="macro"))
print("Accuracy:", accuracy_score(y_test, y_pred))

In [47]:
eval = construir_dataset_RF("content/poemas_del_alma_sentences_part_1_processed.csv")

eval["is_subword"] = eval["token"].apply(lambda x: 1 if x.startswith("##") else 0)
eval["token_clean"] = eval["token"].apply(lambda x: x.lstrip("#"))

eval["token_first_char"] = eval["token_clean"].apply(
    lambda x: ord(x[0].lower()) if len(x) > 0 else 0
)

# Rellenar NaNs de dist_prev/dist_next
eval["dist_prev"] = eval["dist_prev"].fillna(0)
eval["dist_next"] = eval["dist_next"].fillna(0)

atributos = [
    "position",
    "token_len",
    "emb_norm",
    "dist_prev",
    "dist_next",
    "is_subword",
    "token_first_char"
]

X = eval[atributos]
Y_cap = eval["y_cap"]
Y_pi = eval["y_punt_ini"]
Y_fi = eval["y_punt_fin"]

In [51]:
y_pred_cap = rf_cap.predict(X)
y_pred_pi = rf_punt_ini.predict(X)
y_pred_pf = rf_punt_fin.predict(X)


In [52]:
print("F1:", f1_score(Y_cap, y_pred_cap, labels=[0, 1, 2, 3] ,average="macro"))
print("Accuracy:", accuracy_score(Y_cap, y_pred_cap))

F1: 0.15384615384615385
Accuracy: 0.4444444444444444


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [53]:
print("F1:", f1_score(Y_pi, y_pred_pi, labels=[0, 1] ,average="macro"))
print("Accuracy:", accuracy_score(Y_pi, y_pred_pi))

F1: 0.5
Accuracy: 1.0


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [54]:
print("F1:", f1_score(Y_fi, y_pred_pf, labels=[0, 1, 2, 3] ,average="macro"))
print("Accuracy:", accuracy_score(Y_fi, y_pred_pf))

F1: 0.04489164086687306
Accuracy: 0.09259259259259259


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [56]:
y_proba = rf_punt_fin.predict_proba(X)


df_test_csv = X.copy()
df_test_csv["y_punt_fin"] = Y_fi
df_test_csv["y_pred"] = y_pred_pf
df_test_csv[["y_proba_0", "y_proba_1", "y_proba_2", "y_proba_3"]] = y_proba


df_test_csv.to_csv('test_fin_rf_p.csv', index=False)

In [76]:
y_proba = rf_punt_ini.predict_proba(X)


df_test_csv = X.copy()
df_test_csv["y_punt_ini"] = Y_pi
df_test_csv["y_pred"] = y_pred_pi
df_test_csv[["y_proba_0", "y_proba_1"]] = y_proba


df_test_csv.to_csv('test_ini_rf_p.csv', index=False)

In [74]:
y_proba = rf_punt_fin.predict_proba(X)


df_test_csv = X.copy()
df_test_csv["y_cap"] = Y_cap
df_test_csv["y_pred"] = y_pred_cap
df_test_csv[["y_proba_0", "y_proba_1", "y_proba_2", "y_proba_3"]] = y_proba


df_test_csv.to_csv('test_cap_rf_p.csv', index=False)