# Filtrado de datos

## Carga de datasets

In [46]:
import re
import nltk
import pandas as pd
from datasets import load_dataset

ds = load_dataset("Emilianohack6950/Wikipedia-es")

df_hamlet = pd.read_csv('../data/raw/hamlet_sentences.csv')
print(df_hamlet.head())

df_dama_boba = pd.read_csv('../data/raw/la_dama_boba_sentences.csv')
print(df_dama_boba.head())

                              instance_id  \
0  modified_WilliamShakespeareHamlet_p000   
1  modified_WilliamShakespeareHamlet_p001   
2  modified_WilliamShakespeareHamlet_p002   
3  modified_WilliamShakespeareHamlet_p003   
4  modified_WilliamShakespeareHamlet_p004   

                                            sentence  
0  Libro descargado en www.elejandria.com, tu sit...  
1                                    dominio público  
2                      ¡Esperamos que lo disfrutéis!  
3                                             Hamlet  
4                                                Por  
         instance_id                                           sentence
0  la_dama_boba_s000  La dama boba Lope de Vega  Biblioteca Virtual ...
1  la_dama_boba_s001  Accesible desde http://cervantesvirtual.com Añ...
2  la_dama_boba_s002                              PERSONAJES caballero.
3  la_dama_boba_s003                                            lacayo.
4  la_dama_boba_s004                   

In [47]:
# Convertir el split 'train' del dataset ds a un DataFrame
df_wikipedia_train = ds['train'].to_pandas()

# Convertir el split 'test' del dataset ds a un DataFrame
df_wikipedia_test = ds['test'].to_pandas()

# Unir ambos DataFrames
df_wikipedia_combined = pd.concat([df_wikipedia_train, df_wikipedia_test], ignore_index=True)


## Filtro de oraciones
Se filtran las oraciones que no tienen signos de puntuación además del punto final, ni mayúsculas además de la primera.

In [None]:
# Usamos nltk para que nos divida en oraciones de una forma simple sin tener que escribir el código a mano
nltk.download('punkt', force=True)
# Some NLTK versions store a different Spanish sentence tokenizer under 'punkt_tab'
# Download it proactively to avoid the "Resource punkt_tab not found" error
nltk.download('punkt_tab', quiet=True)


def sentences_from_dataframe(df: pd.DataFrame, column: str, language: str = 'spanish') -> list:
    """Une la columna y segmenta en oraciones con nltk (si ya tenés oraciones, no hace falta usarla)."""
    text = df[column].astype(str).str.cat(sep=' ')
    return nltk.tokenize.sent_tokenize(text, language=language)


def is_interesting_sentence(sent: str) -> bool:
    """Return True if the sentence contains punctuation besides a final-only '.'
    or contains uppercase characters beyond the first character.

    The filtering rule implemented matches your description:
    - Keep sentences that contain any punctuation other than a final '.' (e.g. commas, ?, ¡, ¿, ;, etc.)
    - OR keep sentences that have uppercase letters after the first character

    Sentences that have neither are filtered out.
    """
    s = sent.strip()
    if len(s) == 0:
        return False

    # Check uppercase beyond the first character
    if any(ch.isupper() for ch in s[1:]):
        return True

    # Check punctuation other than a final-only '.'
    # punctuation chars to consider (excluding the dot)
    punct_re = re.compile(r'¿,?')
    if punct_re.search(s):
        return True

    # If there is a dot anywhere except the final position, consider interesting
    if '.' in s[:-1]:
        return True

    # If none of the above, sentence is uninteresting (will be filtered out)
    return False


def filter_sentences(sentences: list) -> list:
    """Return a list of sentences that are "interesting" per is_interesting_sentence()."""
    return [s for s in sentences if is_interesting_sentence(s)]


def process_dataframe(df: pd.DataFrame, column: str, language: str = 'spanish', max_sentences: int = None, apply_filter: bool = False) -> list:
    """Conveniencia: extrae oraciones de un DataFrame y las procesa (usa sentences_from_dataframe).

    Args:
        df: DataFrame that contains the text column.
        column: name of the column with textual content.
        language: language for tokenization (default 'spanish').
        max_sentences: optional limit for number of sentences to return.
        apply_filter: if True, keeps only sentences considered "interesting".

    Returns:
        list of sentences (strings).
    """
    sents = sentences_from_dataframe(df, column, language=language)

    if apply_filter:
        sents = filter_sentences(sents)

    if max_sentences is not None:
        sents = sents[:max_sentences]

    return sents


[nltk_data] Downloading package punkt to /home/santiago/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [51]:
processed_wiki = process_dataframe(df_wikipedia_combined, "contenido")
len(processed_wiki), len(sentences_from_dataframe(df_wikipedia_combined, "contenido"))

(11774, 14817)

In [52]:
processed_hamlet = process_dataframe(df_hamlet, "sentence")
len(processed_hamlet), df_hamlet.shape[0]

(1421, 3650)

In [54]:
processed_dama = process_dataframe(df_dama_boba, "sentence")
len(processed_dama), df_dama_boba.shape[0]

(953, 1725)