In [64]:
import pandas as pd
import re
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from sklearn.model_selection import train_test_split

In [32]:
def load_raw_data(file_path: str) -> str:
    """
    Carga el texto crudo.
    
    Args:
    file_path (str): Ruta del archivo de texto.
    
    Returns:
    str: Texto crudo.
    """
    # Leer el texto del libro
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()

    return text

In [54]:
def extract_sentences(book: str) -> list[str]:
    """
    Extrae las oraciones del texto.
    
    Args:
    book (str): Texto crudo.
    
    Returns:
    list[str]: Oraciones.
    """
    # Separar el texto en bloques usando líneas completamente vacías como delimitadores
    lines = book.split('***')[2].split('\n\n')

    # Eliminar espacios en blanco al inicio y al final de cada línea
    lines = [line.strip() for line in lines]

    # Eliminar lineas vacias
    lines = [line for line in lines if line]

    # Eliminar lineas genericas
    lines = [line for line in lines if not (
        line.startswith('CHAPTER')) or 
        line.startswith('[Illustration]')
    ]

    # Eliminar saltos de lineas de las oraciones
    lines = [line.replace('\n', ' ') for line in lines]

    # Solo procesar lineas con mas de 150 caracteres
    lines = [line for line in lines if len(line) >= 150]

    # Separar adicionalmente por . si la oracion es muy larga
    sentences = []
    for sentence in lines:
        if len(sentence) > 250:
            sentences.extend(sentence.split('.'))
        else:
            sentences.append(sentence)

    # Solo procesar lineas con mas de 150 y menos de 250 caracteres
    sentences = [sentence for sentence in sentences if len(sentence) >= 150 and len(sentence) <= 250]

    # Eliminar espacios en blanco al inicio y al final de cada línea
    sentences = [sentence.strip() for sentence in sentences]

    return sentences

In [24]:
# Ruta a los libros originales
raw_books = {
    'austen_sense-and-sensibility': {
        'file_path': 'data/raw/austen_sense-and-sensibility.txt',
        'author': 'Jane Austen',
    },
    'austen_pride-and-prejudice': {
        'file_path': 'data/raw/austen_pride-and-prejudice.txt',
        'author': 'Jane Austen',
    },
    'austen_emma': {
        'file_path': 'data/raw/austen_emma.txt',
        'author': 'Jane Austen',
    },
    'tolstoy_youth': {
        'file_path': 'data/raw/tolstoy_youth.txt',
        'author': 'Leo Tolstoy',
    },
    'tolstoy_war-and-peace': {
        'file_path': 'data/raw/tolstoy_war-and-peace.txt',
        'author': 'Leo Tolstoy',
    },
    'tolstoy_anna-karenina': {
        'file_path': 'data/raw/tolstoy_anna-karenina.txt',
        'author': 'Leo Tolstoy',
    },
    'joyce_dubliners': {
        'file_path': 'data/raw/joyce_dubliners.txt',
        'author': 'James Joyce',
    },
    'joyce_a-portrait-of-the-artist-as-a-young-man': {
        'file_path': 'data/raw/joyce_a-portrait-of-the-artist-as-a-young-man.txt',
        'author': 'James Joyce',
    },
    'joyce_ulysses': {
        'file_path': 'data/raw/joyce_ulysses.txt',
        'author': 'James Joyce',
    }
}

In [55]:
df = pd.DataFrame(columns=['author', 'sentence'])

for book in raw_books.values():
    corpus = load_raw_data(book['file_path'])
    author = book['author']
    
    # Extraer las oraciones del texto
    sentences = extract_sentences(corpus)

    df = pd.concat([df, pd.DataFrame({'author': author, 'sentence': sentences})], ignore_index=True)

df.head()

Unnamed: 0,author,sentence
0,Jane Austen,"Their estate was large, and their residence wa..."
1,Jane Austen,The late owner of this estate was a single man...
2,Jane Austen,"But her death, which happened ten years before..."
3,Jane Austen,"Henry Dashwood to his wishes, which proceeded ..."
4,Jane Austen,"The son, a steady respectable young man, was a..."


In [56]:
df.to_csv('data/classifier/sentences.csv', index=False)

In [57]:
# Contar el número de datos de entrenamiento por cada autor
author_counts = df['author'].value_counts()

# Convertir los conteos en un DataFrame
summary_df = author_counts.reset_index()
summary_df.columns = ['author', 'num_training_data']

summary_df

Unnamed: 0,author,num_training_data
0,Leo Tolstoy,10551
1,Jane Austen,3934
2,James Joyce,2864


In [60]:
# Conjunto de entrenamiento y prueba
x_train, x_test, y_train, y_test = train_test_split(df['sentence'], df['author'],
                                                    train_size=0.7, random_state=42)

In [65]:
# Tokenizacion y preprocesamiento
def preprocess_text(text: str) -> list[str]:
    """
    Limpia y tokeniza el texto mediante:
    1. Eliminación de puntuación y caracteres especiales.
    2. Convierte el texto a minúsculas.
    3. Tokenización del texto en palabras.
    4. Eliminación de palabras vacías (stopwords).
    
    Args:
    text (str): Texto de entrada a preprocesar.
    
    Returns:
    list: Una lista de tokens (palabras) del texto limpiado.
    """
    # Eliminar cualquier carácter no alfabético, números, etc.
    text = re.sub(r'[^\w\s]', '', text)
    
    # Tokenizar y convertir el texto a minúsculas
    tokens = gensim.utils.simple_preprocess(text, deacc=True)
    
    # Eliminar palabras vacías (stopwords)
    tokens = [word for word in tokens if word not in STOPWORDS]
    
    return tokens

x_train = x_train.apply(preprocess_text)
x_train.head()

2746     [apologies, friend, good, charade, confined, s...
6631     [thirteenth, june, french, russian, emperors, ...
10127    [handsome, young, soldier, brought, wood, sett...
16398    [nonperishable, goods, bought, moses, herzog, ...
11648    [race, began, ring, yards, away, course, obsta...
Name: sentence, dtype: object