In [312]:
from glob import glob
from tqdm import tqdm
from pandas import DataFrame
import re

## Data reading

In [313]:
# Compilamos las expresiones regulares fuera de la función para mejorar la eficiencia
email_re = re.compile(r'\S*@\S*\s?|from: |re: |subject: |urllink')
punctuation_re = re.compile(r'[?!:]')
non_alphanumeric_re = re.compile(r'[^A-Za-z0-9. \n]')
numbers_re = re.compile(r'\b\d+\b')
multiple_newlines_re = re.compile(r'\n{2,}')
single_newline_re = re.compile(r'\n')
multiple_dots_re = re.compile(r'\.\.+')
multiple_spaces_re = re.compile(r'\s+')
multiple_char_repetition_re = re.compile(r'(.)\1{2,}')

def clean_text(txt:str)->str:
    """Limpia el texto eliminando correos, etiquetas HTML y caracteres no alfanuméricos."""
    txt = txt.lower()
    txt = email_re.sub('', txt)
    txt = multiple_dots_re.sub(' ', txt)
    txt = multiple_char_repetition_re.sub(r'\1', txt)
    txt = txt.replace('-',' ')
    txt = punctuation_re.sub('.', txt)
    txt = non_alphanumeric_re.sub('', txt)
    txt = numbers_re.sub('NUM', txt)
    txt = multiple_newlines_re.sub('.', txt)
    txt = single_newline_re.sub(' ', txt)
    txt = multiple_spaces_re.sub(' ', txt)
    return txt.strip()


def read_file(f:str)->str:
    """_summary_

    Args:
        f (str): _description_

    Returns:
        str: _description_
    """    
    try:
        with open(f, 'r', encoding='utf-8') as file:
            txt = file.read()
    except UnicodeDecodeError:
        with open(f, 'r', encoding='latin1') as file:
            txt = file.read()
    
    return txt

#### 20 News

In [314]:
files_20n = glob('./raw_data/20news-18828/*/*')
len(files_20n)

18828

In [315]:
df_news_rows = []
for f in tqdm(files_20n):
    txt = read_file(f)
    txt_cln = clean_text(txt)
    
    # Dividir el texto en oraciones y limpiar oraciones cortas
    sentences = [f'<s> {s.strip()} </s>' for s in re.split(r'\.\s*', txt_cln) if len(s.strip().split()) > 1]
    
    # Crear filas de DataFrame
    df_news_rows.extend([{
        'text': s,
        'source': f,
        'length': len(s.split())
    } for s in sentences])
        
        

100%|██████████| 18828/18828 [00:04<00:00, 4520.89it/s]


In [316]:
df_news = DataFrame(df_news_rows)
df_news

Unnamed: 0,text,source,length
0,<s> nick pettefar bmw battery </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
1,<s> keith hanlan on the wed NUM apr NUM NUM </s>,./raw_data/20news-18828/rec.motorcycles/104315,11
2,<s> NUM gmt wibbled </s>,./raw_data/20news-18828/rec.motorcycles/104315,5
3,<s> in article craig a </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
4,<s> vechorik writes </s>,./raw_data/20news-18828/rec.motorcycles/104315,4
...,...,...,...
405879,<s> janet reno and the fbi have the murder of ...,./raw_data/20news-18828/talk.politics.guns/54348,17
405880,<s> hope they can sleep at night </s>,./raw_data/20news-18828/talk.politics.guns/54348,8
405881,<s> vasilion kb2nmv suny buffalo std </s>,./raw_data/20news-18828/talk.politics.guns/54348,7
405882,<s> all you cult haters happy now </s>,./raw_data/20news-18828/talk.politics.guns/54348,8


In [317]:
df_news.to_parquet('./data/20news.parquet', index=False)

##### Train - Test Split (20N)

In [329]:
# sample 80% of the data
df_news_train = df_news.sample(frac=0.8, random_state=42)
# the rest of the data is for testing
df_news_test = df_news.drop(df_news_train.index)

df_news_train.to_parquet('./data/train_test/20news_train.parquet', index=False)
df_news_test.to_parquet('./data/train_test/20news_test.parquet', index=False)

#### BAC

In [318]:
files_bac = glob('./raw_data/blogs/*')
len(files_bac)

19320

In [323]:
def extract_and_process_text_from_xml(file_path:str)->list:
    """Extrae y limpia el texto contenido en etiquetas <post> de un archivo XML."""
    xml_content = read_file(file_path)
    
    # Encuentra todo el contenido entre las etiquetas <post> y </post>
    post_matches = re.findall(r'<post>(.*?)</post>', xml_content, re.DOTALL)
    
    df_rows = []
    for post in post_matches:
        cleaned_post = clean_text(post.strip())
        sentences = [f'<s> {s.strip()} </s>' for s in re.split(r'\.\s*', cleaned_post) if len(s.strip().split()) > 1]
        df_rows.extend([{
            'text': s,
            #'source': file_path,
            'length': len(s.split())
        } for s in sentences])

    return df_rows

In [324]:
df_bac_rows = []
for f in tqdm(files_bac):
    df_bac_rows.extend(extract_and_process_text_from_xml(f))
    


100%|██████████| 19320/19320 [01:38<00:00, 196.73it/s]


In [325]:
df_bac = DataFrame(df_bac_rows)
df_bac

Unnamed: 0,text,length
0,<s> only NUM days NUM hour NUM minutes and NUM...,18
1,<s> cant wait </s>,4
2,<s> and this time jeans gonna kick some ass </s>,10
3,<s> poor lucy </s>,4
4,<s> she always had a huge smile on her face bu...,23
...,...,...
9509671,<s> i can come off sweet and nice but i can be...,19
9509672,<s> soulfish stew is primarily for me but if o...,18
9509673,<s> i hope to make an interesting and readable...,11
9509674,<s> the college era drinking may have stopped ...,15


In [326]:
df_bac.to_parquet('./data/bac.parquet', index=False)

##### Train - Test Split (BAC)

In [332]:
# sample 80% of the data
df_bac_train = df_bac.sample(frac=0.8, random_state=42)
# the rest of the data is for testing
df_bac_test = df_bac.drop(df_bac_train.index)

df_bac_train.to_parquet('./data/train_test/bac_train.parquet', index=False)
df_bac_test.to_parquet('./data/train_test/bac_test.parquet', index=False)