In [163]:
from glob import glob
from tqdm import tqdm
from pandas import DataFrame
import re

## 20 News

In [164]:
news_corpus = glob('./raw_data/20news-18828/*/*')
len(news_corpus)

18828

In [169]:
def clean_text(txt):
    txt = txt.lower()
    # Remove emails, HTML tags, and specific keywords in one pass
    txt = re.sub(r'\S*@\S*\s?|from: |re: |subject: ', '', txt)
    
    # Replace ?,!, and : with .
    txt = re.sub(r'[?!:]', '.', txt)
    
    # Replace non-alphanumeric characters, except ., space, and newline
    txt = re.sub(r'[^A-Za-z0-9. \n]', '', txt)
    
    # Replace numbers with 'NUM' if they are standalone
    txt = re.sub(r'\b\d+\b', 'NUM', txt)
    
    # Replace multiple newlines with '.', and single newline with space
    txt = re.sub(r'\n{2,}', '.', txt)
    txt = re.sub(r'\n', ' ', txt)
    
    # Remove suspensive dots '...' and replace multiple dots with a single dot
    txt = re.sub(r'\.\.+', '.', txt)
    
    # Replace multiple spaces with a single space
    txt = re.sub(r'\s+', ' ', txt)
    
    return txt.strip()


In [170]:
def read_file(f):
    try:
        with open(f, 'r', encoding='utf-8') as file:
            txt = file.read()
    except UnicodeDecodeError:
        with open(f, 'r', encoding='latin1') as file:
            txt = file.read()
    
    return txt

df_rows = []
for f in tqdm(news_corpus):
    txt = read_file(f)
    txt_cln = clean_text(txt)
    
    # Dividir el texto en oraciones y limpiar oraciones cortas
    sentences = [f'<s> {s.strip()} </s>' for s in re.split(r'\.\s*', txt_cln) if len(s.strip().split()) > 1]
    
    # Crear filas de DataFrame
    df_rows.extend([{
        'text': s,
        'source': f,
        'length': len(s.split())
    } for s in sentences])
        
        

100%|██████████| 18828/18828 [00:03<00:00, 5696.64it/s]


In [174]:
df_news = DataFrame(df_rows)
df_news

Unnamed: 0,text,source,length
0,<s> nick pettefar bmw battery </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
1,<s> keith hanlan on the wed NUM apr NUM NUM </s>,./raw_data/20news-18828/rec.motorcycles/104315,11
2,<s> NUM gmt wibbled </s>,./raw_data/20news-18828/rec.motorcycles/104315,5
3,<s> in article craig a </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
4,<s> vechorik writes </s>,./raw_data/20news-18828/rec.motorcycles/104315,4
...,...,...,...
410236,<s> janet reno and the fbi have the murder of ...,./raw_data/20news-18828/talk.politics.guns/54348,17
410237,<s> hope they can sleep at night </s>,./raw_data/20news-18828/talk.politics.guns/54348,8
410238,<s> vasilion kb2nmv suny buffalo std </s>,./raw_data/20news-18828/talk.politics.guns/54348,7
410239,<s> all you cult haters happy now </s>,./raw_data/20news-18828/talk.politics.guns/54348,8


In [173]:
df_news.to_parquet('./data/20news.parquet', index=False)