In [32]:
from glob import glob
from tqdm import tqdm
from pandas import DataFrame, Series
import re
import numpy as np
from random import random

from collections import Counter, defaultdict
from itertools import chain

from aux_functions import read_process_file
from aux_functions import n_grams
from aux_functions import clean_text

from importlib import import_module
import_module('aux_functions')
import_module('aux_functions.read_process_file')
import_module('aux_functions.n_grams')

<module 'aux_functions.n_grams' from '/home/raul/Escritorio/extra/misis/nlp/nlp/HW02/aux_functions/n_grams.py'>

## 20 News

In [2]:
files_20n = glob('./raw_data/20news-18828/*/*')
len(files_20n)

18828

In [3]:
df_news_rows = []
for f in tqdm(files_20n):
    txt = read_process_file.read_file(f)
    txt_cln = read_process_file.clean_text(txt)
    
    # Dividir el texto en oraciones y limpiar oraciones cortas
    sentences = [f'<s> {s.strip()} </s>' for s in re.split(r'\.\s*', txt_cln) if len(s.strip().split()) > 1]
    
    # Crear filas de DataFrame
    df_news_rows.extend([{
        'text': s,
        'source': f,
        'length': len(s.split())
    } for s in sentences])
        
        

  0%|          | 0/18828 [00:00<?, ?it/s]

100%|██████████| 18828/18828 [00:07<00:00, 2378.29it/s]


In [4]:
df_news = DataFrame(df_news_rows)
df_news

Unnamed: 0,text,source,length
0,<s> nick pettefar bmw battery </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
1,<s> keith hanlan on the wed NUM apr NUM NUM </s>,./raw_data/20news-18828/rec.motorcycles/104315,11
2,<s> NUM gmt wibbled </s>,./raw_data/20news-18828/rec.motorcycles/104315,5
3,<s> in article craig a </s>,./raw_data/20news-18828/rec.motorcycles/104315,6
4,<s> vechorik writes </s>,./raw_data/20news-18828/rec.motorcycles/104315,4
...,...,...,...
405879,<s> janet reno and the fbi have the murder of ...,./raw_data/20news-18828/talk.politics.guns/54348,17
405880,<s> hope they can sleep at night </s>,./raw_data/20news-18828/talk.politics.guns/54348,8
405881,<s> vasilion kb2nmv suny buffalo std </s>,./raw_data/20news-18828/talk.politics.guns/54348,7
405882,<s> all you cult haters happy now </s>,./raw_data/20news-18828/talk.politics.guns/54348,8


In [5]:
df_news.to_parquet('./data/20news.parquet', index=False)

##### Train - Test Split (20N)

In [6]:
# Muestra del 80% de los datos para entrenar
df_news_train = df_news.sample(frac=0.8, random_state=42)
# el resto para evaluar
df_news_test = df_news.drop(df_news_train.index)

df_news_train.to_parquet('./data/train_test/20news_train.parquet', index=False)
df_news_test.to_parquet('./data/train_test/20news_test.parquet', index=False)

### N-Gram model and prob inference

In [7]:
n = 2
ngram_counts, final_unigram = n_grams.create_ngram_model(n_gram=n,
                                                         text_corpus=df_news_train.text)

In [8]:
# Con este proceso, practicamnte ya esta servido el calculo de la perplejidad
[n_grams.estimate_probability(token_text=i, n_gram=n, 
                              final_unigram=final_unigram, 
                              ngram_counts=ngram_counts) for i in n_grams.create_ngrams(sentence=df_news.text.iloc[0],n=n)]

[0.0002865013194831411,
 0.0007686257686257686,
 1.7924359204158452e-05,
 7.138013490845498e-05,
 0.0009812142079817317]

('<s>', 'uk')

In [19]:
# Generar texto
n_gram_probs = {k: n_grams.estimate_probability(token_text=k,
                             n_gram=n,
                             final_unigram=final_unigram,
                             ngram_counts=ngram_counts
                             ) for k in ngram_counts.keys()}

In [79]:
# Create sentences 

sentence_length = 20
initial_token = '<s>'
sentence = []
for _ in range(sentence_length):
    next_token = [(k,i) for k,i in n_gram_probs.items() if k[0]==initial_token]
    next_token = sorted(next_token, key= lambda x: x[1], reverse=True)
    token = next_token[0][0][1]
    i = 0
    # FIXME: <UNK> is not in the vocab. In the vocab is (<UNK>,)
    while token in ['NUM','<UNK>','</s>']: 
        token = next_token[i][0][1]
        i += 1
        if random()<.8:
            token='NUM'
        
    sentence.append(token)
    
    initial_token = token
    
sentence

['the',
 'same',
 'thing',
 'as',
 'a',
 'new',
 'york',
 'islanders',
 'and',
 'the',
 'same',
 'thing',
 'about',
 'the',
 'same',
 'thing',
 'then',
 'the',
 'same',
 'thing']

## BAC

In [318]:
files_bac = glob('./raw_data/blogs/*')
len(files_bac)

19320

In [324]:
df_bac_rows = []
for f in tqdm(files_bac):
    df_bac_rows.extend(read_process_file.extract_and_process_text_from_xml(f))
    


100%|██████████| 19320/19320 [01:38<00:00, 196.73it/s]


In [325]:
df_bac = DataFrame(df_bac_rows)
df_bac

Unnamed: 0,text,length
0,<s> only NUM days NUM hour NUM minutes and NUM...,18
1,<s> cant wait </s>,4
2,<s> and this time jeans gonna kick some ass </s>,10
3,<s> poor lucy </s>,4
4,<s> she always had a huge smile on her face bu...,23
...,...,...
9509671,<s> i can come off sweet and nice but i can be...,19
9509672,<s> soulfish stew is primarily for me but if o...,18
9509673,<s> i hope to make an interesting and readable...,11
9509674,<s> the college era drinking may have stopped ...,15


In [326]:
df_bac.to_parquet('./data/bac.parquet', index=False)

##### Train - Test Split (BAC)

In [332]:
# sample 80% of the data
df_bac_train = df_bac.sample(frac=0.8, random_state=42)
# the rest of the data is for testing
df_bac_test = df_bac.drop(df_bac_train.index)

df_bac_train.to_parquet('./data/train_test/bac_train.parquet', index=False)
df_bac_test.to_parquet('./data/train_test/bac_test.parquet', index=False)