In [2]:
from IPython.core.display import display, HTML
import datetime
def info(str_):
    """Print ``str_`` on a timestamped line tagged with a bold bright-blue INFO label."""
    timestamp = datetime.datetime.now()
    label = '\033[1;94mINFO\x1b[0m'
    print(f'{timestamp} [ {label}  ] {str_}')
def ok(str_):
    """Print ``str_`` on a timestamped line tagged with a bold bright-green OK label."""
    timestamp = datetime.datetime.now()
    label = '\033[1;92mOK\x1b[0m'
    print(f'{timestamp} [  {label}   ] {str_}')
def warning(str_):
    """Print ``str_`` on a timestamped line tagged with a bold red WARNING label."""
    timestamp = datetime.datetime.now()
    label = '\x1b[1;31mWARNING\x1b[0m'
    print(f'{timestamp} [{label}] {str_}')
def html(str_=''):
    """Wrap ``str_`` in an IPython ``HTML`` object and hand it to ``display``.

    Called with no argument it displays an empty HTML fragment.
    """
    fragment = HTML(str_)
    display(fragment)

In [3]:
import pandas as pd
from lxml import etree
from bs4 import BeautifulSoup

def get_date(filename):
    """Return the text of the first ``<NumericDate>`` element of an XML file.

    Args:
        filename: path of the XML document to parse.

    Returns:
        The element's text, exactly as stored in the file.

    Raises:
        AssertionError: if the document has no ``<NumericDate>`` element, or
            the element has no text.
    """
    root = etree.parse(filename).getroot()
    date_node = root.find('.//NumericDate')

    # Check the element before touching ``.text``: previously a missing
    # element failed with an AttributeError on None, so the assertion below
    # could never report it.
    assert date_node is not None, f'{filename}: no <NumericDate> element found'

    date = date_node.text
    assert date is not None, f'{filename}: <NumericDate> element has no text'

    return date

def get_title_and_text(filename):
    """Extract the title and the plain-text body from an XML article file.

    The body is taken from the first ``<HiddenText>`` element; ``<Text>`` is
    used only when the document has no ``<HiddenText>`` element at all. The
    body is then passed through BeautifulSoup to strip markup.

    Args:
        filename: path of the XML document to parse.

    Returns:
        A ``(title, text)`` tuple. Either item is ``None`` when the
        corresponding element is missing or has no text, so callers must not
        concatenate the two without checking.
    """
    root = etree.parse(filename).getroot()

    # Compare against None explicitly: element truthiness depends on the
    # number of children, not on whether the element was found.
    body_node = root.find('.//HiddenText')
    if body_node is None:
        body_node = root.find('.//Text')
    text = body_node.text if body_node is not None else None

    title_node = root.find('.//Title')
    title = title_node.text if title_node is not None else None

    if text is not None:
        # ``features`` is the BeautifulSoup argument that selects the parser;
        # ``parser=`` is not a parser-selection argument, so the original
        # call never actually requested html.parser.
        text = BeautifulSoup(text, features='html.parser').get_text()

    return title, text

In [4]:
import os
# Corpus directories. Each path keeps its trailing slash because file names
# are appended to it by plain string concatenation below.
GM_all_part1 = '/home/ec2-user/SageMaker/data/GM_all_1945_1956/'
GM_all_part2 = '/home/ec2-user/SageMaker/data/GM_all_1957-1967/'
GM_dp_dirpath = '/home/ec2-user/SageMaker/data/GM_DP_and_Canada1945_1967/'

# Full corpus: every entry of part 1 followed by every entry of part 2.
all_files = [
    corpus_dir + entry
    for corpus_dir in (GM_all_part1, GM_all_part2)
    for entry in os.listdir(corpus_dir)
]

# Subset stored in the GM_DP_and_Canada directory.
dp_files = [GM_dp_dirpath + entry for entry in os.listdir(GM_dp_dirpath)]

# Earlier variant kept for reference only (TS_dirpath is not defined in the
# visible cells, so these lines cannot simply be re-enabled):
# GM_dirpath = '/home/ec2-user/SageMaker/data/The_Globe_and_Mail_with_DP_filter_by_article_type/'
# all_files = [TS_dirpath+file_id for file_id in os.listdir(TS_dirpath)]
# all_files += [GM_dirpath+file_id for file_id in os.listdir(GM_dirpath)]

info(f'len(all_files):       {len(all_files):10,}')
info(f'len(dp_files):        {len(dp_files):10,}')

2022-02-22 21:43:38.829135 [ [1;94mINFO[0m  ] len(all_files):        2,079,786
2022-02-22 21:43:38.829325 [ [1;94mINFO[0m  ] len(dp_files):             6,938


In [5]:
import spacy
import pickle
import string
from tqdm import tqdm
# tokenize() only reads each token's text and its `is_stop` flag, so the
# listed pipeline components are switched off when the model is loaded.
nlp = spacy.load(
    'en_core_web_lg',
    disable=['textcat', 'lemmatizer', 'parser', 'tagger', 'ner'],
)

def remove_punctuation(word):
    """Return ``word`` with every ASCII punctuation character and space removed."""
    dropped = string.punctuation + ' '
    return ''.join(char for char in word if char not in dropped)

def tokenize(str_):
    """Split ``str_`` into lower-cased word tokens using the module-level ``nlp``.

    A token is dropped when it is flagged as a stop word, is purely numeric,
    consists only of punctuation/spaces, or is whitespace-only once newlines
    have been removed.

    Args:
        str_: the text to tokenize.

    Returns:
        A list of lower-cased token strings with newline characters removed.
        The list never contains empty strings.
    """
    tokens = []
    for word in nlp(str_):
        if word.is_stop:
            continue
        token = word.text.lower()
        if token.isnumeric() or len(remove_punctuation(token)) == 0:
            continue
        token = token.replace('\n', '')
        # A token such as '\n\n' passes the punctuation check above (newlines
        # are not punctuation) and used to come out as ''. Filter after the
        # newline removal so empty tokens are never emitted.
        if token.strip() == '':
            continue
        tokens.append(token)
    return tokens



def build_vocab(file_list, umbral=None, threshold=10000):
    """Return the most frequent tokens of a corpus, using an on-disk cache.

    If ``cache/vocab.p`` exists it is unpickled and returned as-is. Otherwise
    every file in ``file_list`` is tokenized, token frequencies are counted,
    and the result is pickled to ``cache/vocab.p`` before being returned.

    Args:
        file_list: paths of the XML files to read.
        umbral: unused; kept so existing calls keep working.
        threshold: maximum number of words in the returned vocabulary.

    Returns:
        A list of at most ``threshold`` words ordered by decreasing frequency.
        Only words that occur at least twice across ``file_list`` and are at
        least 3 characters long are eligible.
    """
    cache_path = 'cache/vocab.p'
    if os.path.isfile(cache_path):
        # NOTE(review): the cache does not record `file_list` or `threshold`,
        # so a stale file is returned even when the arguments change; delete
        # cache/vocab.p to force a rebuild.
        # pickle can execute arbitrary code on load - only use a cache file
        # written by this notebook.
        with open(cache_path, 'rb') as cache_file:
            return pickle.load(cache_file)

    seen_once = set()
    freq = {}
    for file_ in tqdm(file_list):
        title, text = get_title_and_text(file_)
        # Either value can be None (element missing or empty); treat it as an
        # empty string instead of crashing on the concatenation.
        tokens = tokenize((title or '') + ' ' + (text or ''))
        for token in tokens:
            if token in freq:
                freq[token] += 1
            elif token in seen_once:
                # Second sighting: the token enters the frequency table.
                freq[token] = 2
            else:
                seen_once.add(token)

    word_frequency_list = [
        (word, count)
        for word, count in freq.items()
        if word.strip() != '' and len(word) >= 3
    ]
    # Stable sort: words with equal counts keep their insertion order.
    word_frequency_list.sort(key=lambda pair: pair[1], reverse=True)
    vocab = [word for word, _ in word_frequency_list[:threshold]]

    # Create the cache directory first so a long run cannot fail on the dump.
    os.makedirs(os.path.dirname(cache_path), exist_ok=True)
    with open(cache_path, 'wb') as cache_file:
        pickle.dump(vocab, cache_file)
    return vocab
# Build the vocabulary from the dp_files subset only (build_vocab returns the
# contents of cache/vocab.p instead when that file already exists).
vocab = build_vocab(dp_files)
# Last expression of the cell: shows the vocabulary size.
len(vocab)

10000

In [6]:
import spacy
import pickle
from tqdm import tqdm
import numpy as np

# Map each vocabulary word to its position in `vocab`; process_file uses this
# position as the word's slot in the bag-of-words vector.
word2idx = {word: position for position, word in enumerate(vocab)}



def process_file(file_):
    """Write the bag-of-words vector of one article next to its source file.

    The vector has ``len(vocab) + 1`` float32 entries: entry ``i`` counts the
    occurrences of ``vocab[i]`` and the last entry counts every token that is
    not in the vocabulary. It is pickled to the input path with its last four
    characters replaced by ``.bow_vector``.

    Args:
        file_: path of the XML file to process.

    Returns:
        None. The result is written to disk.
    """
    # Drops the last four characters of the path - presumably a ".xml"
    # extension; TODO confirm every input path ends that way.
    output = file_[:-4] + '.bow_vector'
    x = np.zeros(shape=(len(vocab)+1), dtype='float32')
    title, text = get_title_and_text(file_)

    # Either value can be None (element missing or empty); treat it as an
    # empty string instead of crashing on the concatenation.
    tokens = tokenize((title or '') + ' ' + (text or ''))
    for token in tokens:
        # Index -1 is the extra last slot reserved for out-of-vocabulary tokens.
        x[word2idx.get(token, -1)] += 1

    with open(output, 'wb') as out_file:
        pickle.dump(x, out_file)


In [None]:
import concurrent.futures

info('Starting...')

# Collect every task's outcome. The iterator returned by `executor.map` was
# never consumed, so any exception raised inside process_file was silently
# discarded and 'Done!' was printed even if files failed.
MAX_LOGGED_FAILURES = 20  # keep the cell output readable on a large corpus
failed_count = 0

# NOTE(review): worker threads share the GIL, so CPU-bound tokenization may
# not get faster with more threads - consider a process pool if this is slow.
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    future_to_file = {
        executor.submit(process_file, file_): file_ for file_ in all_files
    }
    for future in concurrent.futures.as_completed(future_to_file):
        error = future.exception()
        if error is None:
            continue
        failed_count += 1
        if failed_count <= MAX_LOGGED_FAILURES:
            warning(f'{future_to_file[future]}: {error!r}')

if failed_count:
    warning(f'{failed_count:,} of {len(all_files):,} files failed')
else:
    ok('Done!')

2022-02-22 21:45:06.189658 [ [1;94mINFO[0m  ] Starting...


In [None]:
!du -hs