In [1]:
import gzip
#import json
import simplejson as json
import pandas as pd
import re
import zipfile

from datetime import datetime

In [2]:
# Every input/output file of this notebook lives under this directory.
data_dirpath = '../data'
# Resume dump; read further down with gzip, one JSON document per line.
cvs_filepath = '{}/resumes_we2_edu1_about100.txt.zip'.format(data_dirpath)
# Zip archive that holds the word-embedding text file.
embedding_filepath = '{}/glove_s300.zip'.format(data_dirpath)

In [3]:
def get_valid_tokens(embedding_filepath):
    """Collect the vocabulary of a zipped word-embedding text file.

    The archive is expected to contain a member named ``glove_s300.txt``.
    Its first line is a header with two space-separated fields; every other
    line starts with the token, followed by a space and the embedding values.

    Args:
        embedding_filepath: Path to the zip archive.

    Returns:
        A set holding the first space-delimited field of every non-header
        line. The set is empty when the archive has no ``glove_s300.txt``
        member.

    Raises:
        ValueError: If the header does not have exactly two fields, or a
            data line contains no space.
    """
    member_name = 'glove_s300.txt'
    valid_tokens = set()

    # The context managers close the archive and the member stream on every
    # path, including the early return and any decoding/parsing error.
    with zipfile.ZipFile(embedding_filepath, 'r') as archive:
        if member_name not in archive.namelist():
            # Same (empty) type as the normal result.
            return valid_tokens

        # Read line-by-line due to filesize (~3gb in raw text)
        with archive.open(member_name) as member:
            for row, raw_line in enumerate(member):
                line = raw_line.decode('utf-8')
                if row == 0:  # HEADER
                    # The header values themselves are not needed; only
                    # check that the line really looks like a header.
                    if len(line.split(' ')) != 2:
                        raise ValueError(
                            'Unexpected embedding header: ' + repr(line))
                else:
                    # Unpacking raises ValueError on a line without a space.
                    token_name, _ = line.split(' ', 1)
                    valid_tokens.add(token_name)
    return valid_tokens

In [4]:
# Vocabulary of the embedding archive. tokenize_text() replaces every word
# that is not in this collection with '<unk>'. The loader reads the whole
# embedding text file line by line, so this cell is not instantaneous.
token_set = get_valid_tokens(embedding_filepath)

In [5]:
# REFERENCE:
# # https://github.com/nathanshartmann/portuguese_word_embeddings/blob/master/preprocessing.py

# Punctuation list, regex-escaped so it can be interpolated into the
# character classes of the re_punkts* patterns below.
# Note that '$', '&' and '-' are not part of the list.
punctuations = re.escape('!"#%\'()*+,./:;<=>?@[\\]^_`{|}~')

# ##### #
# Regex #
# ##### #
# Greedy: from the first '{' to the last '}' ('.' does not cross newlines).
re_remove_brackets = re.compile(r'\{.*\}')
# '<', an optional '/' or '\', then the shortest run of characters up to '>'.
re_remove_html = re.compile(r'<(\/|\\)?.+?>', re.UNICODE)
# A single digit; every digit is matched on its own.
re_transform_numbers = re.compile(r'\d', re.UNICODE)
# A run of non-whitespace with at least one character on each side of '@'.
re_transform_emails = re.compile(r'[^\s]+@[^\s]+', re.UNICODE)
# 'http://' or 'https://' followed by non-whitespace characters.
re_transform_url = re.compile(r'(http|https)://[^\s]+', re.UNICODE)
# Different quotes are used.
# re_quotes_1: a single-quote-like character at the start of the string or
#              right after a non-word character (group 1 = what preceded it).
# re_quotes_2: a single-quote-like character right before a non-word
#              character or the end of the string (group 1 = what followed).
# re_quotes_3: any curly quote, backtick or prime character.
re_quotes_1 = re.compile(r"(?u)(^|\W)[‘’′`']", re.UNICODE)
re_quotes_2 = re.compile(r"(?u)[‘’`′'](\W|$)", re.UNICODE)
re_quotes_3 = re.compile(r'(?u)[‘’`′“”]', re.UNICODE)
# Exactly two consecutive dots (not part of a longer run of dots).
re_dots = re.compile(r'(?<!\.)\.\.(?!\.)', re.UNICODE)
# Two characters out of , " ; : followed by a comma; group 1 holds the
# second of the two.
re_punctuation = re.compile(r'([,";:]){2},', re.UNICODE)
# ' -' directly followed by a letter ([^\W\d_] is a word character that is
# neither a digit nor '_').
re_hiphen = re.compile(r' -(?=[^\W\d_])', re.UNICODE)
# The single-character ellipsis.
re_tree_dots = re.compile(u'…', re.UNICODE)
# Different punctuation patterns are used.
# re_punkts:   word, punctuation mark, then a space or punctuation mark.
# re_punkts_b: space or punctuation mark, punctuation mark, then a word.
# re_punkts_c: word followed by a punctuation mark at the end of the string.
re_punkts = re.compile(r'(\w+)([%s])([ %s])' %
                       (punctuations, punctuations), re.UNICODE)
re_punkts_b = re.compile(r'([ %s])([%s])(\w+)' %
                         (punctuations, punctuations), re.UNICODE)
re_punkts_c = re.compile(r'(\w+)([%s])$' % (punctuations), re.UNICODE)
# The non-ASCII dash character.
re_changehyphen = re.compile(u'–')
# Two consecutive double quotes / two consecutive single quotes.
re_doublequotes_1 = re.compile(r'(\"\")')
re_doublequotes_2 = re.compile(r'(\'\')')
# One or more whitespace characters.
re_trim = re.compile(r'\s+', re.UNICODE)

def tokenize_text(text, token_set):
    """Normalise a string and split it into vocabulary tokens.

    The text is lower-cased and run through the module-level regexes, in
    this order: line breaks and non-breaking spaces become spaces, ellipses
    are dropped, '{...}' spans and HTML-like tags are removed, every digit
    becomes '0', URLs become 'URL', e-mail-like runs become 'EMAIL', all
    quote variants are normalised and then removed, punctuation is separated
    from neighbouring words by spaces, and whitespace is collapsed.

    Args:
        text: The string to tokenize.
        token_set: Collection of known tokens. When it is empty (or None)
            no vocabulary filtering is applied.

    Returns:
        A list of tokens obtained by splitting the normalised text on single
        spaces. Tokens missing from a non-empty `token_set` are replaced by
        '<unk>'. Text that normalises to the empty string yields a
        one-element list, because ''.split(' ') returns [''].
    """
    text = text.lower()
    text = re.sub(r'\r?\n', ' ', text)
    text = text.replace('\xa0', ' ')
    text = re_tree_dots.sub('...', text)
    # Raw string: '\.' inside a plain literal is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on recent Python versions).
    text = re.sub(r'\.\.\.', '', text)
    text = re_remove_brackets.sub('', text)
    text = re_changehyphen.sub('-', text)
    text = re_remove_html.sub(' ', text)
    text = re_transform_numbers.sub('0', text)
    # The placeholders are inserted after lower-casing, so they stay upper
    # case in the output.
    text = re_transform_url.sub('URL', text)
    text = re_transform_emails.sub('EMAIL', text)
    # Normalise every quote variant to '"' first, then drop all of them.
    text = re_quotes_1.sub(r'\1"', text)
    text = re_quotes_2.sub(r'"\1', text)
    text = re_quotes_3.sub('"', text)
    text = re.sub('"', '', text)
    text = re_dots.sub('.', text)
    text = re_punctuation.sub(r'\1', text)
    text = re_hiphen.sub(' - ', text)
    # Put spaces around punctuation that touches a word.
    text = re_punkts.sub(r'\1 \2 \3', text)
    text = re_punkts_b.sub(r'\1 \2 \3', text)
    text = re_punkts_c.sub(r'\1 \2', text)
    # No '"' is left at this point, so only the single-quote variant of the
    # two substitutions below can still match.
    text = re_doublequotes_1.sub('\"', text)
    text = re_doublequotes_2.sub('\'', text)
    text = re_trim.sub(' ', text)
    text = text.strip()
    return [word if not token_set or word in token_set else '<unk>'
            for word in text.split(' ')]

In [6]:
# strptime layout of the timestamps handled by format_date().
date_format = '%Y-%m-%dT%H:%M:%S%z'
def format_date(date_str):
    """Turn a timestamp written in `date_format` into 'MM/YYYY'.

    The month is left-padded with a zero to two digits; the year is written
    as-is. `datetime.strptime` raises if `date_str` does not match the
    layout.
    """
    parsed = datetime.strptime(date_str, date_format)
    if parsed.month < 10:
        template = '0{}/{}'
    else:
        template = '{}/{}'
    return template.format(parsed.month, parsed.year)

# Every token produced by tokenize_cv() is accumulated here (module-level
# side effect of the function).
cv_tokens_set = set()
def tokenize_cv(cv_data, token_set=None):
    """Recursively tokenize every string inside a parsed CV document.

    Args:
        cv_data: A JSON-like value: str, list, dict, number or None.
        token_set: Collection of known tokens, forwarded to tokenize_text().
            None (the default) or an empty collection disables vocabulary
            filtering.

    Returns:
        A new structure of the same shape as `cv_data`:
        * str    -> its tokens joined by single spaces (the tokens are also
                    added to the module-level `cv_tokens_set`);
        * list   -> a list with every entry processed;
        * dict   -> a new dict with every value processed; the values of the
                    'dateInit' and 'dateEnd' keys go through format_date()
                    first unless they are None;
        * int / float (bool included) -> returned unchanged;
        * any other falsy value (e.g. None) -> None.
        The input is never modified.

    Raises:
        ValueError: For a truthy value of any other type.
    """
    if isinstance(cv_data, str):
        tokens = tokenize_text(cv_data, token_set)
        cv_tokens_set.update(tokens)
        return ' '.join(tokens)

    if isinstance(cv_data, list):
        return [tokenize_cv(entry, token_set) for entry in cv_data]

    if isinstance(cv_data, dict):
        processed_cv = {}
        for key, value in cv_data.items():
            # A None date is passed through (and mapped to None below)
            # instead of being handed to strptime, which would raise.
            if key in ('dateInit', 'dateEnd') and value is not None:
                # NOTE(review): the formatted 'MM/YYYY' string is tokenized
                # like any other string below, and tokenize_text() replaces
                # every digit with '0' - confirm this is intended.
                value = format_date(value)
            processed_cv[key] = tokenize_cv(value, token_set)
        return processed_cv

    # Numbers carry no text: keep them as they are. Checked before the
    # falsy test so that 0 and 0.0 are not turned into None.
    if isinstance(cv_data, (int, float)):
        return cv_data

    if not cv_data:
        return None

    raise ValueError('Type not mapped: ' + str(type(cv_data)))

In [8]:
# Start from an empty token collection so that re-running this cell does not
# keep tokens gathered by a previous run.
cv_tokens_set = set()

# One JSON string per processed resume, in input order.
cvs = []
# The resume file is opened as gzip text, one JSON document per line.
with gzip.open(cvs_filepath, mode='rt', encoding='utf8') as z:
    for i, line in enumerate(z):
        if i % 10000 == 0:
            print(i)  # progress indicator
        cv = json.loads(line)
        cv = tokenize_cv(cv, token_set)
        cvs.append(json.dumps(cv))

# The output is written gzip-compressed although its name ends in '.txt';
# after it has been saved, rename the file from '.txt' to '.txt.zip'.
# The handle must not be named `zipfile`: that would rebind the imported
# `zipfile` module and break get_valid_tokens() when its cell is re-run.
with gzip.open(data_dirpath + '/resumes_we2_edu1_about100_[preprocessed].txt', mode='w') as out_file:
    out_file.write('\n'.join(cvs).encode('utf-8'))

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000
210000
220000
230000
240000
250000
260000
270000
280000
290000
300000
310000
320000
330000
340000
350000
360000
370000
380000


In [9]:
# Dump the collected tokens, one per line. They are written in sorted order
# so the file is identical from run to run: the iteration order of a set of
# strings changes between interpreter sessions (hash randomisation).
with open(data_dirpath + '/resumes_we2_edu1_about100_[tokens].txt', 'w', encoding='utf-8') as fp:
    fp.writelines(token + '\n' for token in sorted(cv_tokens_set))