In [1]:
import argparse
import mlflow
import mlflow.sklearn

import gc  # Consumo de memória é muito alto, então chamada
            # ao coletor de lixo ajuda um pouco
import os
import json
import time
import pickle
import importlib
import itertools

import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.stats import randint as sp_randint

import sys
sys.path.append('..')

import util
import upbg


In [2]:
# Timestamp taken when this cell runs; show_spent_time uses it as the
# default baseline (`t0`) for elapsed-time reports.
T0 = time.time()

def sec_exp(t):
    """Break a duration in seconds into days, hours, minutes and seconds.

    Args:
        t: Duration in seconds (int or float); expected to be non-negative.

    Returns:
        dict with keys ``d``, ``h`` and ``m`` (whole units, as ``int``) and
        ``s`` (the remaining seconds, keeping any fractional part).
    """
    d, t = divmod(t, 86400)  # 86400 seconds in a day
    h, t = divmod(t, 3600)
    m, s = divmod(t, 60)
    # Whole units are cast to int so that float inputs (such as differences
    # of time.time() values) are reported as "1d" instead of "1.0d".
    return dict(d=int(d), h=int(h), m=int(m), s=s)

def beautiful_date(dic):
    """Format a duration dict as ``"<d>d, <h>h, <m>m e <s>s"``.

    Args:
        dic: Mapping with optional keys ``d``, ``h``, ``m`` and ``s``.
            Missing keys count as 0.

    Returns:
        The formatted string; seconds are shown with two decimal places.
    """
    # Every field defaults to 0. Previously only 'd' and 'h' had a default:
    # a missing 'm' was rendered as "None" and a missing 's' raised
    # TypeError inside the "{:.2f}" format.
    return "{}d, {}h, {}m e {:.2f}s".format(
        *(dic.get(unit, 0) for unit in ('d', 'h', 'm', 's'))
    )

def show_spent_time(t, s='', t0=T0):
    """Print the time elapsed between ``t0`` and ``t`` in a readable form.

    Args:
        t: End timestamp, in seconds.
        s: Label printed before the duration; when empty, a default label
            is printed instead.
        t0: Start timestamp, in seconds. Defaults to the value ``T0`` had
            when this function was defined.
    """
    label = s if s else "Tempo desde o começo do experimento:"
    elapsed = sec_exp(t - t0)
    print(label, beautiful_date(elapsed))

In [3]:

# Command-line interface. Every option has a default, so the notebook also
# runs when no arguments are given.
parser = argparse.ArgumentParser(description='Run PBG on jurix2020 corpus.')
parser.add_argument('--n_components', type=int, default=100)
parser.add_argument('--alpha', type=float, default=0.005)
parser.add_argument('--beta', type=float, default=0.001)
parser.add_argument('--local_max_itr', type=int, default=1)
parser.add_argument('--global_max_itr', type=int, default=1)
parser.add_argument('--local_threshold', type=float, default=1)
parser.add_argument('--global_threshold', type=float, default=1)
parser.add_argument('--ngram_min', type=int, default=1)
parser.add_argument('--ngram_max', type=int, default=1)
parser.add_argument('--lines_percentage', type=float, default=1,
    help='How many lines will be used for train/test/validation datasets',
)
# Disabled options, kept for reference:
# parser.add_argument('--huge_mem', type=int, default=0,)
# parser.add_argument('--use_spacy', type=int, default=0,)
parser.add_argument('--data_size', type=str, default='small',
    choices=['small', 'medium'],
    help='Which dataset will be used for train/test/validation ',
)

# parse_known_args() collects unrecognised options in `unknown` instead of
# exiting (presumably needed because the Jupyter kernel is launched with
# its own arguments -- TODO confirm).
args, unknown = parser.parse_known_args()
# vars(args) is the namespace's own __dict__: mutating dict_args (pop/clear
# in later cells) also mutates args.
dict_args = vars(args)
print("ARGS: ", args)
# Context manager so the log file is flushed and closed immediately; the
# previous open(...).write(...) left the handle open.
with open('log.txt', 'w') as log_file:
    log_file.write(str(args))

ARGS:  Namespace(alpha=0.005, beta=0.001, data_size='small', global_max_itr=1, global_threshold=1, lines_percentage=1, local_max_itr=1, local_threshold=1, n_components=100, ngram_max=1, ngram_min=1)


191

In [4]:
# Dataset size chosen on the command line. It is popped because dict_args is
# later expanded into the UPBG constructor and must only hold hyperparameters.
DATA_SIZE = dict_args.pop('data_size')

# One CSV per split, all under csv/ and suffixed with the dataset size.
TRAIN_DATA_PATH, TEST_DATA_PATH, VALIDATION_DATA_PATH = (
    'csv/{}_{}.csv'.format(split, DATA_SIZE)
    for split in ('train', 'test', 'validation')
)

# Theme ids that are kept as-is; a later cell maps every other id to 0.
THEMES = [
    5, 6, 26, 33, 139, 163, 232,
    313, 339, 350, 406, 409, 555, 589,
    597, 634, 660, 695, 729, 766, 773,
    793, 800, 810, 852, 895, 951, 975,
]


def groupby_process(df):
    """Merge the page texts of each process into a single document.

    Rows are ordered by ``process_id`` and ``page``, grouped by
    ``(process_id, themes)``, and the ``body`` strings of each group are
    joined with a single space.

    Args:
        df: DataFrame with ``process_id``, ``page``, ``themes`` and ``body``
            columns.

    Returns:
        DataFrame with ``process_id``, ``themes`` and ``body`` columns, one
        row per group, with its index labels converted to ``str``.
    """
    def join_pages(group):
        return group.body.str.cat(sep=' ')

    ordered = df.sort_values(['process_id', 'page'])
    per_process = ordered.groupby(['process_id', 'themes'], group_keys=False)
    merged = per_process.apply(join_pages).reset_index()
    # reset_index() labels the unnamed result column 0; name it "body" again.
    return merged.rename(index=str, columns={0: "body"})

# Note: for quick iteration, limit the number of rows that are loaded.
def get_data(path, preds=None, key=None, lines_per=.02):
    """Load a CSV split and return one row per (process, themes) group.

    Args:
        path: Path of the CSV file to read.
        preds: Unused; only referenced by code that was commented out.
        key: Unused; only referenced by code that was commented out.
        lines_per: Fraction of the rows to keep, counted from the top of the
            file. ``None`` keeps every row.

    Returns:
        The DataFrame produced by ``groupby_process``, with each ``themes``
        entry replaced by the Python object obtained from evaluating it.
    """
    def parse_themes(x):
        # NOTE(review): eval() executes arbitrary code found in the CSV.
        # If the column only holds Python literals, ast.literal_eval would
        # be the safe replacement -- confirm against the data before
        # switching.
        return eval(x)

    frame = pd.read_csv(path)
    if lines_per is not None:
        n_rows = int(lines_per * len(frame))
        frame = frame.iloc[:n_rows]

    # Some files name the page column "pages"; normalise it before grouping.
    frame = groupby_process(frame.rename(columns={'pages': 'page'}))

    frame.themes = frame.themes.apply(parse_themes)
    return frame

def transform_y(train_labels, test_labels):
    """Binarize multi-label targets with a binarizer fitted on the train labels.

    The fitted classes are printed after both label sets are transformed.

    Args:
        train_labels: Iterable of label collections used to fit the binarizer.
        test_labels: Iterable of label collections encoded with the same
            binarizer.

    Returns:
        Tuple ``(train_matrix, test_matrix, binarizer)``.
    """
    binarizer = MultiLabelBinarizer()
    binarizer.fit(train_labels)

    encoded = tuple(
        binarizer.transform(labels) for labels in (train_labels, test_labels)
    )

    print(binarizer.classes_)

    return (*encoded, binarizer)

In [5]:
# Load the train and test splits, keeping only the requested fraction of rows.
train_data = get_data(TRAIN_DATA_PATH, lines_per=dict_args['lines_percentage'])
test_data = get_data(TEST_DATA_PATH, lines_per=dict_args['lines_percentage'])

# groupby_process() leaves a string index ('0', '1', ...), so the first
# document is fetched by position. `body[0]` only worked through the
# integer-key-as-position fallback of Series.__getitem__, which pandas has
# deprecated.
print("MAX_TRAIN_TYPE:", type(train_data.body.iloc[0]))
print("MAX_TRAIN_LEN:", max(map(len, train_data.body)))
print("MAX_TEST_LEN:", max(map(len, test_data.body)))

# The validation split is currently not loaded:
# validation_data = get_data(VALIDATION_DATA_PATH)

# lines_percentage is not a model hyperparameter; remove it so it is not
# passed on when dict_args is expanded into the UPBG constructor.
dict_args.pop('lines_percentage', None)

MAX_TRAIN_TYPE: <class 'str'>
MAX_TRAIN_LEN: 7527991
MAX_TEST_LEN: 1948317


1

In [6]:
def _collapse_themes(theme_ids):
    """Replace ids that are not in THEMES by 0 and drop duplicates."""
    remapped = sorted(i if i in THEMES else 0 for i in theme_ids)
    return list(set(remapped))


train_data.themes = train_data.themes.apply(_collapse_themes)
test_data.themes = test_data.themes.apply(_collapse_themes)

y_train, y_test, mlb = transform_y(train_data.themes, test_data.themes)

# The test frame is not used again in this part of the notebook; free it.
del test_data
gc.collect()

X_train = train_data.body
X_train_themes = train_data.themes

print('X_train: {}, \n\ty_train: {}'.format(X_train.shape, y_train.shape))

# The n-gram bounds are popped (min first, then max) so that they are not
# left in dict_args with the model hyperparameters.
vectorizer = TfidfVectorizer(
    ngram_range=tuple(
        dict_args.pop(bound) for bound in ('ngram_min', 'ngram_max')
    ),
    sublinear_tf=True,
)

# TF-IDF over the raw (not yet preprocessed) training documents.
X_train_vect = vectorizer.fit_transform(X_train)

del X_train
gc.collect()

[  0   5   6  26  33 139 163 232 313 339 350 406 409 555 589 597 634 660
 695 729 766 773 793 800 810 852 895 951 975]
X_train: (2743,), 
	y_train: (2743, 29)


73

In [7]:
# %%time
# util is already imported in the first cell; reload it so that edits to the
# module are picked up without restarting the kernel.
importlib.reload(util)
# DISCLAIMER: this cell can only be executed once (reason unknown; it seems
# to be related to `docs` in SimplePreprocessingBR).

print('preprocessing...')
# Read the extra stop words inside a context manager so the file is closed;
# the previous bare open(...).readlines() left the handle open.
with open('stopwords.txt') as stopwords_file:
    extra_stop_words = [line.lower().strip() for line in stopwords_file]
params = dict(
    use_nltk=True,
    extra_stop_words=extra_stop_words,
)

# pp = util.SimplePreprocessingBR_Lite(**params)
t0_proc_train = time.time()

pp = util.SimplePreprocessing_MemConstrained(**params)
M_train = pp.transform(train_data.body)

show_spent_time(time.time(), "Tempo gasto preprocessando treino:", t0_proc_train)

print("PREPROCESSOU O TREINO")

# M_test = pp.transform(test_data.body)
# print("PREPROCESSOU O TESTE")

print('done.')

# Number of distinct theme ids that occur in the training labels.
categories = set(itertools.chain.from_iterable(train_data.themes))
n_class = len(categories)
print(f'nclass {n_class}')

preprocessing...
done.
nclass 29
CPU times: user 3min 29s, sys: 538 ms, total: 3min 30s
Wall time: 3min 30s


In [8]:
# The preprocessor and the raw training frame are no longer needed; release
# them before the next vectorization step. X_train_themes still references
# the themes column.
del pp, train_data
gc.collect()

27

In [9]:
# Refit the TF-IDF vectorizer, this time on the preprocessed documents.
M_train_vectorized = vectorizer.fit_transform(M_train)

# The preprocessed documents are not needed once they are vectorized.
del M_train
gc.collect()


0

In [10]:

# Reload upbg so that edits to the module are picked up without a restart.
importlib.reload(upbg)

# TfidfVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2. Prefer get_feature_names_out() when it exists and convert
# its array to a plain list, which is what the old method returned.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out().tolist()
else:
    feature_names = vectorizer.get_feature_names()

# At this point dict_args holds whatever options earlier cells did not pop;
# they are all forwarded to UPBG as keyword arguments.
pbg = upbg.UPBG(
    **dict_args,
    feature_names=feature_names,
    debug=True,
)
# dict_args is the argparse namespace's own dict, so this empties `args` too.
dict_args.clear()
del feature_names

In [11]:
# Train the model on the TF-IDF matrix of the preprocessed documents, using
# the per-document theme lists as labels, and report how long it took.
print("fitting...")
t0_fit = time.time()
pbg.fit(M_train_vectorized, X_train_themes)
show_spent_time(time.time(), "Tempo gasto no treinamento:", t0_fit)
print('done')

# Model logging is currently disabled:
# mlflow.sklearn.log_model(pbg, 'pbg_model_spacy')

fitting...


docs processed (itr 0): 100%|##########| 2743/2743 [00:29<00:00, 93.74it/s] 
global propagation:   : 100%|##########| 175273/175273 [03:59<00:00, 732.60it/s] 


done


## Dump do mapeamento tema-tópico

In [12]:

# json is already imported in the first cell. The file is read inside a
# context manager so it is closed; json.load(open(...)) left it open.
with open('temas.json') as temas_file:
    temas = json.load(temas_file)  # presumably theme id (as str) -> theme name

topicos = pbg.get_topics(20)

# Map each theme with a positive id to the topic assigned to it. JSON object
# keys are strings, hence the str(tema) lookup.
tema_topico = {
    temas[str(tema)]: topicos[topico]
    for tema, topico in pbg.map_class_.items()
    if tema > 0
}
mlflow.log_dict(tema_topico, "tema_topico.json")

Abaixo, é feito o dump de `tema_topico` e dos tópicos sem tema, pois, por algum motivo, o dump dos modelos treinados com a base `medium` parece estar falhando.