In [1]:
%load_ext autoreload
%autoreload 2

import os
import sys
import pickle
import pandas as pd
import numpy as np
from joblib import dump, load
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import confusion_matrix, classification_report, hamming_loss, zero_one_loss, accuracy_score
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from ast import literal_eval

In [2]:
# Theme ids that are kept as individual labels (28 ids). The loading cell
# maps every id that is not listed here to the catch-all label 0.
THEMES = [
    5, 6, 26, 33, 139, 163, 232,
    313, 339, 350, 406, 409, 555, 589,
    597, 634, 660, 695, 729, 766, 773,
    793, 800, 810, 852, 895, 951, 975,
]

# Locations of the three CSV splits, relative to the notebook's working directory.
TRAIN_DATA_PATH = '../train.csv'
TEST_DATA_PATH = '../test.csv'
VALIDATION_DATA_PATH = '../validation.csv'

In [3]:
def groupby_process(df):
    """Merge page-level rows into one row per ``(process_id, themes)`` pair.

    Rows are ordered by ``process_id`` and ``page`` so that, inside each group,
    the ``body`` texts are concatenated in page order, separated by a single
    space.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``process_id``, ``page``, ``themes`` and
        ``body``.

    Returns
    -------
    pandas.DataFrame
        Columns ``process_id``, ``themes`` and ``body`` (the merged text),
        with the index labels converted to strings.
    """
    ordered = df.sort_values(['process_id', 'page'])
    grouped = ordered.groupby(['process_id', 'themes'], group_keys=False)

    def _join_pages(group):
        # Concatenate every page body of one group into a single string.
        return group.body.str.cat(sep=' ')

    merged = grouped.apply(_join_pages).reset_index()
    # The unnamed result of `apply` lands in column 0; give it its real name.
    return merged.rename(index=str, columns={0: "body"})

def get_data(path, preds=None, key=None):
    """Load a page-level CSV and return one row per process with parsed themes.

    The file is read with pandas, a ``pages`` column is renamed to ``page``,
    the pages are merged through ``groupby_process`` and every ``themes``
    value is parsed from its string form with ``ast.literal_eval``.

    Parameters
    ----------
    path : str
        Location of the CSV file to read.
    preds, key : optional
        Currently unused. An earlier (now disabled) version used
        ``preds[key]`` to drop the pages predicted as "outros".

    Returns
    -------
    pandas.DataFrame
        The frame produced by ``groupby_process`` with a parsed ``themes``
        column.
    """
    raw_pages = pd.read_csv(path)
    pages = raw_pages.rename(columns={'pages': 'page'})
    merged = groupby_process(pages)
    merged['themes'] = merged['themes'].apply(literal_eval)
    return merged

def transform_y(train_labels, test_labels):
    """Binarize multi-label targets with a binarizer fitted on the train labels.

    The learned classes are printed as a side effect.

    Parameters
    ----------
    train_labels, test_labels : iterable of iterables
        One collection of labels per sample.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix, fitted_binarizer)``.
    """
    binarizer = MultiLabelBinarizer().fit(train_labels)

    # Both splits are encoded with the classes learned from the training labels.
    encoded_train = binarizer.transform(train_labels)
    encoded_test = binarizer.transform(test_labels)

    print(binarizer.classes_)

    return encoded_train, encoded_test, binarizer

In [4]:
# Load the three splits (one row per process, themes parsed into lists).
train_data = get_data(TRAIN_DATA_PATH)
test_data = get_data(TEST_DATA_PATH)
validation_data = get_data(VALIDATION_DATA_PATH)

# Set copy of THEMES for constant-time membership tests.
_KNOWN_THEMES = frozenset(THEMES)


def _collapse_unknown_themes(theme_ids):
    """Map every id outside THEMES to 0 and return the unique ids, ascending.

    The previous inline expression sorted first and then converted to a set,
    which discarded the ordering; sorting the set keeps the result ordered.
    """
    return sorted({i if i in _KNOWN_THEMES else 0 for i in theme_ids})


# Apply the same normalization to every split.
train_data.themes = train_data.themes.apply(_collapse_unknown_themes)
test_data.themes = test_data.themes.apply(_collapse_unknown_themes)
validation_data.themes = validation_data.themes.apply(_collapse_unknown_themes)

# The binarizer is fitted on the training labels only.
y_train, y_test, mlb = transform_y(train_data.themes, test_data.themes)

X_train = train_data.body
X_test = test_data.body
print('X_train: {}, \n\ty_train: {}'.format(X_train.shape, y_train.shape))
print('X_test: {}, \n\ty_test: {}'.format(X_test.shape, y_test.shape))
print('Classes: ', mlb.classes_)
print('We\'re classifying {} themes!'.format(y_train.shape[1]))

[  0   5   6  26  33 139 163 232 313 339 350 406 409 555 589 597 634 660
 695 729 766 773 793 800 810 852 895 951 975]
X_train: (31851,), 
	y_train: (31851, 29)
X_test: (6839,), 
	y_test: (6839, 29)
Classes:  [  0   5   6  26  33 139 163 232 313 339 350 406 409 555 589 597 634 660
 695 729 766 773 793 800 810 852 895 951 975]
We're classifying 29 themes!


In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from xgboost.sklearn import XGBClassifier

# Unigram TF-IDF with sublinear term frequency; terms appearing in fewer than
# 50 documents or in more than half of the documents are dropped.
tfidf_options = dict(ngram_range=(1, 1), sublinear_tf=True, min_df=50, max_df=0.5)
vectorizer = TfidfVectorizer(**tfidf_options)

# One gradient-boosted binary classifier per theme. Parallelism is left to
# XGBoost (n_jobs=-1) while the one-vs-rest wrapper runs sequentially (n_jobs=1).
base_booster = XGBClassifier(
    n_jobs=-1,
    max_depth=4,
    learning_rate=0.1,
    n_estimators=500,
)
xgboost = OneVsRestClassifier(base_booster, n_jobs=1)

In [18]:
from scipy.stats import randint as sp_randint
from scipy.stats import uniform as sp_uni

# Search space for the randomized search. The `estimator__` prefix addresses
# the classifier wrapped by the one-vs-rest model.
param_dist = dict(
    # randint's upper bound is exclusive: depths 1..7 are sampled.
    estimator__max_depth=sp_randint(1, 8),
    estimator__learning_rate=[0.1, 0.3, 0.5],
    estimator__n_estimators=[30, 100, 300, 500, 1000],
)


In [19]:
# Fit the TF-IDF vocabulary on the training text only, then reuse it for the
# validation and test splits. The text is read from the source frames instead
# of from X_train / X_test themselves, so re-running this cell does not feed
# an already-vectorized matrix back into the vectorizer.
X_train = vectorizer.fit_transform(train_data.body)
X_valid = vectorizer.transform(validation_data.body)
X_test = vectorizer.transform(test_data.body)
y_valid = mlb.transform(validation_data.themes)

In [20]:
# Number of terms kept in the fitted TF-IDF vocabulary.
len(vectorizer.vocabulary_)

81424

In [22]:
import inspect

from sklearn.model_selection import RandomizedSearchCV

# 20 sampled candidates, evaluated with 3-fold cross-validation (pinned
# explicitly so the fold count does not depend on the library default).
# refit=False: only the scores are needed here; the final model is trained
# in a later cell.
search_kwargs = dict(
    param_distributions=param_dist,
    n_iter=20,
    cv=3,
    n_jobs=1,
    refit=False,
    verbose=2,
    random_state=42,
)

# `iid` was removed from scikit-learn's search estimators; pass it only to
# versions that still accept it so the call works on both old and new releases.
if 'iid' in inspect.signature(RandomizedSearchCV).parameters:
    search_kwargs['iid'] = False

random_search = RandomizedSearchCV(xgboost, **search_kwargs)
# NOTE: the search is run on the validation split, not on the training split.
random_search.fit(X_valid, y_valid)

Fitting 3 folds for each of 20 candidates, totalling 60 fits
[CV] estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000, total=18.6min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 19.8min remaining:    0.0s


[CV] estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000, total=17.1min
[CV] estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=4, estimator__n_estimators=1000, total=18.8min
[CV] estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000, total=17.7min
[CV] estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000, total=16.4min
[CV] estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=3, estimator__n_estimators=1000, total=18.3min
[CV] estimator__learning_rate=0.1, 

[CV] estimator__learning_rate=0.5, estimator__max_depth=5, estimator__n_estimators=30 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=5, estimator__n_estimators=30, total= 3.5min
[CV] estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500, total= 9.4min
[CV] estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500, total= 8.8min
[CV] estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500 
[CV]  estimator__learning_rate=0.5, estimator__max_depth=2, estimator__n_estimators=500, total= 9.9min
[CV] estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=100 
[CV]  estimator__learning_rate=0.1, estimator__max_depth=4, estimator__n_estimators=100, total= 6.5min
[CV] estimator__learning_rate=0.1, estimator__m

[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed: 645.8min finished


RandomizedSearchCV(cv=None, error_score='raise',
          estimator=OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          n_jobs=1),
          fit_params=None, iid=False, n_iter=20, n_jobs=1,
          param_distributions={'estimator__max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f60bdb47f98>, 'estimator__learning_rate': [0.1, 0.3, 0.5], 'estimator__n_estimators': [30, 100, 300, 500, 1000]},
          pre_dispatch='2*n_jobs', random_state=42, refit=False,
          return_train_score='warn', scoring=None, verbose=2)

In [23]:
# Raw search results: a dict of arrays with one entry per sampled candidate
# (e.g. mean_fit_time, mean_score_time).
random_search.cv_results_



{'mean_fit_time': array([1053.79960982, 1012.17796572,  411.63576015,  464.25849446,
         633.2705272 , 1091.47795343,  261.72865176,  484.03086972,
        1178.29943895,  339.45550481,  259.84234103,  456.94132328,
         158.04578495,  447.44196924,  164.8339657 ,  525.04841661,
         347.27201931,  176.61875502,  642.8788201 ,  614.17706052]),
 'std_fit_time': array([51.08567669, 55.25825137, 36.49884395, 26.60470786, 33.57374011,
        50.75376202, 19.3187573 , 32.28077435, 59.39604779, 23.58725634,
        18.12687872, 25.80619513, 13.48224903, 26.10983934, 12.98115601,
        33.92960511, 29.7745101 , 12.39511817, 44.12709222, 31.81958291]),
 'mean_score_time': array([36.4256444 , 36.7630314 , 37.14055093, 37.08242623, 36.53297575,
        35.77673372, 35.71005948, 35.92802262, 37.55801479, 36.33566658,
        35.80561972, 35.78043906, 34.89083219, 35.69628565, 35.48374176,
        35.65119521, 35.54993876, 36.51039608, 36.81485224, 35.81126372]),
 'std_score_time':

In [36]:
# Keep the best-scoring parameter combination and display it.
best_params = random_search.best_params_
best_params

{'estimator__learning_rate': 0.1,
 'estimator__max_depth': 2,
 'estimator__n_estimators': 500}

In [35]:
# Apply the selected hyper-parameters to the estimator that was searched over
# and display the configured model.
xgboost = random_search.estimator.set_params(**best_params)
xgboost

OneVsRestClassifier(estimator=XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=2, min_child_weight=1, missing=None, n_estimators=500,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=1),
          n_jobs=1)

In [None]:
# Train the configured one-vs-rest model on the TF-IDF features of the training split.
xgboost.fit(X_train, y_train)

In [None]:
# Report labels: the binarizer's classes rendered as strings.
target_names = list(map(str, mlb.classes_))

In [None]:
# Evaluate on the held-out test split: per-theme report first, then the
# accuracy computed over the full label matrix.
preds_test = xgboost.predict(X_test)

test_report = classification_report(
    y_test,
    preds_test,
    target_names=target_names,
    digits=4,
)
print(test_report)

test_accuracy = accuracy_score(y_test, preds_test)
print(test_accuracy)

In [None]:
# `sklearn.externals.joblib` is no longer shipped with scikit-learn; use the
# standalone joblib package, which this notebook already depends on.
import joblib

# joblib.dump does not create missing directories.
os.makedirs('./models', exist_ok=True)

# NOTE(review): 'xgboot' looks like a typo for 'xgboost'; the file name is kept
# unchanged because other code may load the model from this exact path.
joblib.dump(xgboost, './models/tfidf_xgboot.pkl')
joblib.dump(vectorizer, "./models/tfidf_vectorizer.pkl")