In [1]:
%load_ext autoreload
%autoreload 2

import requests
from bs4 import BeautifulSoup
from tqdm import tqdm_notebook as tqdm
import time
import numpy as np
import pandas as pd
import nltk   
import unicodedata
from html.parser import HTMLParser
import re
from tqdm import tqdm
import glob
from utils import *
import pathlib
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import LeaveOneGroupOut
from sklearn.metrics import precision_recall_fscore_support
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler

import requests
import pickle
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the labelled dataset and replace every missing cell with a single space.
df = pd.read_csv('text_store_label.csv').fillna(" ")

# Expose the columns as plain NumPy arrays:
#   u_class - store name of each row (used as the cross-validation group)
#   l       - link column
#   X       - text column (model input)
#   y       - label column (model target)
u_class, l, X, y = (df[column].values for column in ('store', 'link', 'text', 'label'))

# Splitter that holds out all rows of one group per fold.
logo = LeaveOneGroupOut()

In [3]:
# Classifiers benchmarked by the evaluation loop.
clss = [
    GaussianNB(),
    DecisionTreeClassifier(),
    SVC(gamma='auto'),
    MLPClassifier(hidden_layer_sizes=(256, 64, 32)),
    LogisticRegression(solver='lbfgs'),
    RandomForestClassifier(n_estimators=100, n_jobs=3),
]

# Per-metric accumulators (not referenced by the visible cells).
acc_results, pre_results, rec_results = [], [], []

# Metric names, in the order each fold's measurements are recorded.
metrics = ['Acuracy', 'Precision','Recall', 'Train Time']

# Flat array of "<ClassifierName> <metric>" column labels, grouped by classifier.
header = np.array([
    type(estimator).__name__ + " " + metric
    for estimator in clss
    for metric in metrics
])

In [4]:
# Inspect the per-row store names; this array is passed as the `groups`
# argument of the leave-one-group-out splits.
u_class

array(['magazineluiza', 'amazon', 'amazon', ..., 'ricardoeletro',
       'ricardoeletro', 'ricardoeletro'], dtype=object)

In [5]:
# Portuguese stop words handed to CountVectorizer(stop_words=...) whenever the
# stop-word option is switched on. A few entries appear more than once
# (e.g. "está", "há", "foi").
stop_words_pt = ["de","a","o","que","e","do","da","em","um","para","é","com","não","uma","os","no",
                 "se","na","por","mais","as","dos","como","mas","foi","ao","ele","das","tem","à","seu",
                 "sua","ou","ser","quando","muito","há","nos","já","está","eu","também","só","pelo","pela",
                 "até","isso","ela","entre","era","depois","sem","mesmo","aos","ter","seus","quem","nas","me",
                 "esse","eles","estão","você","tinha","foram","essa","num","nem","suas","meu","às","minha","têm",
                 "numa","pelos","elas","havia","seja","qual","será","nós","tenho","lhe","deles","essas","esses",
                 "pelas","este","fosse","dele","tu","te","vocês","vos","lhes","meus","minhas","teu","tua","teus",
                 "tuas","nosso","nossa","nossos","nossas","dela","delas","esta","estes","estas","aquele","aquela",
                 "aqueles","aquelas","isto","aquilo","estou","está","estamos","estão","estive","esteve","estivemos",
                 "estiveram","estava","estávamos","estavam","estivera","estivéramos","esteja","estejamos","estejam",
                 "estivesse","estivéssemos","estivessem","estiver","estivermos","estiverem","hei","há","havemos","hão",
                 "houve","houvemos","houveram","houvera","houvéramos","haja","hajamos","hajam","houvesse","houvéssemos",
                 "houvessem","houver","houvermos","houverem","houverei","houverá","houveremos","houverão","houveria",
                 "houveríamos","houveriam","sou","somos","são","era","éramos","eram","fui","foi","fomos","foram","fora",
                 "fôramos","seja","sejamos","sejam","fosse","fôssemos","fossem","for","formos","forem","serei","será",
                 "seremos","serão","seria","seríamos","seriam","tenho","tem","temos","tém","tinha","tínhamos","tinham",
                 "tive","teve","tivemos","tiveram","tivera","tivéramos","tenha","tenhamos","tenham","tivesse","tivéssemos",
                 "tivessem","tiver","tivermos","tiverem","terei","terá","teremos","terão","teria","teríamos","teriam"]

In [None]:
# Benchmark every classifier in `clss` under all 2 x 2 x 2 combinations of the
# text pre-processing switches (lowercasing, stop-word removal, TF-IDF),
# using leave-one-group-out cross-validation with the store as the group.
# One CSV is written per combination:
#     results_<use_lowercase>_<use_stopwords>_<use_tfidf>.csv
# Each CSV has one row per fold and four columns per classifier
# (accuracy, macro precision, macro recall, fit time in seconds).
opt = [0, 1]
for use_lowercase in opt:
    for use_stopwords in opt:
        stop_words = stop_words_pt if use_stopwords else None
        for use_tfidf in tqdm(opt):

            # One (n_folds, 4) array per classifier; joined column-wise below.
            # Collecting them in a list avoids comparing an ndarray with `[]`,
            # which is a shape-mismatched comparison (an error on recent NumPy).
            per_classifier_metrics = []
            for cls in clss:
                text_clf = Pipeline([
                    ('vect', CountVectorizer(lowercase=use_lowercase, stop_words=stop_words)),
                    # A `None` step is skipped by the pipeline.
                    ('tfidf', TfidfTransformer() if use_tfidf else None),
                    # DenseTransformer comes from `utils`; presumably it turns the
                    # sparse count matrix into a dense array -- TODO confirm.
                    ('tranf', DenseTransformer()),
                    ('clf', cls),
                ])
                model_metrics = []

                for train_index, test_index in logo.split(X, y, u_class):
                    X_train, X_test = X[train_index], X[test_index]
                    y_train, y_test = y[train_index], y[test_index]

                    # Time only the fit call.
                    start_time = time.time()
                    text_clf.fit(X_train, y_train)
                    train_time = time.time() - start_time

                    # `np.int` was removed from NumPy; the builtin `int` is equivalent.
                    y_pred = text_clf.predict(X_test).astype(int)
                    precision, recall, fscore, _ = precision_recall_fscore_support(
                        y_test, y_pred, average='macro')

                    acc = np.mean(y_pred == y_test)
                    model_metrics.append([acc, precision, recall, train_time])

                per_classifier_metrics.append(np.array(model_metrics))

            # Columns follow the order of `clss`, matching `header`.
            final_metrics = np.concatenate(per_classifier_metrics, axis=1)

            results = pd.DataFrame(final_metrics, columns=header)
            # File name encodes: lowercase, stop_words, tfidf
            results.to_csv(f"results_{use_lowercase}_{use_stopwords}_{use_tfidf}.csv")

### Save Logs

In [153]:
# Load the per-configuration CSVs written by the benchmark grid and reduce each
# one to its mean/std summary.
#
# Only the grid outputs (results_<0|1>_<0|1>_<0|1>.csv) are matched. Other cells
# of this notebook also write files starting with "results_" but with a
# different number of columns; picking those up would make the column
# renaming below fail.
results_file_t = np.sort(glob.glob('results_[01]_[01]_[01].csv'))
if len(results_file_t) == 0:
    raise FileNotFoundError(
        "no results_[01]_[01]_[01].csv files found; run the benchmark cell first")

results_file = [pd.read_csv(file) for file in results_file_t]
# Drop the first column (the index column written by to_csv).
results_file = [file[file.columns[1:]] for file in results_file]

# Shorten the labels, e.g. "DecisionTreeClassifier ..." -> "DecisionTree ...".
header = results_file[0].columns
header = [h.replace('Classifier', "") for h in header]

clean_matrix_all = []
for idx in range(len(results_file_t)):
    results_file[idx].columns = header
    # Keep only the mean and standard deviation over the folds.
    clean_matrix = results_file[idx].describe().loc[['mean', 'std']]
    clean_matrix_all.append(clean_matrix)

In [211]:
# Labels taken from the most recently built summary table.
columns = clean_matrix.columns
index = clean_matrix.index
n_matrix = len(clean_matrix_all)

# Repeat the summary row labels once per collected summary table.
new_index = np.array(list(index.values) * n_matrix)
new_columns = columns.values

# Stack every summary table row-wise into one array.
new_vals = np.concatenate(clean_matrix_all)

In [212]:
# Underlying value arrays of every summary table, stacked row-wise.
values = [summary.values for summary in clean_matrix_all]
n_val = np.concatenate(values, axis=0)


In [219]:
# Sanity check: the stacked values, the column labels and the row labels
# should agree in size.
n_val.shape, new_columns.shape, new_index.shape

((16, 24), (24,), (16,))

In [222]:
# Persist the stacked summary. The row labels in `new_index` (the summary
# statistic names, repeated once per result file) are written as the index so
# that each CSV row states which statistic it holds; without them the rows
# would only be numbered.
pd.DataFrame(n_val, columns=new_columns, index=new_index).to_csv("all_metrics.csv")

# TPOT

In [233]:
from tpot import TPOTClassifier

In [234]:
# TPOT search configuration: a deliberately small search (3 generations with a
# population of 5), scored by balanced accuracy, seeded for repeatability,
# using all available cores and the most detailed logging level used here.
tpot = TPOTClassifier(
    generations=3,
    population_size=5,
    scoring="balanced_accuracy",
    random_state=23,
    n_jobs=-1,
    verbosity=3,
    # NOTE(review): the parameter name indicates a folder, although the value
    # looks like a file name.
    periodic_checkpoint_folder="tpot_mnst1.txt",
)


In [None]:
# Evaluate the TPOT search as the final step of the text pipeline, using the
# same leave-one-group-out protocol (store as group) as the other classifiers.
use_lowercase = True
use_stopwords = True
use_tfidf = False
stop_words = stop_words_pt if use_stopwords else None

# Build the column labels here instead of reusing the global `header`: other
# cells reassign it, and it may hold labels for a different number of columns
# than the four metrics recorded below.
metrics = ['Acuracy', 'Precision','Recall', 'Train Time']
header = np.array([type(tpot).__name__ + " " + m for m in metrics])

text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=use_lowercase, stop_words=stop_words)),
    # A `None` step is skipped by the pipeline.
    ('tfidf', TfidfTransformer() if use_tfidf else None),
    # DenseTransformer comes from `utils`; presumably it densifies the sparse
    # count matrix -- TODO confirm.
    ('tranf', DenseTransformer()),
    ('clf', tpot),
])
model_metrics = []

for train_index, test_index in logo.split(X, y, u_class):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Time only the fit call.
    start_time = time.time()
    text_clf.fit(X_train, y_train)
    train_time = time.time() - start_time

    # `np.int` was removed from NumPy; the builtin `int` is equivalent.
    y_pred = text_clf.predict(X_test).astype(int)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, y_pred, average='macro')

    acc = np.mean(y_pred == y_test)
    model_metrics.append([acc, precision, recall, train_time])

# One row per fold, one column per metric.
final_metrics = np.array(model_metrics)

results = pd.DataFrame(final_metrics, columns=header)
# File name encodes: lowercase, stop_words, tfidf
results.to_csv(f"results_tpot_{use_lowercase}_{use_stopwords}_{use_tfidf}.csv")
results.describe()

In [245]:
# Evaluate a deeper MLP with early stopping, using leave-one-group-out
# cross-validation (store as group) and a fixed pre-processing configuration.
clss = [
    MLPClassifier(hidden_layer_sizes=(1024, 512, 256, 64, 32),
                  learning_rate_init=0.001,
                  early_stopping=True),
]

acc_results, pre_results, rec_results = [], [], []
metrics = ['Acuracy', 'Precision','Recall', 'Train Time']
# Flat "<ClassifierName> <metric>" labels, grouped by classifier.
header = [cls.__class__.__name__ for cls in clss]
header = [[h + " " + m for m in metrics] for h in header]
header = np.array(header).reshape(-1)

use_lowercase = True
use_stopwords = True
use_tfidf = False
stop_words = stop_words_pt if use_stopwords else None

# One (n_folds, 4) array per classifier; joined column-wise after the loop.
# Collecting them in a list avoids comparing an ndarray with `[]`, which breaks
# as soon as `clss` holds more than one classifier.
per_classifier_metrics = []
for cls in clss:
    text_clf = Pipeline([
        ('vect', CountVectorizer(lowercase=use_lowercase, stop_words=stop_words)),
        # A `None` step is skipped by the pipeline.
        ('tfidf', TfidfTransformer() if use_tfidf else None),
        # DenseTransformer comes from `utils`; presumably it densifies the
        # sparse count matrix -- TODO confirm.
        ('tranf', DenseTransformer()),
        ('clf', cls),
    ])
    model_metrics = []

    for train_index, test_index in tqdm(logo.split(X, y, u_class)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Time only the fit call.
        start_time = time.time()
        text_clf.fit(X_train, y_train)
        train_time = time.time() - start_time

        # `np.int` was removed from NumPy; the builtin `int` is equivalent.
        y_pred = text_clf.predict(X_test).astype(int)
        precision, recall, fscore, _ = precision_recall_fscore_support(
            y_test, y_pred, average='macro')

        acc = np.mean(y_pred == y_test)
        model_metrics.append([acc, precision, recall, train_time])

    per_classifier_metrics.append(np.array(model_metrics))

final_metrics = np.concatenate(per_classifier_metrics, axis=1)

results = pd.DataFrame(final_metrics, columns=header)
# File name encodes: lowercase, stop_words, tfidf
results.to_csv(f"results_mlp_a_{use_lowercase}_{use_stopwords}_{use_tfidf}.csv")
results.describe()

Unnamed: 0,MLPClassifier Acuracy,MLPClassifier Precision,MLPClassifier Recall,MLPClassifier Train Time
count,10.0,10.0,10.0,10.0
mean,0.893194,0.869116,0.879217,15.034539
std,0.153287,0.169704,0.160797,1.54244
min,0.547554,0.612319,0.515196,12.751517
25%,0.846118,0.708454,0.812147,14.042177
50%,0.978701,0.957593,0.959649,15.430089
75%,0.994842,0.98868,0.996318,16.087667
max,1.0,1.0,1.0,17.443422


In [22]:
from sklearn.model_selection import GridSearchCV

# Single-step pipeline so the search can address the estimator as "classifier".
pipe = Pipeline([('classifier' , LogisticRegression())])

# Search space: L2-penalised logistic regression over 20 log-spaced values of C
# (1e-4 .. 1e4) and two solvers.
param_grid = [
    dict(
        classifier=[LogisticRegression()],
        classifier__penalty=['l2'],
        classifier__C=np.logspace(-4, 4, 20),
        classifier__solver=['liblinear', 'lbfgs'],
    ),
]

# Exhaustive search with 5-fold cross-validation on all available cores.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=True)


In [23]:
# Evaluate the grid-searched logistic regression (`clf`) as the final pipeline
# step, using leave-one-group-out cross-validation with the store as the group.
# The hyper-parameter search is re-run inside every outer fold.
clss = [clf]

acc_results, pre_results, rec_results = [], [], []
metrics = ['Acuracy', 'Precision','Recall', 'Train Time']
# Flat "<ClassifierName> <metric>" labels, grouped by classifier.
header = [cls.__class__.__name__ for cls in clss]
header = [[h + " " + m for m in metrics] for h in header]
header = np.array(header).reshape(-1)

use_lowercase = True
use_stopwords = True
use_tfidf = False
stop_words = stop_words_pt if use_stopwords else None

# One (n_folds, 4) array per classifier; joined column-wise after the loop.
# Collecting them in a list avoids comparing an ndarray with `[]`, which breaks
# as soon as `clss` holds more than one classifier.
per_classifier_metrics = []
for cls in clss:
    text_clf = Pipeline([
        ('vect', CountVectorizer(lowercase=use_lowercase, stop_words=stop_words)),
        # A `None` step is skipped by the pipeline.
        ('tfidf', TfidfTransformer() if use_tfidf else None),
        # DenseTransformer comes from `utils`; presumably it densifies the
        # sparse count matrix -- TODO confirm.
        ('tranf', DenseTransformer()),
        ('clf', cls),
    ])
    model_metrics = []

    for train_index, test_index in logo.split(X, y, u_class):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Time the fit call (this includes the inner grid search).
        start_time = time.time()
        text_clf.fit(X_train, y_train)
        train_time = time.time() - start_time

        # `np.int` was removed from NumPy; the builtin `int` is equivalent.
        y_pred = text_clf.predict(X_test).astype(int)
        precision, recall, fscore, _ = precision_recall_fscore_support(
            y_test, y_pred, average='macro')

        acc = np.mean(y_pred == y_test)
        model_metrics.append([acc, precision, recall, train_time])

    per_classifier_metrics.append(np.array(model_metrics))

final_metrics = np.concatenate(per_classifier_metrics, axis=1)

results = pd.DataFrame(final_metrics, columns=header)
# File name encodes: lowercase, stop_words, tfidf
# NOTE(review): the "results_mlp_" prefix is kept as-is although this cell
# evaluates the grid-searched logistic regression, not an MLP.
results.to_csv(f"results_mlp_{use_lowercase}_{use_stopwords}_{use_tfidf}.csv")
results.describe()

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  42 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 192 tasks      | elapsed:   15.6s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   16.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   14.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    4.0s
[Parallel(n_jobs=-1)]: Done 193 out of 200 | elapsed:   14.3s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   15.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.9s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   11.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   14.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   11.9s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   14.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.1s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   12.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done 144 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:    8.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=-1)]: Done  76 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:   14.5s finished


Unnamed: 0,GridSearchCV Acuracy,GridSearchCV Precision,GridSearchCV Recall,GridSearchCV Train Time
count,10.0,10.0,10.0,10.0
mean,0.908806,0.899356,0.892291,13.408756
std,0.158033,0.156045,0.16576,2.390862
min,0.532609,0.60221,0.515196,8.241345
25%,0.922106,0.924313,0.871382,12.071824
50%,0.986713,0.968815,0.973473,14.121754
75%,0.995781,0.991011,0.997007,14.50175
max,1.0,1.0,1.0,16.709162


In [46]:
# Re-evaluate the estimator selected by the grid search, now with fixed
# hyper-parameters, using leave-one-group-out cross-validation.
# `clf.best_estimator_` reflects whichever fit of `clf` ran last.
# This cell relies on `metrics`, `use_lowercase`, `stop_words` and `use_tfidf`
# being set by an earlier cell.
clss = [clf.best_estimator_]
# best_estimator_ is itself a Pipeline, hence the "Pipeline ..." column labels.
header = [cls.__class__.__name__ for cls in clss]
header = [[h + " " + m for m in metrics] for h in header]
header = np.array(header).reshape(-1)

text_clf = Pipeline([
    ('vect', CountVectorizer(lowercase=use_lowercase, stop_words=stop_words)),
    # A `None` step is skipped by the pipeline.
    ('tfidf', TfidfTransformer() if use_tfidf else None),
    # DenseTransformer comes from `utils`; presumably it densifies the sparse
    # count matrix -- TODO confirm.
    ('tranf', DenseTransformer()),
    ('clf', clss[0]),
])
model_metrics = []
for train_index, test_index in logo.split(X, y, u_class):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    # Time only the fit call.
    start_time = time.time()
    text_clf.fit(X_train, y_train)
    train_time = time.time() - start_time

    # `np.int` was removed from NumPy; the builtin `int` is equivalent.
    y_pred = text_clf.predict(X_test).astype(int)
    precision, recall, fscore, _ = precision_recall_fscore_support(
        y_test, y_pred, average='macro')

    acc = np.mean(y_pred == y_test)
    model_metrics.append([acc, precision, recall, train_time])

# One row per fold, one column per metric.
results = pd.DataFrame(model_metrics, columns=header)
results.describe()

Unnamed: 0,Pipeline Acuracy,Pipeline Precision,Pipeline Recall,Pipeline Train Time
count,10.0,10.0,10.0,10.0
mean,0.89964,0.895743,0.868074,0.063229
std,0.156885,0.154544,0.173888,0.012108
min,0.532609,0.60221,0.515196,0.048243
25%,0.880486,0.923609,0.758142,0.053342
50%,0.977971,0.956534,0.97806,0.060979
75%,0.995781,0.991011,0.997173,0.072081
max,1.0,1.0,1.0,0.083267


In [47]:
# Serialise the fitted text-classification pipeline to disk.
# NOTE(review): at this point `text_clf` was last fitted inside the
# cross-validation loop, i.e. on the training split of the final fold rather
# than on the full dataset -- confirm this is the model that should be saved.
filename = 'page_clf3.sav'
# The context manager guarantees the file is flushed and closed.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

In [None]:
from sklearn.model_selection import GridSearchCV

# Single-step pipeline so the search can address the estimator as "classifier".
pipe = Pipeline([('classifier' , LogisticRegression())])

# Search space: L1- and L2-penalised logistic regression over 20 log-spaced
# values of C (1e-4 .. 1e4), using the liblinear solver only.
param_grid = [
    dict(
        classifier=[LogisticRegression()],
        classifier__penalty=['l1', 'l2'],
        classifier__C=np.logspace(-4, 4, 20),
        classifier__solver=['liblinear'],
    ),
]

# Exhaustive search with 5-fold cross-validation on all available cores.
clf = GridSearchCV(estimator=pipe, param_grid=param_grid, cv=5, n_jobs=-1, verbose=True)
