In [1]:
#importing dependencies
import string
import time

import numpy as np
import pandas as pd
from nltk.corpus import stopwords

try:
    import joblib
except ImportError:
    # Older scikit-learn releases vendor joblib under sklearn.externals
    from sklearn.externals import joblib

In [2]:
# Load the cleaned dataset from a local pickle file.
# NOTE(review): unpickling can execute arbitrary code; only load files from a trusted source.
data = pd.read_pickle('./data/data_clean/Data.pik')

In [3]:
# X: the text column that is later tokenised by text_process.
# y: the label column, passed as the target to train_test_split in the modelling cells.
# NOTE(review): X is not referenced again in the visible cells (they use data['Corpus'] directly).
X = data['Corpus']
y = data['Label']

In [4]:
# Stopword sets already loaded from NLTK, keyed by language name.
_STOPWORD_CACHE = {}


def _stopword_set(language='portuguese'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it only once."""
    if language not in _STOPWORD_CACHE:
        _STOPWORD_CACHE[language] = frozenset(stopwords.words(language))
    return _STOPWORD_CACHE[language]


def text_process(mess, stop_words=None):
    """
    Tokenise a string of text.

    Performs the following:
    1. Remove every character found in string.punctuation
    2. Split on whitespace and lower-case each word
    3. Drop the words that are stopwords

    Parameters
    ----------
    mess : str
        Raw text of one document.
    stop_words : collection of str, optional
        Lower-case words to discard. When omitted, the NLTK Portuguese
        stopword list is used (loaded once and cached).

    Returns
    -------
    list of str
        The remaining lower-cased words, in their original order.
    """
    if stop_words is None:
        # The stopword list does not depend on the word being tested, so it is
        # fetched once per call from the cache instead of once per word.
        stop_words = _stopword_set('portuguese')

    # Drop punctuation characters, then rebuild the string
    nopunc = ''.join(char for char in mess if char not in string.punctuation)

    # Lower-case each word once and keep those that are not stopwords
    lowered = (word.lower() for word in nopunc.split())
    return [word for word in lowered if word not in stop_words]

In [5]:
# Sanity check: run text_process on the first five documents and display the resulting token lists
data['Corpus'].head(5).apply(text_process)

0    [tivit, terceirização, processos, serviços, te...
1    [laudo, avaliação, conforme, instrução, cvm, 3...
2    [rec, 844, securitizadora, créditos, imobiliár...
3    [página, é, parte, integrante, ata, assembleia...
4    [1, tec, toy, sa, companhia, aberta, cnpjmf, n...
Name: Corpus, dtype: object

In [82]:
# Tokenise every document once so the result can be reused later.
# text_process runs on each document of the corpus, so this is the expensive
# step of the cell (the original author reports about 1 hour).
corpus_tidy = data['Corpus'].apply(text_process)

# Persist the tokenised corpus for future use.
# joblib comes from the import cell at the top: sklearn.externals.joblib
# is no longer available in current scikit-learn releases.
joblib.dump(corpus_tidy, './variables/vectorizer/corpus_tidy.pik')

# Kept for compatibility with any code that expects the one-column DataFrame
CorpusTidy = pd.DataFrame(corpus_tidy)

# corpus_tidy is a Series built from data['Corpus'], so it shares data's index
# and can be assigned as a column directly.
data['Corpus_tidy'] = corpus_tidy
data.to_pickle('./data/data_clean/Data_Tidy.pik')

['./variables/vectorizer/corpus_tidy.pik']

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Start the clock; the elapsed time is reported in the next cell
initime = time.time()

# Learn the vocabulary, using text_process as the analyzer for each document.
# Might take awhile...
bow_transformer = CountVectorizer(analyzer=text_process)
bow_transformer.fit(data['Corpus'])

# Stop the clock
fimtime = time.time()

In [7]:
# Print the number of distinct tokens in the fitted vocabulary
print(len(bow_transformer.vocabulary_))

# Print the time spent fitting the vectorizer, converted from seconds to minutes
# (initime / fimtime were recorded around the fit in the previous cell)
totaltime = (fimtime-initime) / 60
print ("Total Time: ", totaltime, "minutes" ) 

407680
Total Time:  81.23165499766668 minutes


In [8]:
# Start the clock
initime = time.time()

# Turn every document into a bag-of-words count vector using the fitted vocabulary.
# bow_transformer was built with text_process as its analyzer, so each document
# is tokenised again here, on top of the pass already made while fitting.
# NOTE(review): CountVectorizer.fit_transform would presumably do both in a single pass — confirm.
corpus_bow = bow_transformer.transform(data['Corpus'])

# Print the time spent building the bag-of-words matrix, in minutes
fimtime = time.time()
totaltime = (fimtime-initime) / 60
print ("Total Time: ", totaltime, "minutes" ) 

Total Time:  89.77143399715423 minutes


In [9]:
# Report the dimensions of the bag-of-words matrix and how many of its entries are stored as non-zero
print('Shape of Sparse Matrix: ', corpus_bow.shape)
print('Amount of Non-Zero occurences: ', corpus_bow.nnz)

Shape of Sparse Matrix:  (6837, 407680)
Amount of Non-Zero occurences:  3753943


In [57]:
# Persist the fitted vectorizer and the bag-of-words matrix for future use.
# joblib comes from the import cell at the top: sklearn.externals.joblib
# is no longer available in current scikit-learn releases.
# NOTE(review): bow_transformer holds a reference to text_process, so that function
# presumably has to be defined before the pickle can be loaded again — confirm.
joblib.dump(bow_transformer, './variables/vectorizer/bow_transformer.pik')
joblib.dump(corpus_bow, './variables/vectorizer/corpus_bow.pik')

['./variables/vectorizer/corpus_bow.pik']

# TF-IDF

In [11]:
from sklearn.feature_extraction.text import TfidfTransformer

# Start the clock
initime = time.time()

# Fit the TF-IDF weighting on the bag-of-words counts, then re-weight the same matrix
tfidf_transformer = TfidfTransformer()
tfidf_transformer.fit(corpus_bow)
corpus_tfidf = tfidf_transformer.transform(corpus_bow)

print(corpus_tfidf.shape)

# Print the time spent building the TF-IDF corpus, in minutes
fimtime = time.time()
totaltime = (fimtime - initime) / 60
print("Total Time: ", totaltime, "minutes")


(6837, 407680)
Total Time:  0.012891169389088948 minutes


# Creating a BOW Model

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the bag-of-words rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    corpus_bow, y, test_size=0.2, random_state=101
)

In [33]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the bag-of-words training split
bow_model = MultinomialNB()
bow_model.fit(X_train, y_train)

In [34]:
# Predict labels for the held-out test split with the fitted model
y_pred = bow_model.predict(X_test)

In [35]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision / recall / F1, followed by the confusion matrix, for the test split
print (classification_report(y_test, y_pred))
print (confusion_matrix(y_test, y_pred))

                                  precision    recall  f1-score   support

        Cancelamento de Registro       0.46      0.11      0.17        56
              Controle Acionario       0.40      0.07      0.12        27
          Direito de Preferencia       0.58      0.11      0.18        65
                       Dividendo       0.90      0.96      0.93       783
                  Oferta Publica       0.72      0.77      0.75       190
Pedido de Recuperação Judicial       0.95      0.51      0.67        72
                    Subscrição       0.59      0.89      0.71       175

                     avg / total       0.80      0.81      0.78      1368

[[  6   0   0  17  22   0  11]
 [  0   2   1   9   9   2   4]
 [  0   0   7  12   4   0  42]
 [  3   2   1 749   3   0  25]
 [  4   0   2  17 147   0  20]
 [  0   0   1  14  12  37   8]
 [  0   1   0  12   6   0 156]]


In [39]:
# Persist the trained bag-of-words classifier.
# joblib comes from the import cell at the top: sklearn.externals.joblib
# is no longer available in current scikit-learn releases.
joblib.dump(bow_model, './model/bow_model_v1.pkl')

['./model/bow_model_v1.pkl']

# Creating a TF-IDF Model

In [20]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the TF-IDF rows for testing, with the same seed as the bag-of-words split.
# NOTE: this rebinds X_train / X_test / y_train / y_test, which the bag-of-words cells also use.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_tfidf, y, test_size=0.2, random_state=101
)

In [21]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default settings, trained on the TF-IDF training split.
# NOTE: the name bow_model is reused here although the features in scope come from corpus_tfidf.
bow_model = MultinomialNB()
bow_model.fit(X_train, y_train)

In [22]:
# Predict labels for the TF-IDF test split (bow_model was refitted on TF-IDF features in the previous cell)
y_pred = bow_model.predict(X_test)

In [23]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision / recall / F1, followed by the confusion matrix, for the TF-IDF test split
print (classification_report(y_test, y_pred))
print (confusion_matrix(y_test, y_pred))

                                  precision    recall  f1-score   support

        Cancelamento de Registro       0.00      0.00      0.00        56
              Controle Acionario       0.00      0.00      0.00        27
          Direito de Preferencia       0.00      0.00      0.00        65
                       Dividendo       0.58      1.00      0.74       783
                  Oferta Publica       0.93      0.07      0.13       190
Pedido de Recuperação Judicial       0.00      0.00      0.00        72
                    Subscrição       0.91      0.06      0.11       175

                     avg / total       0.58      0.59      0.45      1368

[[  0   0   0  55   1   0   0]
 [  0   0   0  27   0   0   0]
 [  0   0   0  64   0   0   1]
 [  0   0   0 783   0   0   0]
 [  0   0   0 177  13   0   0]
 [  0   0   0  72   0   0   0]
 [  0   0   0 165   0   0  10]]


  'precision', 'predicted', average, warn_for)


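With TF-IDF features, the default MultinomialNB assigns almost every test document to "Dividendo" (see the confusion matrix above), so TF-IDF does not work for this model as configured.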

# Grid Search with Cross-Validation for the BOW Model

In [58]:
from sklearn.model_selection import GridSearchCV, train_test_split

# Rebuild the bag-of-words split explicitly. The TF-IDF section rebinds
# X_train / X_test to TF-IDF features, so on a top-to-bottom run the grid search
# that follows would otherwise be fitted on TF-IDF data instead of the BOW matrix.
# test_size and random_state match the earlier bag-of-words split.
X_train, X_test, y_train, y_test = train_test_split(
    corpus_bow, y, test_size=0.2, random_state=101
)

# Smoothing values (alpha) to try for MultinomialNB; note that 1e-1 is not part of the grid
parameters = {'alpha' : (1, 1e-2,1e-3,1e-4, 1e-5, 1e-6, 1e-7, 1e-8, 1e-9, 1e-10)}

In [59]:
# 10-fold cross-validated search over `parameters`; n_jobs=-1 asks for all available cores
gs_bow_model = GridSearchCV(
    estimator=MultinomialNB(),
    param_grid=parameters,
    n_jobs=-1,
    cv=10,
)

In [60]:
# Run the cross-validated grid search on the training split; the fitted search object is displayed
gs_bow_model.fit(X_train, y_train)

GridSearchCV(cv=10, error_score='raise',
       estimator=MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'alpha': (1, 0.01, 0.001, 0.0001, 1e-05, 1e-06, 1e-07, 1e-08, 1e-09, 1e-10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [61]:
# Best cross-validation score found by the grid search
gs_bow_model.best_score_

0.84073870908758452

In [62]:
# Show the winning value of every searched hyper-parameter, in alphabetical order
for param_name in sorted(parameters):
    winning_value = gs_bow_model.best_params_[param_name]
    print("%s: %r" % (param_name, winning_value))

alpha: 1e-06


In [63]:
# Predict labels for the test split with the fitted grid-search object
y_pred = gs_bow_model.predict(X_test)

In [64]:
from sklearn.metrics import classification_report, confusion_matrix
# Per-class precision / recall / F1, followed by the confusion matrix, for the grid-search predictions
print (classification_report(y_test, y_pred))
print (confusion_matrix(y_test, y_pred))

                                  precision    recall  f1-score   support

        Cancelamento de Registro       0.36      0.50      0.42        56
              Controle Acionario       0.58      0.26      0.36        27
          Direito de Preferencia       0.61      0.42      0.50        65
                       Dividendo       0.94      0.95      0.95       783
                  Oferta Publica       0.79      0.71      0.75       190
Pedido de Recuperação Judicial       0.97      0.79      0.87        72
                    Subscrição       0.71      0.85      0.77       175

                     avg / total       0.84      0.84      0.84      1368

[[ 28   1   1   8  14   0   4]
 [  2   7   1   6   7   1   3]
 [  0   0  27   7   3   1  27]
 [ 19   2   1 747   4   0  10]
 [ 26   1   4  12 135   0  12]
 [  1   0   1   7   3  57   3]
 [  2   1   9  10   5   0 148]]
