In [1]:
import numpy as np
import pandas as pd
import nltk # Pacote para remoção de stopwords em português
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [3]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bruno.g.toledo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [4]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             min_df=0.001,
                             max_df=0.75,
                             stop_words=nltk.corpus.stopwords.words('portuguese'))


In [5]:
X = vectorizer.fit_transform(train_df['fala'])
y = train_df['situacao']

In [10]:
print(X.shape, y.shape)

(2740, 7647) (2740,)


In [11]:
# get baseline performance
most_frequent = DummyClassifier(strategy='most_frequent')
print(cross_val_score(most_frequent , X, y=y, cv=5, n_jobs=-1,
                      scoring="f1_micro").mean())
# fine-tune classifier
base_clf = CalibratedClassifierCV(cv=5,
                                  base_estimator=LogisticRegression(n_jobs=-1,
                                                                    solver='lbfgs'))

0.5416058394160583


In [12]:
param_grid = {'base_estimator__C': [50, 20, 10, 1.0, 0.5, 0.1,
                                    0.05, 0.01],
                                    'base_estimator__class_weight': ['balanced', 'auto']}
search = GridSearchCV(base_clf , param_grid , cv=5, scoring='f1_micro')
search.fit(X, y)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\bruno.g.toledo\AppData\Local\anaconda3_2\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\bruno.g.toledo\AppData\Local\anaconda3_2\Lib\site-packages\sklearn\calibration.py", line 395, in fit
    self.calibrated_classifiers_ = parallel(
                                   ^^^^^^^^^
  File "c:\Users\bruno.g.toledo\AppData\Local\anaconda3_2\Lib\site-packages\sklearn\utils\parallel.py", line 63, in __call__
    return super().__call__(iterable_with_config)
        

In [13]:
# use best classifier to get performance estimate
clf = search.best_estimator_.base_estimator
print(cross_val_score(clf, X, y=y, cv=5, n_jobs=-1, scoring="f1_micro").mean())

0.6240875912408759


In [14]:
# set up the sequence
pipe = Pipeline([('reduce_dim', 'passthrough'),('classifier', clf)])

In [15]:
# specify selection range
N_FEATURES = [1800, 1500, 1000, 500, 300]
param_grid = [{'reduce_dim': [SelectKBest(chi2)],'reduce_dim__k': N_FEATURES},]

In [16]:
# fit the model to different feature sets
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid , cv=5, scoring='f1_micro')
grid.fit(X, y)

In [20]:
# save the best selector
selector = grid.best_params_['reduce_dim']
#X_sel = selector.transform(X) # Tá dando erro - Descobrir pq.
X_sel = selector.fit_transform(X,y) # Tá dando erro - Descobrir pq.



In [21]:
clf.fit(X_sel, y)

In [22]:
cv_reg = cross_val_score(clf, X_sel, y=y, cv=5, n_jobs=-1,
                          scoring="f1_micro")
print("5-CV on train: {}".format(cv_reg.mean()))

5-CV on train: 0.7175182481751825


In [24]:
 Z = vectorizer.transform(test_df['fala'])


In [25]:
Z_sel = selector.transform(Z)

In [26]:
predictions = clf.predict(Z_sel)

In [27]:
print(predictions)

[0 1 1 ... 0 0 1]
