In [50]:
import numpy as np
import pandas as pd
import nltk # Pacote para remoção de stopwords em português
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

In [51]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

In [52]:
nltk.download('stopwords')
stopwords = nltk.corpus.stopwords.words('portuguese')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\toled\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [53]:
vectorizer = TfidfVectorizer(ngram_range=(1,2),
                             min_df=0.001,
                             max_df=0.75,
                             stop_words=nltk.corpus.stopwords.words('portuguese'))


In [54]:
X = vectorizer.fit_transform(train_df['fala'])
y = train_df['situacao']

In [55]:
print(X.shape, y.shape)

(2740, 7647) (2740,)


In [56]:
# get baseline performance
most_frequent = DummyClassifier(strategy='most_frequent')
print(cross_val_score(most_frequent , X, y=y, cv=5, n_jobs=-1,
                      scoring="f1_micro").mean())
# fine-tune classifier
base_clf = CalibratedClassifierCV(cv=5,
                                  base_estimator=LogisticRegression(n_jobs=-1,
                                                                    solver='lbfgs'))

0.5416058394160583


In [57]:
param_grid = {'base_estimator__C': [50, 20, 10, 1.0, 0.5, 0.1,
                                    0.05, 0.01],
                                    'base_estimator__class_weight': ['balanced', 'auto']}
search = GridSearchCV(base_clf , param_grid , cv=5, scoring='f1_micro')
search.fit(X, y)

40 fits failed out of a total of 80.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\toled\AppData\Roaming\Python\Python311\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\toled\AppData\Roaming\Python\Python311\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\toled\AppData\Roaming\Python\Python311\site-packages\sklearn\calibration.py", line 428, in fit
    self.calibrated_classifiers_ = parallel(
                              

In [58]:
# use best classifier to get performance estimate
clf = search.best_estimator_.base_estimator
print(cross_val_score(clf, X, y=y, cv=5, n_jobs=-1, scoring="f1_micro").mean())

0.6240875912408759


In [59]:
# set up the sequence
pipe = Pipeline([('reduce_dim', 'passthrough'),('classifier', clf)])

In [60]:
# specify selection range
N_FEATURES = [1800, 1500, 1000, 500, 300]
param_grid = [{'reduce_dim': [SelectKBest(chi2)],'reduce_dim__k': N_FEATURES},]

In [None]:
# fit the model to different feature sets
grid = GridSearchCV(pipe, n_jobs=1, param_grid=param_grid , cv=5, scoring='f1_micro')
grid.fit(X, y)

In [66]:
# save the best selector
selector = grid.best_params_['reduce_dim']
X_sel = selector.transform(X) # Tá dando erro - Descobrir pq.

NotFittedError: This SelectKBest instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.