In [None]:
import numpy as np
import pandas as pd
import nltk # Pacote para remoção de stopwords em português
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.dummy import DummyClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.calibration import CalibratedClassifierCV
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.pipeline import Pipeline

In [None]:
# Read the training and test sets from CSV files in the working directory.
train_df, test_df = map(pd.read_csv, ("train.csv", "test.csv"))

In [None]:
# Fetch the NLTK 'stopwords' corpus so that it can be read on the next line.
nltk.download('stopwords')
# Portuguese stop-word list taken from that corpus.
stopwords = nltk.corpus.stopwords.words('portuguese')

In [None]:
# TF-IDF settings: unigrams and bigrams, document-frequency bounds of
# 0.001 (lower) and 0.75 (upper), and Portuguese stop words removed.
tfidf_options = dict(
    ngram_range=(1, 2),
    min_df=0.001,
    max_df=0.75,
    stop_words=nltk.corpus.stopwords.words('portuguese'),
)
vectorizer = TfidfVectorizer(**tfidf_options)


In [None]:
# Target labels come from the 'situacao' column.
y = train_df['situacao']
# Fit the TF-IDF vectorizer on the 'fala' column and encode it as the
# feature matrix.
X = vectorizer.fit_transform(train_df['fala'])

In [None]:
# Sanity check: dimensions of the feature matrix and of the label vector.
feature_shape, label_shape = X.shape, y.shape
print(feature_shape, label_shape)

In [None]:
# Baseline: mean micro-averaged F1 over 5 cross-validation folds for a
# classifier that always predicts the most frequent class.
most_frequent = DummyClassifier(strategy='most_frequent')
baseline_scores = cross_val_score(most_frequent, X, y=y, cv=5, n_jobs=-1,
                                  scoring="f1_micro")
print(baseline_scores.mean())

# Classifier to be tuned: a logistic regression wrapped in 5-fold
# probability calibration.
# NOTE(review): newer scikit-learn releases rename `base_estimator` to
# `estimator`; the grid keys and the `.base_estimator` attribute access in
# the following cells depend on this name, so confirm the installed version
# before changing it.
logistic = LogisticRegression(n_jobs=-1, solver='lbfgs')
base_clf = CalibratedClassifierCV(cv=5, base_estimator=logistic)

In [None]:
# Hyperparameter grid for the logistic regression wrapped by `base_clf`:
# the regularisation parameter C and the class-weighting scheme.
# `None` (all classes weighted equally) replaces the legacy 'auto' value,
# which current scikit-learn rejects as a class_weight, so every candidate
# using it failed to fit and was scored as NaN.
param_grid = {
    'base_estimator__C': [50, 20, 10, 1.0, 0.5, 0.1, 0.05, 0.01],
    'base_estimator__class_weight': ['balanced', None],
}
# Exhaustive 5-fold search over the grid, scored by micro-averaged F1.
search = GridSearchCV(base_clf, param_grid, cv=5, scoring='f1_micro')
search.fit(X, y)

In [None]:
# Take the logistic regression carried by the best estimator found in the
# search and estimate its performance with 5-fold cross-validation
# (micro-averaged F1). Note that this scores the inner estimator, not the
# calibrated wrapper around it.
tuned_calibrated = search.best_estimator_
clf = tuned_calibrated.base_estimator
tuned_scores = cross_val_score(clf, X, y=y, cv=5, n_jobs=-1,
                               scoring="f1_micro")
print(tuned_scores.mean())

In [None]:
# Two-step pipeline: a feature-reduction slot (a pass-through placeholder
# until a selector is supplied through a parameter grid) followed by the
# tuned classifier.
pipeline_steps = [
    ('reduce_dim', 'passthrough'),
    ('classifier', clf),
]
pipe = Pipeline(steps=pipeline_steps)

In [None]:
# Candidate numbers of features to keep with chi-squared univariate selection.
N_FEATURES = [1800, 1500, 1000, 500, 300]
# Grid for the pipeline: fill the 'reduce_dim' slot with SelectKBest(chi2)
# and try each candidate value of k.
param_grid = [
    {
        'reduce_dim': [SelectKBest(chi2)],
        'reduce_dim__k': N_FEATURES,
    },
]

In [None]:
# Search over the feature-selection grid with 5-fold cross-validation,
# scoring each candidate pipeline by micro-averaged F1.
grid = GridSearchCV(
    estimator=pipe,
    param_grid=param_grid,
    scoring='f1_micro',
    cv=5,
    n_jobs=1,
)
grid.fit(X, y)

In [None]:
# Save the best selector.
# `grid.best_params_['reduce_dim']` is only the unfitted SelectKBest template
# taken from `param_grid` (the chosen k lives in a separate 'reduce_dim__k'
# entry), so calling `transform` on it raised an error. The fitted selector,
# configured with the best k, is the 'reduce_dim' step of the pipeline that
# GridSearchCV refits on the full data.
selector = grid.best_estimator_.named_steps['reduce_dim']
# Reduce the feature matrix to the selected columns.
X_sel = selector.transform(X)