In [None]:
import pickle
from time import time
import pandas as pd
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier, BaggingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC
from sklearn.neural_network import MLPClassifier

## Load training subset

In [None]:
# Read the pickled training subset from disk and print its shape as a quick
# sanity check (presumably a DataFrame; later cells index it by column name).
train_pickle_path = './text/newsgroups/train.pkl'
df_train = pd.read_pickle(train_pickle_path)
print(df_train.shape)

## Load pre-built feature transformer

In [None]:
# Tag selecting which pre-built transformer file to load; the same tag is
# embedded in the file names of the models pickled at the end of the notebook.
VOCABULARY_SIZE = 'trim'

# Bind the name up front so it exists (as None) even if loading fails below.
transformer = None
transformer_path = './models/tfidf_transformer_{}.pkl'.format(VOCABULARY_SIZE)
with open(transformer_path, 'rb') as transformer_file:
    # NOTE(review): pickle.load can execute arbitrary code; only load
    # transformer files produced by a trusted source.
    transformer = pickle.load(transformer_file)

In [None]:
# Show the unpickled transformer's repr so its configuration can be checked
# before it is used to build features.
transformer

## Create features from training subset

In [None]:
# Labels: the category id column, taken as-is.
y_train = df_train['categoryid']

# Each row of 'tokens' is joined into one space-separated string so the
# pre-built transformer can consume it as a document.
corpus_train = df_train['tokens'].map(' '.join)

# .toarray() densifies the transformer output (presumably a scipy sparse
# matrix - confirm); this can be memory-hungry for large vocabularies.
X_train = transformer.transform(corpus_train).toarray()

In [None]:
def select_random_forest_cls(random_state, cv=2, n_jobs=1, verbose=1):
    """Build an unfitted grid search over random-forest hyper-parameters.

    The grid spans 3 x 2 x 2 x 2 = 24 candidate settings: number of trees,
    depth limit, feature-subsampling rule and split criterion.
    ``min_samples_split``, ``min_samples_leaf`` and ``bootstrap`` are not
    part of the grid, so the estimator's own defaults apply to them.

    Parameters
    ----------
    random_state
        Seed forwarded to every candidate forest through the grid.
    cv
        Forwarded to ``GridSearchCV(cv=...)``. Defaults to 2.
    n_jobs
        Forwarded to ``GridSearchCV(n_jobs=...)``. Defaults to 1.
    verbose
        Forwarded to ``GridSearchCV(verbose=...)``. Defaults to 1.

    Returns
    -------
    GridSearchCV
        The configured, not-yet-fitted search; call ``fit(X, y)`` to run it.
    """
    param_grid = {
        'n_estimators': [100, 300, 600],
        'max_depth': [6, None],
        'max_features': ['log2', 'sqrt'],
        'criterion': ['gini', 'entropy'],
        # Single-value entry: fixes the seed for every candidate rather than
        # searching over it.
        'random_state': [random_state],
    }
    return GridSearchCV(
        RandomForestClassifier(),
        param_grid,
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )

In [None]:
def select_mlp_cls(random_state, cv=2, n_jobs=1, verbose=1):
    """Build an unfitted grid search over MLP classifier hyper-parameters.

    Only the activation function and the solver are actually searched
    (2 x 2 = 4 candidates); every other grid entry holds a single value and
    therefore just fixes that setting for all candidates.

    Parameters
    ----------
    random_state
        Seed forwarded to every candidate network through the grid.
    cv
        Forwarded to ``GridSearchCV(cv=...)``. Defaults to 2.
    n_jobs
        Forwarded to ``GridSearchCV(n_jobs=...)``. Defaults to 1.
    verbose
        Forwarded to ``GridSearchCV(verbose=...)`` (search-level logging;
        the per-network ``verbose`` flag is fixed to False in the grid).
        Defaults to 1.

    Returns
    -------
    GridSearchCV
        The configured, not-yet-fitted search; call ``fit(X, y)`` to run it.
    """
    param_grid = {
        # Searched settings.
        'activation': ['tanh', 'relu'],
        'solver': ['sgd', 'adam'],
        # Fixed settings (single-value lists).
        'hidden_layer_sizes': [(256, 256)],
        'max_iter': [1000],
        'alpha': [1e-4],
        'verbose': [False],
        'tol': [1e-4],
        'learning_rate_init': [.1],
        'random_state': [random_state],
    }
    return GridSearchCV(
        MLPClassifier(),
        param_grid,
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )

In [None]:
# One shared seed so every stochastic estimator below is reproducible.
random_state = 42

# Registry of estimators to train, keyed by the name used in the saved file.
# Insertion order is the training order.
models = {}
models['GaussianNB'] = GaussianNB()
models['AdaBoost'] = AdaBoostClassifier(random_state=random_state)
# Grid search wrapper, not a bare forest: fitting it runs the whole search.
models['RandomForestClassifier'] = select_random_forest_cls(random_state=random_state)
models['Bagging'] = BaggingClassifier(random_state=random_state)
# Grid search wrapper as well.
models['MLPClassifier'] = select_mlp_cls(random_state=random_state)
models['LinearSVC'] = LinearSVC(random_state=random_state, tol=1e-4, max_iter=5000)

In [None]:
# Fit every registered model on the training features, time the fit, and
# pickle the fitted object under ./models/<name>_<VOCABULARY_SIZE>.pkl.
for model_name in models:
    model = models[model_name]

    # Wall-clock timing of the fit only; serialisation is not included.
    fit_started = time()
    model.fit(X_train, y_train)
    duration = time() - fit_started

    file_name = './models/{}_{}.pkl'.format(model_name, str(VOCABULARY_SIZE))
    report = '{} - {:.2f} secs'.format(file_name, duration)
    with open(file_name, 'wb') as model_file:
        pickle.dump(model, model_file, protocol=pickle.HIGHEST_PROTOCOL)
        print(report)