# Random Search

A simple implementation of random hyper-parameter search for an MLP text classifier, with a progress bar displayed for convenience.

## Importing modules and creating Trainer class

In [1]:
#vectorizers
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer, HashingVectorizer

# classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.pipeline import Pipeline
from sklearn import metrics
# from sklearn.model_selection import RandomizedSearchCV
from dask_ml.model_selection import RandomizedSearchCV
from dask.diagnostics import ProgressBar

from collections import defaultdict
import logging
import argparse
import random
import codecs
import sys
import csv
import pandas as pd

In [2]:
# Trainer class by Mathias Mueller

class Trainer(object):
    """
    Reads labelled training data from a CSV source and trains a classifier.

    The CSV input must have a header row containing a ``Text`` and a
    ``Label`` column. Call ``train()`` to fit the pipeline and ``save()``
    afterwards to persist it.
    """

    def __init__(self, model="model.pkl", data=None, verbose=False, seed=None):
        """
        :param model: path the fitted pipeline is written to by ``save()``.
        :param data: path to the UTF-8 encoded training CSV; when falsy the
            data is read from STDIN instead.
        :param verbose: stored on the instance; no method of this class
            consults it.
        :param seed: optional seed for shuffling the training examples. With
            the default ``None`` the global ``random`` state is used.
        """
        self._model = model
        self._data = data
        self._verbose = verbose
        self._seed = seed

        # outcomes
        self.classes = []
        self.num_classes = 0
        self.train_X = None
        self.train_y = None
        self.vectorizer = None
        self.classifier = None
        self.pipeline = None

    def train(self):
        """
        Preprocesses data, builds the pipeline and fits it.

        The fitted pipeline is NOT written to disk here; call ``save()``
        for that.
        """
        self._preprocess()
        self._build_pipeline()
        self._fit()

    @staticmethod
    def _read_examples(stream):
        """
        Parses CSV rows from ``stream`` and groups the texts by label.

        :param stream: iterable of text lines with ``Text``/``Label`` headers.
        :return: ``defaultdict`` mapping each label to its list of texts.
        """
        examples = defaultdict(list)
        reader = csv.DictReader(stream, delimiter=",", quotechar='"')
        for row in reader:
            examples[row['Label']].append(row['Text'])
        return examples

    def _preprocess(self):
        """
        Reads the raw training data and populates ``classes``,
        ``num_classes``, ``train_X`` and ``train_y``.

        :raises ValueError: if the input contains no training examples.
        """
        if self._data:
            # newline="" leaves line-ending handling to the csv module, so
            # quoted fields containing line breaks stay intact; the ``with``
            # block guarantees the file handle is closed again.
            with open(self._data, "r", encoding="UTF-8", newline="") as data:
                d = self._read_examples(data)
        else:
            logging.warning("--data not found, assuming input from STDIN")
            # STDIN is owned by the interpreter, so it is not closed here.
            d = self._read_examples(sys.stdin)

        logging.debug("Examples per class:")
        for k, v in d.items():
            logging.debug("%s %d" % (k, len(v)))
        logging.debug("Total training examples: %d\n" %
                      sum(len(v) for v in d.values()))

        self.classes = sorted(d)
        self.num_classes = len(self.classes)

        l = []
        logging.debug("Samples from the data:")
        for k, values in d.items():
            logging.debug("%s\t%s" % (values[0], k))
            l.extend((value, k) for value in values)

        if not l:
            # zip(*[]) yields nothing to unpack; fail with a readable message.
            raise ValueError("no training examples found in the input data")

        # shuffle, just to be sure
        if self._seed is None:
            random.shuffle(l)
        else:
            # private generator: reproducible without touching global state
            random.Random(self._seed).shuffle(l)
        self.train_X, self.train_y = zip(*l)

    def _build_pipeline(self):
        """
        Builds an sklearn Pipeline. The pipeline consists of a kind of
        vectorizer, followed by a kind of classifier.
        """
        self.vectorizer = TfidfVectorizer(stop_words=None)
        # alternative classifier that was tried:
        # self.classifier = KNeighborsClassifier(n_jobs=-1, algorithm='auto', n_neighbors=100)
        self.classifier = MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', learning_rate='adaptive',
                                        early_stopping=True, n_iter_no_change=10, verbose=True)

        self.pipeline = Pipeline([
            ("vectorizer", self.vectorizer),
            ("clf", self.classifier)
        ])

        logging.debug(self.vectorizer)
        logging.debug(self.classifier)
        logging.debug(self.pipeline)

    def _fit(self):
        """
        Fits the pipeline on the preprocessed ``train_X`` / ``train_y``.
        """
        self.pipeline.fit(self.train_X, self.train_y)

    def save(self):
        """
        Saves the whole pipeline to a pickled file at the ``model`` path.
        """
        try:
            import joblib
        except ImportError:
            # Fall back to the copy that older scikit-learn releases exposed.
            from sklearn.externals import joblib
        joblib.dump(self.pipeline, self._model)
        logging.debug("Classifier saved to '%s'" % self._model)

## Function definitions for controlling vectorization and random search

In [3]:
def do_random_search(trainer, models, cv=10, random_state=None):
    """
    Run a randomized hyper-parameter search for an ``MLPClassifier``.

    The trainer's texts are converted to TF-IDF features, then ``models``
    parameter combinations are sampled from the grid below and scored by
    accuracy under cross-validation. A dask progress bar is displayed while
    the search is fitting.

    :param trainer: object exposing ``train_X`` (texts) and ``train_y``
        (labels), e.g. a ``Trainer`` whose data has been preprocessed.
    :param models: number of parameter settings to sample (``n_iter``).
    :param cv: forwarded as the search's ``cv`` argument (default 10).
    :param random_state: seed forwarded to both the parameter sampler and
        the classifier; the default ``None`` leaves the search unseeded.
    :return: the fitted ``RandomizedSearchCV`` object.
    """
    # NOTE(review): the vectorizer is fitted on ALL texts before the folds
    # are split, so each validation fold influences the TF-IDF features.
    # Searching over a vectorizer+classifier Pipeline would avoid that.
    X = vectorize_features(trainer)
    y = trainer.train_y
    clf = MLPClassifier(random_state=random_state)

    # things to be tested
    hidden_layer_size_range = range(20, 210, 10)
    activation_values = ['identity', 'logistic', 'tanh', 'relu']
    solver_values = ['lbfgs', 'sgd', 'adam']
    learning_rate_values = ['constant', 'invscaling', 'adaptive']
    learning_rate_init_values = [0.1, 0.01, 0.001, 0.0001]
    momentum_range = [n / 10 for n in range(0, 11, 1)]  # only used with solver='sgd'
    nesterovs_momentum_values = [True, False]  # only used with solver='sgd'

    # constants (single-element lists so every sampled setting shares them)
    EARLY_STOPPING = [True]
    N_ITER_NO_CHANGE = [5]
    VALIDATION_FRACTION = [0.1]

    param_grid = {'hidden_layer_sizes': hidden_layer_size_range,
                  'activation': activation_values,
                  'solver': solver_values,
                  'learning_rate': learning_rate_values,
                  'learning_rate_init': learning_rate_init_values,
                  'momentum': momentum_range,
                  'nesterovs_momentum': nesterovs_momentum_values,
                  'early_stopping': EARLY_STOPPING,
                  'validation_fraction': VALIDATION_FRACTION,
                  'n_iter_no_change': N_ITER_NO_CHANGE}

    random_search = RandomizedSearchCV(estimator=clf,
                                       param_distributions=param_grid,
                                       n_iter=models,
                                       cv=cv,
                                       scoring='accuracy',
                                       random_state=random_state,
                                       n_jobs=-1,  # will use all available cores
                                       return_train_score=True)
    with ProgressBar():
        random_search.fit(X, y)
    return random_search


def vectorize_features(trainer):
    """
    Fit a default ``TfidfVectorizer`` on ``trainer.train_X`` and return the
    resulting feature matrix.
    """
    return TfidfVectorizer().fit_transform(trainer.train_X)


# Training data location and number of parameter settings to sample.
TRAIN_CSV = '../csv/train.csv'
N_MODELS = 18

my_trainer = Trainer(data=TRAIN_CSV)
# Only load the data here; the search below fits its own classifiers.
my_trainer._preprocess()
random_search = do_random_search(my_trainer, models=N_MODELS)

FileNotFoundError: [Errno 2] No such file or directory: 'csv/train.csv'

## Inspecting results
The cross-validation results are shown as a data frame, sorted by test-score rank.

In [None]:
# All cross-validation results, best-ranked parameter setting first.
df = (
    pd.DataFrame
    .from_dict(random_search.cv_results_)
    .sort_values(by=["rank_test_score"])
)

# Keep only the columns needed to compare the sampled settings.
relevant_columns = ['rank_test_score', 'mean_test_score', 'params', 'mean_fit_time']
df_relevant = df[relevant_columns]
df_relevant

The best parameters are:

In [None]:
# Bare expression so the notebook renders the best parameter setting found by the search.
random_search.best_params_