In [None]:

import numpy as np
import pandas as pd
from scipy.sparse import issparse, vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

from classification.embedding import PreEmbeddedVectorizer
from validation.data import indeed_test_data, dot_train_data, get_soc_n
from validation.dot_data import LemmaTokenizer, get_dictionary

In [None]:
# Run-wide settings; edit here rather than in the cells below.
# SOC_LEVEL is handed to both data loaders (presumably the SOC code
# granularity to classify at -- confirm against validation.data).
SOC_LEVEL = 3
# SAMPLE_SIZE is handed to the Indeed test loader and to the pre-embedded
# vectorizer (presumably the number of postings sampled -- confirm).
SAMPLE_SIZE = 500 * 1000

In [None]:
# Training split comes from the DOT loader, test split from the Indeed loader;
# both receive the same SOC_LEVEL.
X_train, y_train = dot_train_data(SOC_LEVEL)

# `ids` is returned alongside the test features/labels (presumably posting
# identifiers -- confirm against validation.data.indeed_test_data).
(X_test, y_test, ids) = indeed_test_data(
    'data/us/everything.csv',
    SAMPLE_SIZE,
    SOC_LEVEL
)

# Hyperparameter Search

In [None]:
class VectorizedData():
    def __init__(self, vectorizer, splits, n_jobs):
        self.vectorizer = vectorizer
        self.X_train, self.X_test, self.y_train, self.y_test = splits
        self.n_jobs = n_jobs
        
    def vectorize(self):
        self.V_train = self.vectorizer.fit_transform(self.X_train)
        self.V_test = self.vectorizer.transform(self.X_test)
        
    def top_n_results(self, search, n=5):
        res = search.cv_results_
        tops = np.flip(np.argsort(res['mean_test_score']), 0)[:5]
        scores = np.array(res['mean_test_score'])[tops]
        params = pd.DataFrame(np.array(res['params'])[tops].tolist())
        return params.assign(score = scores)

    def run_search(self, model, param_grid):
        y = pd.concat([self.y_train, self.y_test])
        try:
            X = np.concatenate([self.V_train, self.V_test])
        except ValueError:
            X = vstack([self.V_train, self.V_test]) 
        cv = [(np.arange(0, self.X_train.shape[0]), np.arange(self.X_train.shape[0], X.shape[0]))]
        search = GridSearchCV(model, param_grid=param_grid, cv = cv, n_jobs=8)
        search.fit(X, y)
        return self.top_n_results(search)

In [None]:
# Both feature representations are built from the same train/test split.
splits = [X_train, X_test, y_train, y_test]

# Representation 1: the project's pre-embedded vectorizer.
ss_embedder = PreEmbeddedVectorizer(
    '../ss_embeds/ss_100_us.txt',
    '../ss_models/sentencespace',
    SAMPLE_SIZE
)
embedded = VectorizedData(vectorizer=ss_embedder, splits=splits, n_jobs=8)
embedded.vectorize()

# Representation 2: TF-IDF with default settings.
tfidf = TfidfVectorizer()
bow = VectorizedData(vectorizer=tfidf, splits=splits, n_jobs=8)
bow.vectorize()

In [None]:
# Gradient-boosted trees on the embedded features.
# NOTE(review): LGBMClassifier is not imported in the notebook's import cell;
# presumably it needs `from lightgbm import LGBMClassifier` -- confirm and add.
param_grid = dict(
    num_leaves=[9, 31],
    max_depth=[-1, 2],
    n_estimators=[100, 400],
)

# Last expression of the cell, so the result table is displayed.
embedded.run_search(LGBMClassifier(), param_grid)

In [None]:
# Support-vector classifier on the embedded features, varying only C.
param_grid = dict(C=[1.0, 5.0, 10.0, 20.0])

print(embedded.run_search(SVC(), param_grid))

      C     score
0  20.0  0.525174
1  10.0  0.519633
2   5.0  0.502294
3   1.0  0.308884


In [None]:
# Logistic regression on the embedded features: C crossed with the
# multi-class strategy.
param_grid = dict(
    C=[1.0, 5.0, 10.0],
    multi_class=['multinomial', 'ovr'],
)

print(embedded.run_search(LogisticRegression(solver='newton-cg'), param_grid))

      C  multi_class     score
0   5.0  multinomial  0.537747
1  10.0  multinomial  0.537270
2   1.0  multinomial  0.532444
3  10.0          ovr  0.518620
4   5.0          ovr  0.517786


In [None]:
# Same logistic-regression grid as for the embedded features, but run on the
# TF-IDF representation so the two can be compared.
param_grid = dict(
    C=[1.0, 5.0, 10.0],
    multi_class=['multinomial', 'ovr'],
)

print(bow.run_search(LogisticRegression(solver='newton-cg'), param_grid))

      C  multi_class     score
0   5.0          ovr  0.489900
1  10.0          ovr  0.483287
2   5.0  multinomial  0.475958
3   1.0          ovr  0.469761
4   1.0  multinomial  0.469582
