In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from scipy.sparse import issparse, vstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from classification.embedding import PreEmbeddedVectorizer
from validation.data import indeed_test_data, dot_train_data, get_soc_n
from validation.dot_data import LemmaTokenizer, get_dictionary
from validation.scoring import BubbleUpMixin

In [2]:
# Notebook-wide settings, consumed by the data-loading cell.
SAMPLE_SIZE = 10 ** 5  # second argument to indeed_test_data
SOC_LEVEL = 6          # passed to both dot_train_data and indeed_test_data

In [3]:
# Training inputs and labels, requested at SOC_LEVEL.
X_train, y_train = dot_train_data(SOC_LEVEL)

# Test inputs, labels and ids read from the CSV, requested with SAMPLE_SIZE and SOC_LEVEL.
X_test, y_test, ids = indeed_test_data(
    '../data/us/everything.csv',
    SAMPLE_SIZE,
    SOC_LEVEL,
)

# Keep only the first three characters of each test label's string form.
# NOTE(review): the training labels are not truncated here — presumably the
# BubbleUp* wrappers reconcile the two levels; confirm in validation.scoring.
y_test = y_test.map(lambda label: str(label)[:3])

In [18]:
# Cast the training labels to strings.
# NOTE(review): presumably needed so they are the same type as the test labels,
# which the loading cell converts with str(); confirm. This cell's execution
# count is out of sequence, so check it still runs in order on a fresh kernel.
y_train = y_train.astype(str)

0        516031
1        272011
2        111011
3        212099
4        171011
5        172121
6        171011
7        173011
8        172011
9        172011
10       419031
11       172112
12       172011
13       172011
14       172011
15       172011
16       172011
17       172011
18       173021
19       173013
20       173021
21       172072
22       172071
23       172071
24       172071
25       172071
26       172072
27       172061
28       172072
29       172071
          ...  
19500    537111
19501    537111
19502    537111
19503    537111
19504    537111
19505    537111
19506    537111
19507    537111
19508    537111
19509    537111
19510    537111
19511    537121
19512    537121
19513    537121
19514    537121
19515    537121
19516    537121
19517    537121
19518    537121
19519    537121
19520    537121
19521    537121
19522    537121
19523    537121
19524    537121
19525    537121
19526    537121
19527    537121
19528    537121
19529    537121
Length: 32398, dtype: ob

# Hyperparameter Search

In [4]:
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    """LogisticRegression with BubbleUpMixin placed first in the MRO.

    Adds no behaviour of its own. NOTE(review): what the mixin overrides lives
    in validation.scoring and is not visible here — confirm there.
    """

class BubbleUpSVC(BubbleUpMixin, SVC):
    """SVC with BubbleUpMixin placed first in the MRO.

    Adds no behaviour of its own. NOTE(review): what the mixin overrides lives
    in validation.scoring and is not visible here — confirm there.
    """

class BubbleUpLGBM(BubbleUpMixin, LGBMClassifier):
    """LGBMClassifier with BubbleUpMixin placed first in the MRO.

    Adds no behaviour of its own. NOTE(review): what the mixin overrides lives
    in validation.scoring and is not visible here — confirm there.
    """

class BubbleUpKNN(BubbleUpMixin, KNeighborsClassifier):
    """KNeighborsClassifier with BubbleUpMixin placed first in the MRO.

    Adds no behaviour of its own. NOTE(review): what the mixin overrides lives
    in validation.scoring and is not visible here — confirm there.
    """

In [5]:
class VectorizedData():
    """Bundle a train/test split with a vectorizer and grid-search models on it.

    Parameters
    ----------
    vectorizer : object
        Anything exposing ``fit_transform`` and ``transform``.
    splits : sequence
        ``(X_train, X_test, y_train, y_test)``, unpacked in exactly that order.
    n_jobs : int
        Forwarded to ``GridSearchCV`` as its ``n_jobs`` argument.
    """

    def __init__(self, vectorizer, splits, n_jobs):
        self.vectorizer = vectorizer
        self.X_train, self.X_test, self.y_train, self.y_test = splits
        self.n_jobs = n_jobs

    def vectorize(self):
        """Fit the vectorizer on the training inputs, then transform both splits.

        Results are stored on ``self.V_train`` and ``self.V_test``; this must be
        called before ``run_search``.
        """
        self.V_train = self.vectorizer.fit_transform(self.X_train)
        self.V_test = self.vectorizer.transform(self.X_test)

    @staticmethod
    def _stack_rows(train, test):
        """Stack two feature matrices row-wise, keeping sparse input sparse."""
        if issparse(train) or issparse(test):
            # CSR supports the row indexing GridSearchCV performs on each fold.
            return vstack([train, test], format='csr')
        return np.concatenate([train, test])

    def top_n_results(self, search, n=5):
        """Return the ``n`` best parameter settings of a fitted search.

        Parameters
        ----------
        search : object
            Fitted search exposing ``cv_results_`` with ``'mean_test_score'``
            and ``'params'`` entries.
        n : int, default 5
            Maximum number of rows to return.

        Returns
        -------
        pandas.DataFrame
            One row per setting, best first, with one column per parameter
            plus a ``score`` column.
        """
        res = search.cv_results_
        scores = np.asarray(res['mean_test_score'])
        # Ascending argsort reversed gives best-first; honour the caller's n.
        tops = np.flip(np.argsort(scores), 0)[:n]
        params = pd.DataFrame([res['params'][i] for i in tops])
        return params.assign(score=scores[tops])

    def run_search(self, model, param_grid):
        """Grid-search ``model`` using the fixed train/test split as the only fold.

        Parameters
        ----------
        model : estimator
            Estimator handed to ``GridSearchCV``.
        param_grid : dict
            Parameter grid handed to ``GridSearchCV``.

        Returns
        -------
        pandas.DataFrame
            The output of ``top_n_results`` for the fitted search.
        """
        y = pd.concat([self.y_train, self.y_test])
        X = self._stack_rows(self.V_train, self.V_test)
        # Single predefined fold: fit on the training rows, score on the test rows.
        n_train = self.V_train.shape[0]
        cv = [(np.arange(n_train), np.arange(n_train, X.shape[0]))]
        # Only cv_results_ is read below, so skip the final refit on train + test.
        search = GridSearchCV(model, param_grid=param_grid, cv=cv,
                              n_jobs=self.n_jobs, refit=False)
        search.fit(X, y)
        return self.top_n_results(search)

In [19]:
# Order matters: VectorizedData unpacks this as X_train, X_test, y_train, y_test.
splits = [X_train, X_test, y_train, y_test]

ss_embedder = PreEmbeddedVectorizer(
    '../indeed-embeds/model',
    cache_dir='indeed_embed_cache',
)

# Third argument is the n_jobs used for every grid search run on this data.
embedded = VectorizedData(ss_embedder, splits, 8)
embedded.vectorize()

# tfidf = TfidfVectorizer()
# bow = VectorizedData(tfidf, splits, 8)
# bow.vectorize()

In [None]:
# LightGBM grid: 2 x 2 x 2 = 8 candidate settings.
param_grid = dict(
    num_leaves=[9, 31],
    max_depth=[-1, 2],
    n_estimators=[100, 400],
)

print(
    embedded.run_search(
        BubbleUpLGBM(class_weight='balanced', n_jobs=3),
        param_grid,
    )
)

In [None]:
# SVC grid over the regularisation strength C only.
param_grid = dict(C=[0.25, 0.5, 1.0, 5.0, 10.0, 20.0, 50.0])

print(
    embedded.run_search(
        BubbleUpSVC(probability=True, class_weight='balanced'),
        param_grid,
    )
)

In [20]:
# Logistic-regression grid over the regularisation strength C only.
param_grid = dict(C=[0.25, 0.5, 1.0, 5.0])

print(
    embedded.run_search(
        BubbleUpLogisticRegression(
            multi_class='multinomial',
            solver='lbfgs',
            class_weight='balanced',
            n_jobs=6,
        ),
        param_grid,
    )
)

      C     score
0   1.0  0.499441
1   5.0  0.487555
2  10.0  0.466196


In [13]:
# Fit a single logistic regression on the embedded training data; C is not
# passed, so the estimator's default is used.
model = BubbleUpLogisticRegression(
    multi_class='multinomial',
    solver='lbfgs',
    class_weight='balanced',
    n_jobs=-1,
)

model.fit(embedded.V_train, embedded.y_train)

BubbleUpLogisticRegression(C=1.0, class_weight='balanced', dual=False,
                           fit_intercept=True, intercept_scaling=1,
                           l1_ratio=None, max_iter=100,
                           multi_class='multinomial', n_jobs=-1, penalty='l2',
                           random_state=None, solver='lbfgs', tol=0.0001,
                           verbose=0, warm_start=False)

In [15]:
# Accuracy of the fitted model's predictions on the embedded test set.
# accuracy_score is imported with the other dependencies in the first cell,
# so the notebook's dependencies are all declared in one place.
accuracy_score(y_test, model.predict(embedded.V_test))

0.49944144063631085

In [None]:
# Bag-of-words baseline. `bow` is built here because its only other definition
# in the notebook is commented out, which made this cell fail with a NameError
# on a fresh kernel.
bow = VectorizedData(TfidfVectorizer(), splits, 8)
bow.vectorize()

param_grid = {
    'C': [1.0, 5.0, 10.0],
}

# NOTE(review): this search uses plain LogisticRegression, whereas the searches
# on the embedded data use the BubbleUp* wrappers — confirm that is intended.
print(bow.run_search(LogisticRegression(multi_class='multinomial', solver='lbfgs', class_weight='balanced'), param_grid))

      C  multi_class     score
0   5.0          ovr  0.489900
1  10.0          ovr  0.483287
2   5.0  multinomial  0.475958
3   1.0          ovr  0.469761
4   1.0  multinomial  0.469582
