In [None]:
! pip install --quiet fuzzywuzzy
! pip install --quiet nltk
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install --quiet lightgbm
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [None]:
%load_ext autoreload 
%autoreload 2

import os
import pandas as pd
import numpy as np
from glob import glob
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.base import BaseEstimator, TransformerMixin
from embed_software.preprocess import *
from embed_software.utils import *
from validation.title_matching import layered_matcher, title_matcher, punct_lookup, exact_matcher
from validation.dot_data import LemmaTokenizer
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix

from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from scipy.sparse import vstack 
from validation.data import indeed_test_data, dot_train_data

pd.set_option('max_colwidth',50)

In [5]:
class PreEmbeddedVectorizer(BaseEstimator, TransformerMixin):
    def __init__(self, embed_path, model, lim, dims):
        self.embed_path = embed_path
        self.model = model
        self.lim = lim
        self.dims = dims
        
    def fit(self, X, y=None):
        self.fit_X = X
        self.embeddings = get_embeddings(self.embed_path, self.lim, self.dims)
        return self
    
    def transform(self, X):
        # Our test set is pre-embedded, but our train set not!
        # This should simply be a cached embedding...? 
        if self.fit_X is X:
            return embed_docs(self.model, '\n'.join(X))
        else:
            return self.embeddings[X.index] 

In [6]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 2

In [8]:
X_train, y_train = dot_train_data(SOC_LEVEL)
X_test, y_test, ids = indeed_test_data('us-everything.csv', SAMPLE_SIZE, SOC_LEVEL)

In [6]:
class VectorizedData():
    def __init__(self, vectorizer, splits, n_jobs):
        self.vectorizer = vectorizer
        self.X_train, self.X_test, self.y_train, self.y_test = splits
        self.n_jobs = n_jobs
        
    def vectorize(self):
        self.V_train = self.vectorizer.fit_transform(self.X_train)
        self.V_test = self.vectorizer.transform(self.X_test)
        
    def top_n_results(self, search, n=5):
        res = search.cv_results_
        tops = np.flip(np.argsort(res['mean_test_score']), 0)[:5]
        scores = np.array(res['mean_test_score'])[tops]
        params = pd.DataFrame(np.array(res['params'])[tops].tolist())
        return params.assign(score = scores)

    def run_search(self, model, param_grid):
        y = pd.concat([self.y_train, self.y_test])
        try:
            X = np.concatenate([self.V_train, self.V_test])
        except ValueError:
            X = vstack([self.V_train, self.V_test]) 
        cv = [(np.arange(0, self.X_train.shape[0]), np.arange(self.X_train.shape[0], X.shape[0]))]
        search = GridSearchCV(model, param_grid=param_grid, cv = cv, n_jobs=8)
        search.fit(X, y)
        return self.top_n_results(search)

# Hyperparameter Search

In [7]:
splits = [X_train, X_test, y_train, y_test]

ss_embedder = PreEmbeddedVectorizer('../ss_embeds/ss_100_us.txt', '../ss_models/sentencespace', SAMPLE_SIZE)
embedded = VectorizedData(ss_embedder,splits, 8)
embedded.vectorize()

tfidf = TfidfVectorizer()
bow = VectorizedData(tfidf, splits, 8)
bow.vectorize()

In [None]:
param_grid = {
    'num_leaves': [9, 31],
    'max_depth': [-1, 2],
    'n_estimators': [100, 400]
}

embedded.run_search(LGBMClassifier(), param_grid)

In [8]:
param_grid = {
    'C': [1.0, 5.0, 10.0, 20.0]
}

print(embedded.run_search(SVC(), param_grid))

      C     score
0  20.0  0.525174
1  10.0  0.519633
2   5.0  0.502294
3   1.0  0.308884


In [9]:
param_grid = {
    'C': [1.0, 5.0, 10.0],
    'multi_class': ['multinomial', 'ovr']
}

print(embedded.run_search(LogisticRegression(solver='newton-cg'), param_grid))

      C  multi_class     score
0   5.0  multinomial  0.537747
1  10.0  multinomial  0.537270
2   1.0  multinomial  0.532444
3  10.0          ovr  0.518620
4   5.0          ovr  0.517786


In [10]:
param_grid = {
    'C': [1.0, 5.0, 10.0],
    'multi_class': ['multinomial', 'ovr']
}

print(bow.run_search(LogisticRegression(solver='newton-cg'), param_grid))

      C  multi_class     score
0   5.0          ovr  0.489900
1  10.0          ovr  0.483287
2   5.0  multinomial  0.475958
3   1.0          ovr  0.469761
4   1.0  multinomial  0.469582


# COMPARE MODELS

In [32]:
models = [
    Pipeline([('tfidf', TfidfVectorizer()),
              ('lr', LogisticRegression(C=5., solver='newton-cg', multi_class="ovr", n_jobs=-1))]),
    Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us_b.txt', '../ss_models/sentencespace_us', SAMPLE_SIZE, 100)),
             ('lr', LogisticRegression(C=5., solver='newton-cg', multi_class="multinomial", n_jobs=-1))]),
#     Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us.txt', '../ss_models/sentencespace', SAMPLE_SIZE)),
#              ('knn', KNeighborsClassifier(7))]),
    Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us_b.txt', '../ss_models/sentencespace_us', SAMPLE_SIZE, 100)),
             ('svc', SVC(C=20., probability=True))]),
#     Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us.txt', '../ss_models/sentencespace', SAMPLE_SIZE)),
#              ('lgbm', LGBMClassifier(n_estimators=400, max_depth=2))])
]

In [33]:
@attr.s
class Predictor():
    X_train = attr.ib()
    y_train = attr.ib()
    X_test = attr.ib()

    def fn(self, m):
        return (m
                .fit(self.X_train, self.y_train)
                .predict(self.X_test))

In [34]:
p = Predictor(X_train, y_train, X_test)

from concurrent.futures import ProcessPoolExecutor as Pool

pool = Pool()
preds = pool.map(p.fn, models)

In [None]:
preds = [p for p in preds]

In [121]:
[accuracy_score(p, y_test) for p in preds]

[0.48990049454805457, 0.5673598283977834, 0.5657510576178275]

In [46]:
p = pd.DataFrame(preds).T.assign(y = y_test.values)

differ = p[p[0] != p[1]]

In [None]:
differ[differ[0] == differ['y']].y.value_counts()

In [34]:
print(classification_report(preds[0], y_test))

              precision    recall  f1-score   support

          11       0.71      0.21      0.32      4534
          13       0.54      0.49      0.51      3119
          15       0.75      0.75      0.75      2641
          17       0.54      0.66      0.59       439
          19       0.12      0.39      0.19        85
          21       0.26      0.34      0.30        96
          23       0.09      0.73      0.15        11
          25       0.56      0.60      0.58       315
          27       0.45      0.54      0.49      1309
          29       0.64      0.52      0.57      1269
          31       0.03      0.95      0.06        19
          33       0.11      0.56      0.19        16
          35       0.52      0.92      0.67       704
          37       0.21      0.47      0.29        40
          39       0.30      0.57      0.39       222
          41       0.27      0.64      0.38       435
          43       0.31      0.54      0.39       937
          45       0.00    

  'recall', 'true', average, warn_for)


In [35]:
print(classification_report(preds[1], y_test))

              precision    recall  f1-score   support

          11       0.28      0.45      0.35       806
          13       0.60      0.66      0.63      2567
          15       0.84      0.73      0.78      3029
          17       0.73      0.40      0.52       999
          19       0.25      0.20      0.22       339
          21       0.40      0.38      0.39       130
          23       0.67      0.70      0.69        88
          25       0.60      0.43      0.50       478
          27       0.46      0.60      0.52      1227
          29       0.75      0.53      0.62      1441
          31       0.30      0.72      0.42       234
          33       0.36      0.30      0.33        97
          35       0.89      0.84      0.86      1315
          37       0.34      0.86      0.49        36
          39       0.68      0.39      0.49       750
          41       0.60      0.57      0.59      1092
          43       0.37      0.57      0.44      1042
          45       0.10    

In [32]:
dot_dict = get_dictionary('', SOC_LEVEL)
un = dot_dict.groupby('soc').apply(lambda df: df.head(1))

In [33]:

model_names = ['-'.join(m.named_steps.keys()) for m in models]
category_names = un['desc_soc{}'.format(SOC_LEVEL)]
for name,p in zip(model_names, preds):
    df = pd.DataFrame(confusion_matrix(y_test, p, un.soc), 
                      index=category_names, 
                      columns=category_names)
    df.to_csv('confusion-matrices/soc-{}/{}.csv'.format(SOC_LEVEL, name), index=False)