In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.scoring import bubbleup_score, BubbleUpMixin
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

pd.set_option('max_colwidth',50)
pd.set_option('display.width',200)

In [2]:
# Level handed to dot_train_data when the training set is loaded
# (presumably the number of SOC digits kept in the labels — confirm in validation.data).
SOC_LEVEL = 6

In [3]:
# Load training texts and labels at SOC_LEVEL. include_tasks=False is forwarded
# to the loader; its exact effect is defined in validation.data.
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=False)

In [4]:
# Keep only rows whose get_soc_n(..., 2) value equals 51 (presumably the 2-digit SOC group).
# NOTE(review): the mask is called `noprod_idx` but it *selects* group 51, and the CSV
# written by the last cell is suffixed "-onlyprod" — the name looks inverted; confirm.
# X_train / y_train are overwritten here, so the unfiltered data is only available
# again by re-running the load cell.
noprod_idx = get_soc_n(y_train.astype(str), 2) == 51
X_train, y_train = X_train[noprod_idx], y_train[noprod_idx]

In [5]:
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    """LogisticRegression combined with the project's BubbleUpMixin.

    The class adds no behaviour of its own. `set_bubbles` is called on an
    instance when the pipeline below is built; it is presumably supplied by
    the mixin (confirm in validation.scoring).
    """
    pass

# Candidate pipelines. Only the sentencespace/indeed pipeline is active; the
# others are kept commented out.
# NOTE(review): with a single active entry, `models[1]` (used in a later cell)
# is out of range until another pipeline is re-enabled.
models = [
#     Pipeline([('glove_100_va', WordEmbeddingVectorizer('../glove-models/glove-va-100.txt', cache_dir='va_glove_embed_cache')),
#               ('lr', LogisticRegression(C=1., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1))]),
#     Pipeline([('sentencespace_100_va', PreEmbeddedVectorizer('../ss-models/va-ss-100', cache_dir='va_embed_cache')),
#               ('lr', LogisticRegression(C=1., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1))]),
    Pipeline([('sentencespace_100_indeed', PreEmbeddedVectorizer('../indeed-embeds/model', cache_dir='indeed_embed_cache')),
              ('lr', BubbleUpLogisticRegression(C=2., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1).set_bubbles(3))]),
#     Pipeline([('tfidf', TfidfVectorizer(min_df=10, max_df=.7, ngram_range=(1,2))),
#               ('lr', LogisticRegression(C=500., solver='lbfgs', class_weight='balanced', multi_class="multinomial", n_jobs=-1))])
]

In [6]:
# WITHIN DOT ACCURACY 
from sklearn.model_selection import cross_val_predict, LeaveOneOut

def in_sample_predict(model, X_train, y_train):
    """Fit `model` on the given data and predict on that same data.

    The estimator is fitted in place, so the caller's `model` is left trained.
    The returned predictions are for the training inputs themselves, so any
    score computed from them is an in-sample figure, not a held-out estimate.
    """
    model.fit(X_train, y_train)
    in_sample = model.predict(X_train)
    return in_sample

# Prediction slots filled in by position in later cells (preds[i] pairs with models[i]).
preds = [None] * 4

In [7]:
# Fit the first pipeline and keep its predictions on the training data.
preds[0] = in_sample_predict(models[0], X_train, y_train)

In [8]:
# In-sample accuracy: true labels are reduced with get_soc_n(..., 3) and cast to str
# before being compared with the predictions.
# presumably level 3 matches what the classifier bubbles up to (set_bubbles(3)) — confirm.
accuracy_score(get_soc_n(y_train.astype(str), 3).astype(str), preds[0])

# preds[0], get_soc_n(y_train.astype(str), 2).astype(str)

0.6823218997361478

In [None]:
# NOTE(review): `models` currently holds a single pipeline, so models[1] raises
# IndexError unless a second entry is re-enabled in the models cell.
preds[1] = in_sample_predict(models[1], X_train, y_train)
accuracy_score(y_train, preds[1])

In [None]:
from validation.scoring import make_code_lookup
from validation.data import make_desc_lookup

# Callable used below to turn a code into a description (presumably a
# code -> text lookup; confirm in validation.data).
# NOTE(review): the literal 6 equals SOC_LEVEL defined near the top of the notebook.
desc_lookup = make_desc_lookup('', 6)

In [None]:
# Side-by-side frame: each training text, its true code, the predicted code,
# and the looked-up description for both.
# NOTE(review): preds[2] is never assigned in the cells shown (it is still the
# initial None), so this cell fails on a fresh run — confirm which slot was intended.
comp = pd.DataFrame({ 'content': X_train, 
                      'dot_desc': [desc_lookup(y) for y in y_train], 
                      'pred_desc': [desc_lookup(p) for p in preds[2]], 
                      'dots': y_train, 
                      'predicted': preds[2]})

# Boolean mask of rows whose prediction differs from the true label.
falses = preds[2] != y_train
mistakes = comp[falses]

In [None]:
# Mean character length of the text over all rows.
comp.content.str.len().mean()

In [None]:
# Same statistic restricted to the misclassified rows, for comparison.
mistakes.content.str.len().mean()

In [None]:
# Text of one misclassified row, selected by index label (83 is hard-coded and
# only valid while that label is present in `mistakes`).
mistakes.loc[83].content

In [None]:
# Fraction of misclassified rows whose text contains this phrase.
mistakes.content.str.contains('performs duties as described under').sum() / mistakes.shape[0]

In [None]:
# Export every row (correct or not) whose text contains the phrase; the file is
# written relative to the current working directory.
comp[comp.content.str.contains('duties as described under')].to_csv('weird-duties.csv', index=False)

In [None]:
# Same export for the longer phrase starting with "performs".
comp[comp.content.str.contains('performs duties as described under')].to_csv('weird-performs-duties.csv', index=False)

In [None]:
# All misclassified texts as an array (the displayed output can be large).
mistakes.content.values

In [None]:
# Training texts whose label equals the hard-coded code 119111.
X_train[y_train == 119111].values

In [None]:
# Text of another misclassified row, selected by hard-coded index label 930.
mistakes.loc[930].content

In [None]:
# Label-based slice of the mistakes frame (index labels 800 through 1200, both ends included).
mistakes.loc[800:1200]

In [50]:
# Load the dictionary at level 3; its `soc` and `desc_soc3` columns are used in the next cell.
dot_dict = get_dictionary('', 3)

In [58]:
# First dictionary row for each `soc` value, then show the level-3 description for code 291.
d = dot_dict.groupby('soc').head(1)
d[d.soc == 291].desc_soc3

355    Health Diagnosing and Treating Practitioners
Name: desc_soc3, dtype: object

In [48]:
def print_confusion_matrices(models, preds, y, path, SOC_LEVEL):
    """Write one confusion-matrix CSV per (model, predictions) pair.

    Parameters
    ----------
    models : sequence of sklearn Pipeline
        Only the step names are read; they are joined with '-' to form the
        model name inserted into the output filename.
    preds : sequence of array-like
        Predicted codes, paired positionally with `models` (extra entries on
        either side are ignored, as with ``zip``).
    y : array-like
        True codes. Rows/columns of the matrix are restricted to dictionary
        codes that occur in `y`.
    path : str
        Format string with two positional fields, filled with
        ``(SOC_LEVEL, model_name)``.
    SOC_LEVEL : int
        Forwarded to ``get_dictionary`` and used to pick the
        ``desc_soc{SOC_LEVEL}`` description column.

    Notes
    -----
    The CSV is written with ``index=False``, so only the column header carries
    the category names; row i corresponds to column i.
    """
    import os  # local import so the function does not depend on the notebook's import cell

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]

    # One dictionary row per SOC code, ordered by code. groupby.head keeps the
    # `soc` column in the frame, unlike groupby.apply on newer pandas releases.
    un = dot_dict.groupby('soc').head(1).sort_values('soc')
    un = un[un.soc.isin(y)].reset_index(drop=True)
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]

    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword: it is keyword-only in current scikit-learn.
        df = pd.DataFrame(confusion_matrix(y, p, labels=un.soc),
                          index=category_names,
                          columns=category_names)
        filename = path.format(SOC_LEVEL, name)
        # Create the target directory on demand so a fresh checkout can run this cell.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        df.to_csv(filename, index=False)

In [49]:
# Write the in-sample confusion matrix for the first pipeline, with labels reduced via get_soc_n(..., 3).
print_confusion_matrices(
    [models[0]],
    [preds[0].astype(int)],
    get_soc_n(y_train.astype(str), 3),
    'confusion-matrices/insample-{}/{}-onlyprod.csv',
    3,
)