In [1]:
# Reload imported modules automatically before each cell runs, so edits to the
# project packages imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import KNeighborsClassifier

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer

import matplotlib.pyplot as plt
# Default size (width, height in inches) for every matplotlib figure in this notebook.
plt.rcParams['figure.figsize'] = [15, 10]

In [2]:
# SOC level handed to dot_train_data when the training set is loaded.
SOC_LEVEL = 3

In [3]:
# Load the DOT training inputs and labels at the configured SOC level.
# NOTE(review): include_tasks=False presumably leaves task statements out of
# the training text — confirm against validation.data.dot_train_data.
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=False)

In [20]:
# Logistic-regression candidates: every pipeline pairs one embedding vectorizer
# with its own, identically configured multinomial LogisticRegression.
# NOTE(review): the next cell also assigns `models`, so on a top-to-bottom run
# these pipelines are replaced before they are evaluated.
models = [
    Pipeline([
        (step_name, vectorizer),
        ('lr', LogisticRegression(C=5., solver='newton-cg', class_weight='balanced',
                                  multi_class="multinomial", n_jobs=-1)),
    ])
    for step_name, vectorizer in (
        ('glove_100_va',
         WordEmbeddingVectorizer('../glove-models/glove-va-100.txt',
                                 cache_dir='va_glove_embed_cache')),
        ('sentencespace_100_va',
         PreEmbeddedVectorizer('../ss-models/va-ss-100',
                               cache_dir='va_embed_cache')),
        ('sentencespace_100_indeed',
         PreEmbeddedVectorizer('../indeed-embeds/model',
                               cache_dir='indeed_embed_cache')),
    )
]

In [25]:
# k-nearest-neighbour candidates: every pipeline pairs one embedding vectorizer
# with its own distance-weighted 3-NN classifier. This assignment replaces any
# `models` list defined earlier in the notebook.
models = [
    Pipeline([
        (step_name, vectorizer),
        ('knn', KNeighborsClassifier(3, weights='distance', n_jobs=-1)),
    ])
    for step_name, vectorizer in (
        ('glove_100_va',
         WordEmbeddingVectorizer('../glove-models/glove-va-100.txt',
                                 cache_dir='va_glove_embed_cache')),
        ('sentencespace_100_va',
         PreEmbeddedVectorizer('../ss-models/va-ss-100',
                               cache_dir='va_embed_cache')),
        ('sentencespace_100_indeed',
         PreEmbeddedVectorizer('../indeed-embeds/model',
                               cache_dir='indeed_embed_cache')),
    )
]

In [None]:
# WITHIN DOT ACCURACY
# Out-of-sample predictions on the DOT training set: under leave-one-out, each
# sample is predicted by a pipeline fitted on all the remaining samples.
from sklearn.model_selection import cross_val_predict, LeaveOneOut

preds = [
    cross_val_predict(pipeline, X_train, y_train, cv=LeaveOneOut())
    for pipeline in models
]

In [None]:
# Display the cross-validated predictions (one entry per pipeline in `models`).
preds

In [None]:
def print_confusion_matrices(models, preds, y, path, SOC_LEVEL):
    """Write one confusion-matrix CSV per model.

    Parameters
    ----------
    models : sequence
        Objects exposing ``named_steps`` (e.g. sklearn ``Pipeline``). They are
        only used for naming: each model's name is its step names joined
        with ``'-'``.
    preds : sequence
        Predicted labels, one entry per model, in the same order as ``models``.
    y : array-like
        True labels that every entry of ``preds`` is compared against.
    path : str
        ``str.format`` template with two positional fields, filled with
        ``SOC_LEVEL`` and the model name, e.g. ``'out/dot-{}/{}.csv'``.
    SOC_LEVEL : int
        Passed to ``get_dictionary`` and used to select the
        ``desc_soc<SOC_LEVEL>`` description column.

    Returns
    -------
    None
        The matrices are written to disk; missing output directories are
        created.
    """
    # Local import so this cell stays runnable without touching the import cell.
    import os

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]
    # Keep the first dictionary row of every 'soc' group: its 'soc' value is the
    # class label and its description becomes the human-readable header.
    un = dot_dict.groupby('soc').apply(lambda df: df.head(1))
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]
    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword: it is keyword-only in current
        # scikit-learn. It fixes the row/column order to the order of `un.soc`,
        # which is also the order of `category_names`.
        df = pd.DataFrame(confusion_matrix(y, p, labels=un.soc),
                          index=category_names,
                          columns=category_names)
        filename = path.format(SOC_LEVEL, name)
        # Create the target directory on first use; to_csv does not do it.
        out_dir = os.path.dirname(filename)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
        # NOTE(review): index=False means the row labels are not written; rows
        # appear in the same category order as the header columns.
        df.to_csv(filename, index=False)

In [None]:
# Pass the notebook-wide SOC_LEVEL rather than a literal, so the label
# dictionary and output path always match the level y_train was loaded at.
print_confusion_matrices(models, preds, y_train, 'confusion-matrices/dot-{}/{}.csv', SOC_LEVEL)