In [1]:
! pip install --quiet fuzzywuzzy
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install --quiet lightgbm
! pip install --quiet lime
# ! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
! pip install --quiet torch
! pip install --quiet transformers

In [52]:
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation notices raised by the libraries
# imported in this notebook.
warnings.filterwarnings('ignore')

In [1]:
# Enable the IPython autoreload extension; mode 2 reloads imported modules
# before each cell runs, so edits to the project packages imported in this
# cell are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from scipy.sparse import vstack 

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer
from validation.scoring import bubbleup_score, BubbleUpMixin

# Limit the displayed width of DataFrame text columns to 50 characters.
pd.set_option('max_colwidth',50)

import matplotlib.pyplot as plt
# Default size for every figure created in this notebook.
plt.rcParams['figure.figsize'] = [15, 10]

In [2]:
# Run configuration read by the cells below.
# SAMPLE_SIZE: passed to indeed_test_data when the test set is loaded.
SAMPLE_SIZE = 500000
# SOC_LEVEL: passed to dot_train_data and indeed_test_data.
SOC_LEVEL = 3
# BUBBLE_UP: passed to set_bubbles() on the classifier and to get_soc_n() when
# the test labels are prepared for the confusion matrices.
BUBBLE_UP = 2
# PROD: when False, training rows for which get_soc_n(y_train, 2) equals 51
# are removed before fitting.
PROD = True

In [3]:
# Load the training data and a sample of the Indeed test data at the
# configured SOC level.
X_train, y_train = dot_train_data(SOC_LEVEL)
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', SAMPLE_SIZE, SOC_LEVEL)

# With PROD switched off, keep only the training rows for which
# get_soc_n(..., 2) is not 51.
if not PROD:
    noprod_idx = get_soc_n(y_train.astype(str), 2) != 51
    X_train, y_train = X_train[noprod_idx], y_train[noprod_idx]

In [4]:
from transformers import BertForSequenceClassification, BertTokenizer
from transformers import DistilBertModel, DistilBertTokenizer
import torch

In [114]:
# Give every distinct training code a contiguous integer id (ids follow the
# sorted order of the codes), then encode y_train with that mapping as a
# torch tensor.
lookup = {code: position for position, code in enumerate(y_train.sort_values().unique())}
labels = torch.tensor(np.array([lookup[code] for code in y_train]))

In [5]:
# Load the pretrained 'distilbert-base-uncased' tokenizer and encoder; both
# are passed to embed() to turn raw text into vectors.
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
dbert = DistilBertModel.from_pretrained('distilbert-base-uncased')
# bert = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(lookup.items()))

In [30]:
# Set the number of threads PyTorch uses for CPU work.
# NOTE(review): 30 is hard-coded — presumably sized for the original author's
# machine; adjust to the local core count.
torch.set_num_threads(30)

In [62]:
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import TensorDataset, DataLoader

def embed(tokenizer, model, X, batch_size=64):
    """Embed every document in ``X`` and return one vector per document.

    Each document is converted to token ids with ``tokenizer.encode`` (asking
    for at most 512 ids). The id sequences are right-padded with zeros to a
    common length and fed through ``model`` in batches, together with an
    attention mask that is 1 on real tokens and 0 on padding. For each
    document the vector kept is position 0 along the second axis of the first
    element returned by ``model``.

    Parameters
    ----------
    tokenizer : object
        Must provide ``encode(text, max_length=...)`` returning a list of ints.
    model : callable
        Called as ``model(ids, attention_mask=mask)``; must return a sequence
        whose first item is a tensor indexed ``[batch, position, ...]``.
    X : iterable of str
        Documents to embed.
    batch_size : int, default 64
        Number of documents per forward pass.

    Returns
    -------
    numpy.ndarray
        The per-document vectors, concatenated in input order.
    """
    token_ids = [tokenizer.encode(doc, max_length=512) for doc in X]
    attention = pad_sequence([torch.ones(len(seq)) for seq in token_ids], batch_first=True)
    padded_ids = pad_sequence([torch.tensor(seq) for seq in token_ids], batch_first=True)
    loader = DataLoader(TensorDataset(padded_ids, attention), batch_size=batch_size)

    # Inference only: run under no_grad so no autograd graph is built for each
    # batch (the result was already detached, so the output is unchanged).
    with torch.no_grad():
        pooled = [model(batch_ids, attention_mask=batch_mask)[0][:, 0].detach()
                  for batch_ids, batch_mask in loader]

    return torch.cat(pooled).numpy()

In [None]:
# Embed only the first 5000 test documents with DistilBERT.
# NOTE(review): 5000 is a hard-coded subset size — presumably chosen to keep
# the run time manageable.
Xt_test = embed(tokenizer, dbert, X_test[:5000])

In [5]:
from sklearn.neighbors import KNeighborsClassifier

class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    """LogisticRegression combined with BubbleUpMixin; adds no members of its own.

    The notebook calls ``set_bubbles`` on instances of this class, which
    presumably comes from the mixin (defined in ``validation.scoring``).
    """

# Pipelines under evaluation: each one vectorizes the raw text and then
# classifies it. Step names are reused later to name the output files.
models = [
    Pipeline([
        ('sentencespace_100_indeed',
         PreEmbeddedVectorizer('../indeed-embeds/model', cache_dir='indeed_embed_cache')),
        ('lr',
         BubbleUpLogisticRegression(
             C=2.,
             solver='lbfgs',
             class_weight='balanced',
             multi_class="multinomial",
             n_jobs=-1,
         ).set_bubbles(BUBBLE_UP)),
    ]),
]

# Fit every pipeline on the full training set.
for model in models:
    model.fit(X_train, y_train)

In [None]:
# Embed the training documents with the same tokenizer/encoder that produced
# Xt_test. Computing `Xt` here makes the cell runnable on a fresh kernel; no
# other cell in the notebook assigns it.
Xt = embed(tokenizer, dbert, X_train)

# Class-balanced multinomial logistic regression on the DistilBERT vectors.
model = LogisticRegression(C=2., solver='lbfgs', class_weight='balanced', multi_class='multinomial', n_jobs=-1)

model.fit(Xt, y_train)

In [98]:
# Accuracy on the embedded test subset. The label slice is derived from the
# number of embedded rows so the two always stay aligned.
model.score(Xt_test, y_test[:len(Xt_test)])

0.164

In [6]:
# One array of test-set predictions per fitted pipeline, in `models` order.
preds = [pipeline.predict(X_test) for pipeline in models]

In [7]:
# Accuracy of the first pipeline. The true labels are reduced with get_soc_n
# at BUBBLE_UP, the same level given to set_bubbles() on the classifier.
accuracy_score(get_soc_n(y_test.astype(str), BUBBLE_UP).astype(str), preds[0])

0.6546626603778631

# Confusion Matrices

In [10]:
def print_confusion_matrices(models, preds, y, SOC_LEVEL, prod):
    """Write one confusion-matrix CSV per model.

    Despite the name, nothing is printed: each matrix is saved to
    ``confusion-matrices/soc-<SOC_LEVEL>/<prod>/<model name>.csv``. The output
    directory is created if it does not exist.

    Parameters
    ----------
    models : sequence of sklearn Pipeline
        Only ``named_steps`` is read; the step names joined with '-' form the
        file name.
    preds : sequence
        Predictions, one entry per model, in the same order as ``models``.
    y : array-like
        True labels, passed as the first argument to ``confusion_matrix``.
    SOC_LEVEL : int
        Passed to ``get_dictionary`` and used to select the
        ``desc_soc<SOC_LEVEL>`` description column.
    prod : str
        Sub-directory name under ``soc-<SOC_LEVEL>``.

    Returns
    -------
    None
    """
    import os  # local import: only needed to create the output directory

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]
    # Keep the first dictionary row of every 'soc' group.
    un = dot_dict.groupby('soc').apply(lambda df: df.head(1))
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]
    labels = un.soc.astype(str)

    out_dir = f'confusion-matrices/soc-{SOC_LEVEL}/{prod}'
    os.makedirs(out_dir, exist_ok=True)

    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword: it is keyword-only in current
        # scikit-learn releases.
        df = pd.DataFrame(confusion_matrix(y, p, labels=labels),
                          index=category_names,
                          columns=category_names)
        filename = f'{out_dir}/{name}.csv'
        # NOTE(review): index=False drops the row labels, so only the header
        # row of the CSV carries the category names.
        df.to_csv(filename, index=False)

# Write the confusion matrices, comparing predictions against the true labels
# reduced with get_soc_n at the BUBBLE_UP level.
print_confusion_matrices(
    models=models,
    preds=preds,
    y=get_soc_n(y_test.astype(str), BUBBLE_UP).astype(str),
    SOC_LEVEL=BUBBLE_UP,
    prod='withprod' if PROD else 'noprod',
)

# LIME

In [64]:
from lime.lime_text import LimeTextExplainer
from validation.data import make_desc_lookup

def _get_soc_n(df, n):
    return (df.T
            .reset_index()
            .pipe(lambda df: df.assign(soc = df['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            .drop('index', 1)
            .groupby('soc').sum().T)


def get_pred(model, X, n=3):
    """Return class probabilities from ``model``, summed by code prefix.

    Parameters
    ----------
    model : estimator
        Must provide ``predict_proba(X)`` and ``classes_``.
    X : input accepted by ``model.predict_proba``.
    n : int, default 3
        Number of leading characters of each class label to group by.

    Returns
    -------
    pandas.DataFrame
        One row per input, one column per distinct ``n``-character prefix of
        the class labels (see ``_get_soc_n``).
    """
    proba = pd.DataFrame(model.predict_proba(X), columns=model.classes_)
    return _get_soc_n(proba, n)

def run_lime(lookup, doc):
    """Explain the prediction for ``doc`` with LIME and print its top labels.

    The classifier is not a parameter: the name ``model`` is resolved from the
    notebook's global scope when the function is called.

    Parameters
    ----------
    lookup : callable
        Called with a class name; its result is printed next to the name.
    doc : str
        The text to explain.

    Returns
    -------
    The explanation object returned by ``LimeTextExplainer.explain_instance``.
    """
    # NOTE(review): `doc` is passed here as a single string, whereas LIME calls
    # the prediction function with a collection of texts — confirm the
    # pipeline handles both.
    class_names = get_pred(model, doc).columns.tolist()

    def predict_fn(texts):
        return get_pred(model, texts).values

    explanation = LimeTextExplainer(class_names=class_names).explain_instance(
        doc, predict_fn, num_features=10, top_labels=3)

    for label_idx in explanation.available_labels():
        code = class_names[label_idx]
        print(code, lookup(code))

    return explanation


# Rebinds `lookup`: earlier in the notebook the same name holds a dict mapping
# codes to integer ids; from here on it is the object returned by
# make_desc_lookup, which run_lime calls with a class name.
lookup = make_desc_lookup('', 3)

In [None]:
# Explain one randomly chosen test document with the first pipeline.
# run_lime reads the global `model`, so it is bound here first.
model = models[0]

idx = np.random.choice(X_test.index)
# idx = 87583

print(idx)

explanation = run_lime(lookup, X_test[idx])
explanation.show_in_notebook()

In [None]:
# Scratch record of integers — presumably X_test indices worth explaining with
# LIME (TODO confirm). These bare tuple expressions are not assigned to any
# name.
68147, 1246, 82942, 22706, 99539, 6491, 88072, 91587, 61655

(68147, 1246, 17258, 82942, 22706, 99539, 6491)

In [None]:
# Save a LIME explanation as an HTML file for each hand-picked X_test index.
examples = [68147, 99539, 61655, 70538, 1246]

for i in examples:
    explanation = run_lime(lookup, X_test[i])
    explanation.save_to_file(f'lime/lime-example-{i}.html')