In [None]:
! pip install --quiet strsim
! pip install --quiet fuzzywuzzy
! pip install --quiet nltk
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install --quiet lightgbm
! pip install --quiet lime
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
# Installs numba from the `numba` conda channel (`--yes` skips the confirmation prompt).
# NOTE(review): no visible cell imports numba directly — presumably it is needed by
# one of the packages installed above; confirm.
! conda install -c numba --yes numba

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from scipy.sparse import vstack 

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from validation.scoring import BubbleUpMixin
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from validation.data import virginia_test_data
from classification.embedding import PreEmbeddedVectorizer, Embedding, WordEmbeddingVectorizer

pd.set_option('max_colwidth',50)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

In [2]:
SOC_LEVEL = 6

In [3]:
X_train, y_train = dot_train_data(SOC_LEVEL)
X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', SOC_LEVEL)
# X_test, y_test = matches.description, get_soc_n(matches.code, 6)

In [4]:
va_df['title'] = va_df.title.str.lower()

In [None]:
from validation.data import make_matcher

# Keep only postings that carry an O*NET code. reset_index() moves the original
# row labels into a column, which is restored as the index of `matches` below
# (the column is called 'index' as long as va_df's index is unnamed).
has_onet_code = va_df.onet_soc_code.notna()
d = va_df.loc[has_onet_code].reset_index()

matcher = make_matcher()
matches = matcher(d)
matches = matches.set_index('index')

In [None]:
# matches[['title', 'assigned_title', 'code', 'onet_soc_code', 'occupationalCategory']].head(20).loc[352]

In [None]:
# Baseline pipeline: word-embedding features (GloVe file named in the path below)
# feeding a class-balanced multinomial logistic regression.
glove_vectorizer = WordEmbeddingVectorizer('../glove-models/glove-va-100.txt')
glove_classifier = LogisticRegression(
    C=5.,
    solver='newton-cg',
    class_weight='balanced',
    multi_class="multinomial",
    n_jobs=-1,
)
model = Pipeline([
    ('glove_100_va', glove_vectorizer),
    ('lr', glove_classifier),
])

model.fit(X_train, y_train)

In [5]:
class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    """Logistic regression with the project's ``BubbleUpMixin`` mixed in.

    Adds no behaviour of its own. The mixin is listed first, so its methods
    take precedence over ``LogisticRegression``'s in the method resolution
    order.
    """

# Pre-embedded features (model '../ss-models/va-ss-100', cache_dir 'va_embed_cache')
# feeding the bubble-up logistic regression.
ss_vectorizer = PreEmbeddedVectorizer('../ss-models/va-ss-100', cache_dir='va_embed_cache')
ss_classifier = BubbleUpLogisticRegression(
    C=5.,
    solver='newton-cg',
    class_weight='balanced',
    multi_class="multinomial",
    n_jobs=-1,
)
# The pipeline step is whatever set_bubbles() returns, exactly as before.
ss_classifier = ss_classifier.set_bubbles(soc_n=3, top_x=1)

model = Pipeline([
    ('sentencespace_100_va', ss_vectorizer),
    ('lr', ss_classifier),
])

model.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('sentencespace_100_va',
                 PreEmbeddedVectorizer(cache_dir='va_embed_cache',
                                       chunk_size=1000,
                                       model='../ss-models/va-ss-100')),
                ('lr',
                 BubbleUpLogisticRegression(C=5.0, class_weight='balanced',
                                            dual=False, fit_intercept=True,
                                            intercept_scaling=1, l1_ratio=None,
                                            max_iter=100,
                                            multi_class='multinomial',
                                            n_jobs=-1, penalty='l2',
                                            random_state=None,
                                            solver='newton-cg', tol=0.0001,
                                            verbose=0, warm_start=False))],
         verbose=False)

In [6]:
from validation.data import get_title_lookup
from validation.scoring import bubbleup_score


title_lookup = get_title_lookup('crosswalks')
title_lookup['code'] = get_soc_n(title_lookup.code, 6)

In [7]:
from similarity.metric_lcs import MetricLCS

def string_match_title(title_lookup, title, codes, top_n=5):
    """Return the codes of the lookup titles closest to ``title``.

    Only lookup rows whose ``code`` is in ``codes`` are considered. Titles are
    compared word by word (both sides are split on whitespace) using the
    ``MetricLCS`` distance, smallest distance first.

    Args:
        title_lookup: DataFrame with at least ``title`` and ``code`` columns.
            It is not modified.
        title: the title string to match.
        codes: candidate codes used to filter ``title_lookup``.
        top_n: how many of the closest rows to return (default 5, the value
            that used to be hard-coded). ``None`` returns every candidate.

    Returns:
        A Series of codes ordered by increasing distance. Its index holds the
        row positions within the filtered lookup, not within ``title_lookup``.
    """
    candidates = title_lookup[title_lookup.code.isin(codes)].reset_index(drop=True)

    lcs = MetricLCS()
    # Tokenise the query once instead of once per candidate row.
    title_tokens = title.split()
    distances = [lcs.distance(candidate.split(), title_tokens)
                 for candidate in candidates.title]

    ranked = candidates.assign(distance=distances).sort_values('distance')
    return ranked.code.iloc[:top_n]

string_match_title(title_lookup, 'chief officer', [112011, 111011])

1     111011
2     111011
18    111011
50    111011
17    111011
Name: code, dtype: int64

In [None]:
# Run the title matcher for every posting, restricted to that posting's entry
# in `preds`. The loop variable is deliberately not named `preds`, so it does
# not shadow the outer list being iterated.
[string_match_title(title_lookup, posting_title, candidate_codes)
 for posting_title, candidate_codes in zip(va_df.title, preds)]

In [2]:
def get_soc_n_str(x, n=3):
    """Return the first ``n`` characters of a code string with separators removed.

    Surrounding whitespace is stripped, everything from the first '.' onwards
    is dropped and '-' separators are removed before truncating, e.g.
    ``'11-1011.00' -> '111'`` for the default ``n=3``.
    """
    return ''.join(x.strip().split('.')[0].split('-'))[:n]


def get_all_possibilities(y, n=3):
    """Return the set of ``n``-character prefixes for the code strings in ``y``.

    Non-string entries (e.g. NaN or None for a missing code) are skipped
    instead of raising ``AttributeError`` on ``.strip()``.
    """
    return {get_soc_n_str(code, n) for code in y if isinstance(code, str)}


# Acceptable codes per posting: its O*NET code plus every comma-separated entry
# of occupationalCategory, reduced to a set by get_all_possibilities.
# The two columns are zipped directly (no per-row Series from iterrows), and a
# missing value in either column contributes nothing instead of raising.
y_possibilities = [
    get_all_possibilities(
        ([onet_code] if isinstance(onet_code, str) else [])
        + (categories.split(',') if isinstance(categories, str) else [])
    )
    for onet_code, categories in zip(va_df.onet_soc_code, va_df.occupationalCategory)
]

NameError: name 'va_df' is not defined

In [None]:
# One-shot accuracy over every title in the VA dataset: a posting is a hit when
# its prediction set shares at least one entry with its acceptable codes.

preds = [set(predicted) for predicted in model.predict(X_test)]
hits = [not acceptable.isdisjoint(predicted)
        for acceptable, predicted in zip(y_possibilities, preds)]
hits

In [10]:
h = pd.Series(hits)
h.sum() / h.shape[0]

NameError: name 'hits' is not defined

In [None]:
title_lookup

In [None]:
idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

In [21]:
# GloVe vectors on VA

idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

0.20506294803666322

In [75]:
idx = X_test.notna()

bubbleup_score(y_train, X_test[idx], y_test[idx], model)

0.43231615726227796

In [74]:
yidx = (matches[idx].onet_soc_code != '')
y_test_va = get_soc_n(matches[idx][yidx].onet_soc_code, 6)

bubbleup_score(y_train, X_test[idx][yidx], y_test_va, model)

0.39304142709715467

In [77]:
# VA!
# X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', SOC_LEVEL)

# NOTE(review): `sample` and the probability matrix expected in `preds` are only
# defined by the commented-out lines below, and `get_soc_n_preds` is not defined
# in any visible cell (it may arrive through a wildcard import — confirm).
# As written, this cell relies on kernel state left over from an earlier run.
# df = pd.DataFrame({'X': X_test, 'y': y_test})
# sample = df[df.X.notna()].sample(50000)

# preds = model.predict_proba(sample.X)

# Label the columns with the distinct training labels (np.unique returns them sorted).
# NOTE(review): this assumes the columns of `preds` follow that same order — confirm.
labels = np.unique(y_train)
df = pd.DataFrame(preds)
df.columns = labels

# Compare get_soc_n_preds(df, 3) (presumably one 3-digit prediction per row — confirm)
# with the first three characters of each true code.
accuracy_score(get_soc_n_preds(df, 3).values, sample.y.astype(str).map(lambda s: s[0:3]))

# preds = get_top_soc_n_preds(df, 3, 1)
# istop = [y in preds[i] for i,y in enumerate(sample.y.astype(str).map(lambda s: s[0:3]))]
# np.array(istop).mean()    

0.2956

In [13]:
# UK - 6/3
df = pd.DataFrame(preds[0])
df.columns = labels
# np.save('ss_models/sentencespace_100_uk/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.41385476727667753

In [None]:
# US - 6/3
df = pd.DataFrame(preds[0])
df.columns = labels
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.48249329065506391

In [None]:
# India - predicting at 6, aggregating to 3
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.43210204120311579

In [9]:
# India - Sentencespace 100
[get_accuracy(p, y_test) for p in preds]

[0.40573887004150638]

In [16]:
# US - Sentencespace 100
[get_accuracy(p, y_test) for p in preds]

[0.42613479319168668]

In [139]:
# India
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [94]:
# UK 
# NOTE(review): the output recorded for this cell is identical to the one recorded
# for the "# India" cell directly above, so one of the two labels is presumably
# stale — re-run to confirm.
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [None]:
# US
[get_accuracy(p, y_test) for p in preds]

[0.30347637686457379, 0.36610287365032945, 0.025223570530595506]

In [None]:
# OLD - SOC2?
[accuracy_score(p, y_test) for p in preds]

[0.49040702886856735, 0.58729304883151034, 0.57982188751419517]

In [46]:
# Transpose so that each entry of `preds` becomes a column (0, 1, ...), then
# attach the true labels as column `y`.
p = pd.DataFrame(preds).T
p = p.assign(y=y_test.values)

# Rows on which columns 0 and 1 disagree.
differ = p.loc[p[0].ne(p[1])]

In [None]:
differ[differ[0] == differ['y']].y.value_counts()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped the
# per-class precision and recall trade places and support counts predictions.
print(classification_report(y_test, preds[0]))

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped the
# per-class precision and recall trade places and support counts predictions.
print(classification_report(y_test, preds[1]))

# Confusion Matrices

In [57]:
def print_confusion_matrices(models, preds, y, SOC_LEVEL):
    """Write one confusion-matrix CSV per model into ``confusion-matrices/``.

    Despite the name, nothing is printed: each matrix is saved as
    ``confusion-matrices/<step names joined by '-'>.csv``. The directory is
    created if it does not exist yet.

    Args:
        models: objects exposing ``named_steps`` (e.g. sklearn Pipelines);
            only the step names are used, to build the file name.
        preds: one sequence of predicted labels per model, in the same order
            as ``models``.
        y: the true labels every prediction sequence is compared against.
        SOC_LEVEL: level passed to ``get_dictionary`` and used to select the
            ``desc_soc<level>`` column that names the categories.
    """
    import os  # local import: keeps this cell self-contained

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]

    # First dictionary row of every `soc` group: supplies both the label order
    # (un.soc) and the matching human-readable category names.
    un = dot_dict.groupby('soc').apply(lambda group: group.head(1))
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]

    # to_csv does not create missing directories.
    os.makedirs('confusion-matrices', exist_ok=True)

    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword in current scikit-learn releases.
        matrix = confusion_matrix(y, p, labels=un.soc)
        df = pd.DataFrame(matrix,
                          index=category_names,
                          columns=category_names)
        filename = 'confusion-matrices/{}.csv'.format(name)
        # NOTE(review): index=False drops the row (true-class) names that were
        # just set as the index; kept as-is to preserve the file layout.
        df.to_csv(filename, index=False)

In [58]:
print_confusion_matrices([model], [preds], y_train, 3)

In [60]:
accuracy_score(preds, y_train)

0.38563508532846036

0.38563508532846036