In [None]:
! pip install --quiet fuzzywuzzy
! pip install --quiet nltk
! pip install --quiet diskcache
! pip install --quiet python-Levenshtein
! pip install --quiet lightgbm
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [None]:
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

In [3]:
%load_ext autoreload 
%autoreload 2

import pandas as pd
import numpy as np
import re
import attr
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import SVC
from lightgbm import LGBMClassifier
from scipy.sparse import vstack 

from validation.data import indeed_test_data, dot_train_data, get_soc_n
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from validation.dot_data import LemmaTokenizer, get_dictionary
from classification.embedding import PreEmbeddedVectorizer

pd.set_option('max_colwidth',50)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

In [4]:
# Notebook-wide configuration.
# SAMPLE_SIZE is forwarded to indeed_test_data (and to the embedding vectorizers
# in the commented-out pipelines); presumably the number of rows to sample —
# TODO confirm against validation.data.
SAMPLE_SIZE = 500000
# SOC_LEVEL is passed to dot_train_data / indeed_test_data; presumably the number
# of leading SOC-code digits used as the class label — TODO confirm.
SOC_LEVEL = 6

In [5]:
# Training split comes from dot_train_data; the evaluation split is read from the
# CSV by indeed_test_data. Both are requested at the same SOC_LEVEL.
# `ids` is returned with the test split but is not used by the cells shown here.
X_train, y_train = dot_train_data(SOC_LEVEL)
X_test, y_test, ids = indeed_test_data('data/us/everything.csv', SAMPLE_SIZE, SOC_LEVEL)

In [17]:
# Candidate classifiers to compare. Each entry is a scikit-learn Pipeline
# (vectorizer -> classifier) and must expose predict_proba, because
# Predictor.fn calls it on the fitted pipeline. Only the TF-IDF +
# logistic-regression pipeline is active; the commented-out entries are
# embedding-based alternatives kept for reference.
models = [
    Pipeline([('tfidf', TfidfVectorizer()),
              ('lr', LogisticRegression(C=5., solver='newton-cg', multi_class="multinomial", n_jobs=-1))]), 
    # Pipeline([('sentencespace_100_us', PreEmbeddedVectorizer('./ss_models/sentencespace_100_us/embeds.txt', './ss_models/sentencespace_100_us/model', SAMPLE_SIZE, 100)),
    #          ('lr', LogisticRegression(C=5., solver='newton-cg', multi_class="multinomial", n_jobs=-1))])
    # Pipeline([('embed', PreEmbeddedVectorizer('./ss_models/sentencespace_us/embeds_b.txt', './ss_models/sentencespace_us/model', SAMPLE_SIZE, 100)),
    #          ('lr', LogisticRegression(C=5., solver='newton-cg', multi_class="multinomial", n_jobs=-1))]),
#     Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us.txt', '../ss_models/sentencespace', SAMPLE_SIZE)),
#              ('knn', KNeighborsClassifier(7))]),
    # Pipeline([('embed', PreEmbeddedVectorizer('../ss_embeds/ss_100_us_b.txt', '../ss_models/sentencespace_us', SAMPLE_SIZE, 100)),
             # ('svc', SVC(C=20., probability=True))]),
]

In [18]:
@attr.s
class Predictor:
    """Bundle one train/test split so any model can be fitted and scored in a single call."""
    X_train = attr.ib()
    y_train = attr.ib()
    X_test = attr.ib()

    def fn(self, m):
        """Fit `m` on the stored training data and return its `predict_proba` output for `X_test`."""
        fitted = m.fit(self.X_train, self.y_train)
        return fitted.predict_proba(self.X_test)

In [19]:
p = Predictor(X_train, y_train, X_test)

# Import retained for compatibility; no process pool is used by this cell.
from concurrent.futures import ProcessPoolExecutor as Pool

# Fit the models one after another in this process (the estimators already
# parallelise internally via n_jobs=-1). The results are materialised into a
# list straight away: a bare map() iterator is exhausted after a single pass,
# so any cell that iterated over `preds` would silently leave it empty for
# every cell that followed.
preds = [p.fn(m) for m in models]

In [20]:
# Force evaluation of every model's predictions and keep them as a list.
preds = list(preds)

In [21]:
# Distinct training labels in sorted order (np.unique sorts its output).
labels = np.unique(y_train)

def get_accuracy(probs, y_test):
    """Accuracy of the highest-probability class per row of `probs` against `y_test`.

    Assumes column i of `probs` corresponds to `labels[i]` — TODO confirm that
    the classifier's class order matches `np.unique(y_train)`.
    """
    best_columns = np.argmax(probs, axis=1)
    predicted = [labels[col] for col in best_columns]
    return accuracy_score(predicted, y_test)

In [22]:
def get_soc_n(df, n):
    """Aggregate per-code scores to their first `n` characters and pick the best group.

    Note: this definition replaces the `get_soc_n` imported from
    `validation.data` at the top of the notebook.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per code; the column labels are the
        codes (converted with `str` before truncation) and the values are
        scores that can be summed.
    n : int
        Number of leading characters of each code to keep.

    Returns
    -------
    pandas.Series
        For every row of `df`, the `n`-character prefix whose summed score is
        largest.
    """
    return (df.T
            .reset_index()
            # Tag every code (now one per row) with its n-character prefix.
            .pipe(lambda t: t.assign(soc=t['index'].map(lambda i: str(i)[0:n])))
            .set_index('soc')
            # `axis` must be a keyword: pandas >= 2.0 rejects a positional axis here.
            .drop('index', axis=1)
            .groupby('soc')
            .sum().T
            .idxmax(axis=1))

In [24]:
# US - 6/3 TFIDF

# Label the first model's probability columns with their classes, then score
# the predictions after collapsing both sides to 3-character codes.
df = pd.DataFrame(preds[0], columns=labels)
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(
    get_soc_n(df, 3).values,
    y_test.astype(str).map(lambda s: s[:3]),
)

0.23268275720641601

In [12]:
# US - 6/3
# NOTE(review): the code here is identical to the "US - 6/3 TFIDF" cell, yet the
# recorded output differs and the execution count (12) is lower — this looks like
# a stale run against a different `preds`; confirm which model produced it.
df = pd.DataFrame(preds[0])
df.columns = labels
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.48249329065506391

In [None]:
# India - predicting at 6, aggregating to 3
# NOTE(review): relies on `df` left over from an earlier cell (it is not rebuilt
# here), so the result depends on which `preds` was last loaded into `df`.
# np.save('ss_models/sentencespace_100_india/predictions-63', get_soc_n(df, 3).values)
accuracy_score(get_soc_n(df, 3).values, y_test.astype(str).map(lambda s: s[0:3]))

0.43210204120311579

In [9]:
# India - Sentencespace 100
# One accuracy value per entry of `preds`, scored on the un-aggregated labels.
# NOTE(review): execution count (9) is out of sequence with the surrounding cells;
# the recorded output presumably comes from an earlier configuration — confirm.
[get_accuracy(p, y_test) for p in preds]

[0.40573887004150638]

In [16]:
# US - Sentencespace 100
# One accuracy value per entry of `preds`, scored on the un-aggregated labels.
[get_accuracy(p, y_test) for p in preds]

[0.42613479319168668]

In [139]:
# India
# One accuracy value per entry of `preds` (three models at the time of this run).
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [94]:
# UK
# NOTE(review): the recorded output is identical, digit for digit, to the
# "India" cell — one of the two was probably run against the wrong data or is
# stale; confirm before quoting either number.
[get_accuracy(p, y_test) for p in preds]

[0.23045040528118993, 0.02092420823932481, 0.023130274922704103]

In [None]:
# US
# One accuracy value per entry of `preds` (three models at the time of this run).
[get_accuracy(p, y_test) for p in preds]

[0.30347637686457379, 0.36610287365032945, 0.025223570530595506]

In [None]:
# OLD - SOC2?
# NOTE(review): this passes each entry of `preds` straight to accuracy_score,
# i.e. it expects predicted labels. The current Predictor.fn returns
# predict_proba output, so this cell presumably dates from an earlier version
# of the notebook — confirm before re-running.
[accuracy_score(p, y_test) for p in preds]

[0.49040702886856735, 0.58729304883151034, 0.57982188751419517]

In [46]:
# Side-by-side table: one column per entry of `preds` plus the true label `y`.
# It gets its own name instead of reusing `p`, which holds the Predictor
# instance that the prediction cell needs when it is re-run.
# NOTE(review): assumes every entry of `preds` is a 1-D array of predicted
# labels (not predict_proba output) — TODO confirm.
pred_table = pd.DataFrame(preds).T.assign(y = y_test.values)

# Rows on which the first two models disagree.
differ = pred_table[pred_table[0] != pred_table[1]]

In [None]:
# Among the disagreements, count per true label how often model 0 was the correct one.
differ.loc[differ[0] == differ['y'], 'y'].value_counts()

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported under each other's names.
# NOTE(review): assumes preds[0] holds predicted labels, not probabilities — confirm.
print(classification_report(y_test, preds[0]))

In [None]:
# classification_report expects (y_true, y_pred); with the arguments swapped,
# precision and recall are reported under each other's names.
# NOTE(review): assumes preds[1] holds predicted labels, not probabilities — confirm.
print(classification_report(y_test, preds[1]))

# Confusion Matrices

In [None]:
def print_confusion_matrices(models, preds, SOC_LEVEL):
    """Write one confusion-matrix CSV per model under confusion-matrices/soc-<SOC_LEVEL>/.

    Parameters
    ----------
    models : list of sklearn Pipeline
        Only used to derive file names: the step names of each pipeline joined
        with '-'.
    preds : iterable
        One array of predicted labels per model, in the same order as `models`.
    SOC_LEVEL : int
        Selects the dictionary (via `get_dictionary`) and the description
        column `desc_soc<SOC_LEVEL>` used to label rows and columns.

    Notes
    -----
    The true labels are read from the notebook-level `y_test`. The CSV is
    written with `index=False`, so row labels are not stored in the file.
    """
    import os

    dot_dict = get_dictionary('', SOC_LEVEL)
    model_names = ['-'.join(m.named_steps.keys()) for m in models]
    # One dictionary row per distinct code; supplies both the label order and the names.
    un = dot_dict.groupby('soc').apply(lambda df: df.head(1))
    category_names = un['desc_soc{}'.format(SOC_LEVEL)]
    # Create the target directory up front so to_csv cannot fail on a missing folder.
    os.makedirs('confusion-matrices/soc-{}'.format(SOC_LEVEL), exist_ok=True)
    for name, p in zip(model_names, preds):
        # `labels` must be passed by keyword: current scikit-learn makes it keyword-only.
        df = pd.DataFrame(confusion_matrix(y_test, p, labels=un.soc),
                          index=category_names,
                          columns=category_names)
        filename = 'confusion-matrices/soc-{}/{}.csv'.format(SOC_LEVEL, name)
        df.to_csv(filename, index=False)

In [None]:
# NOTE(review): level 3 is requested here while SOC_LEVEL (used to build
# `y_test` and `preds`) is 6 — confirm that `preds` holds 3-digit labels
# before trusting the matrices written by this call.
print_confusion_matrices(models, preds, 3)

# Making Predictions

In [11]:
def make_predictions_df(df, preds, soc_level=None):
    """Append predicted codes and their descriptions to `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input rows; its `content` column is renamed to `description` in the
        result. Rows are matched to `preds` by position, so `df` is expected to
        carry a default 0..n-1 index.
    preds : array-like
        One predicted code per row of `df`.
    soc_level : int, optional
        Level used for the output column name `predicted_soc<level>`, the
        dictionary lookup and the description column `desc_soc<level>`.
        Defaults to the notebook-wide `SOC_LEVEL`.

    Returns
    -------
    pandas.DataFrame
        `df` with `predicted_soc<level>` and `desc_soc<level>` columns appended.
    """
    if soc_level is None:
        soc_level = SOC_LEVEL
    key = 'predicted_soc{}'.format(soc_level)
    desc = 'desc_soc{}'.format(soc_level)
    dot_dict = get_dictionary('', soc_level)
    # One description per code: the first one listed for that code in the dictionary.
    dd = dot_dict.groupby('soc').first()[desc].reset_index()
    # `axis` is passed by keyword below: pandas >= 2.0 rejects it positionally
    # in both DataFrame.drop and pd.concat.
    found = (pd.DataFrame({key: preds})
             .merge(dd, how='left', left_on=key, right_on='soc')
             .drop('soc', axis=1))
    return (pd.concat([df, found], axis=1)
            .rename(columns={'content': 'description'}))

def print_predictions(model, infile, outpath, SOC_LEVEL):
    """Train `model` at `SOC_LEVEL`, predict every row of `infile`, and write a CSV.

    The input CSV must have a `content` column. The result is written to
    `<outpath>/us-soc<SOC_LEVEL>-predictions.csv`.
    """
    X_train, y_train = dot_train_data(SOC_LEVEL)
    df = pd.read_csv(infile)
    all_preds = model.fit(X_train, y_train).predict(df.content)
    filename = '{}/us-soc{}-predictions.csv'.format(outpath, SOC_LEVEL)
    # Pass the level through so the column names and descriptions match the
    # level the model was trained at, not the notebook-wide default.
    make_predictions_df(df, all_preds, SOC_LEVEL).to_csv(filename, index=False)

In [216]:
# Reload a confusion matrix written earlier by print_confusion_matrices
# (saved without row labels). This rebinds `df`, replacing the probability
# table used by the accuracy cells.
df = pd.read_csv('confusion-matrices/soc-3/embed-lr.csv')

In [217]:
# NOTE(review): `micro` is not defined in any cell shown in this notebook —
# presumably it comes from the star import or from a removed cell; confirm its
# source and what the two returned numbers mean.
micro(df)

(0.41054341938978156, 0.40667660208643813)

In [218]:
# NOTE(review): `macro` is not defined in any cell shown in this notebook —
# presumably it comes from the star import or from a removed cell; confirm.
macro(df)

(0.19705964432013226, 0.19270635975295053)

In [225]:
# Same helper with 'weighted' as second argument; presumably a weighted
# average — TODO confirm against the definition of `macro`, which is not shown
# in this notebook.
macro(df, 'weighted')

(0.46063510164916216, 0.40667660208643819)