In [None]:
! pip install --quiet seaborn toolz fuzzywuzzy
! pip install --quiet diskcache python-Levenshtein lightgbm lime
! pip install --quiet adabound
! pip install -e 'git://github.com/nandanrao/embed-software.git#egg=embed_software'

In [39]:
! pip install --quiet statsmodels

In [1]:
%load_ext autoreload
%autoreload 2

import torch.nn as nn
import torch
import torch.optim as optim

import pandas as pd
import numpy as np
import seaborn as sns

from validation.data import dot_train_data, get_soc_n, get_dictionary, indeed_test_data, virginia_test_data
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from classification.embedding import PreEmbeddedVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('max_colwidth',50)
pd.set_option('display.width', 700)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

In [2]:
SOC_LEVEL = 3

In [3]:
X_train, y_train = dot_train_data(SOC_LEVEL, include_tasks=False)

In [4]:
from functools import reduce

def make_taskgroup(dx, dy):
    return (pd.DataFrame({'soc': dy, 'content': dx})
            .groupby('soc')
            .apply(lambda df: reduce(lambda a,b: a + b, df.content.sample(frac=0.33)))
            .reset_index()
            .rename(columns= {0: 'content'}))

dx, dy = dot_train_data(6, include_tasks=True, include_dot=False)        
tasks = pd.concat([make_taskgroup(dx, dy) for _ in range(6)])

tasks['soc'] = get_soc_n(tasks.soc.map(str), 3)

y_train, X_train = pd.concat([y_train, tasks.soc]).reset_index(drop=True), pd.concat([X_train, tasks.content]).reset_index(drop=True)

In [5]:
not_production = y_train != 519

X_train, y_train = X_train[not_production], y_train[not_production]

In [None]:
X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', 3)

In [None]:
def _embed(embedding, d, sentences):
    if sentences == True:
        fn = embedding.embed_paragraph
    else:
        fn = embedding.embed_doc

    doc = fn(d).T.reshape(1, 100, -1)
    return torch.from_numpy(doc).float()

def load_source(embedding, X_train, y_train, sentences):
    for d,y in zip(X_train, y_train):
        doc = _embed(embedding, d, sentences)
        label = torch.tensor([y]).long()
        yield doc, label

def load_target(embedding, docs, sentences):
    for d in docs:
        yield _embed(embedding, d, sentences)

In [None]:
from classification.embedding import Embedding

embedding = Embedding('../glove-models/glove-va-100.txt', sep=' ')

In [None]:
label_lookup = {v:k for k,v in pd.Series(y_train.unique()).to_dict().items()}
y_train_idx = [label_lookup[y] for y in y_train]

In [None]:
docs, labels = zip(*list(load_source(embedding, X_train, y_train_idx, sentences = False)))

In [None]:
idx = np.random.choice(X_test.index, 50000, replace=False)
idx = [i for i in idx if X_test[i] is not None]

target = list(load_target(embedding, X_test[idx], sentences = False))

In [None]:
# w = 1 / pd.Series(np.array([l.to(device='cpu').numpy() for l in labels]).reshape(-1)).value_counts().reset_index().sort_values('index')[0].values

def make_weights(y_train):
    v = (1 / y_train.value_counts())
    v = (v / v.sum())*v.shape[0]
    w_df = v.reset_index().rename(columns={'soc': 'count', 'index': 'soc'})
    w = pd.DataFrame(y_train).merge(w_df, how='left')['count'].values
    return torch.from_numpy(w).float()

In [None]:
weights = make_weights(y_train)

In [None]:
import random
from toolz import curry
from time import perf_counter
from math import ceil

class Classifier(nn.Module):
    def __init__(self, net, opt, device, criterion = None):
        super().__init__()
        self.device = device
        self.net = net.to(device=device)
        self.opt = opt(net)
        self.criterion = criterion
        self.net.register_backward_hook(printgradnorm)

    def forward(self, X):
        return self.net(X).view(-1)

    def evaluate(self, source, target, label, targets=None):
        out = self.__call__(source)
        loss = self.criterion(out.reshape(1, -1), label)
        return loss


class Discriminator(Classifier):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.dummy_source = torch.tensor([1.]).to(device=self.device)
        self.dummy_target = torch.tensor([0.]).to(device=self.device)
        
    def evaluate(self, source, target, label):
        guess_s = self.__call__(source)
        guess_t = self.__call__(target)
        loss = self.criterion(guess_s, self.dummy_source)
        loss += self.criterion(guess_t, self.dummy_target)
        return loss


class MockOpt():
    def __init__(self):
        pass

    def zero_grad(self):
        return

    def step(self):
        return


class Distancer(Classifier):
    def __init__(self, alpha, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.alpha = alpha
        self.opt = MockOpt()

    def evaluate(self, source, target, label, targets):
        M = torch.cdist(torch.stack([target]), torch.stack(targets))
        return self.alpha * M.sum() / target.sum() 
        

class PlatonicNet():
    def __init__(self, embedder, classifier, discriminator, batch_size=64, n_epochs=5, grad_norm_clip=0.25, discriminator_mix= -1.0):
        self.discriminator = discriminator
        self.classifier = classifier
        self.embedder = embedder
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.grad_norm_clip = grad_norm_clip
        self.disc_mix = discriminator_mix

    def load_data(self, docs, labels, weights, target):
        self.docs, self.labels, self.weights, self.target = docs, labels, weights, target.copy()

    def batch(self, size):
        random.shuffle(self.target)

        dat = list(zip(self.docs, self.labels, self.weights, self.target))
        random.shuffle(dat)

        out = []
        while dat:
            head,dat = dat[:size], dat[size:]
            out.append(head)
        return out

    
    def epoch(self, embedder):
        epoch_disc_loss = 0
        epoch_class_loss = 0
        epoch_start = perf_counter()

        for i,batch in enumerate(self.batch(self.batch_size)):
            batch_disc_loss = 0
            batch_class_loss = 0

            # run for each net, classifier and discriminator
            # (self.discriminator, self.disc_mix)
            for net,sign in [(self.classifier, 1.)]:

                # due to pytorch updating, 
                # run twice, once for embedder, once for the other model
                for updating_model,sgn in [(embedder, sign), (net, 1.)]:
                    updating_model.opt.zero_grad()
                    loss = 0

                    sources, labels, weights, targets = zip(*batch)                    
                    sources = [embedder(s) for s in sources]
                    targets = [embedder(t) for t in targets]
                    
                    b = zip(sources, labels, weights, targets)
                    for source, label, weight, target in b:
                        # l = net.evaluate(embedder(source), embedder(target), label)
                        l = net.evaluate(source, target, label, targets)
                        loss += l*weight

                    # Flip the loss for embedding/discriminator
                    loss *= sgn

                    if torch.isnan(loss):
                        raise Exception('LOSS/EMBEDDING IS NAN')

                    # Update loss records for printing
                    if updating_model == self.discriminator:
                        batch_disc_loss += loss
                        epoch_disc_loss += loss
                    elif updating_model == self.classifier:
                        batch_class_loss += loss
                        epoch_class_loss += loss

                    # optimize
                    loss.backward()
                    torch.nn.utils.clip_grad_value_(updating_model.net.parameters(), self.grad_norm_clip)
                    updating_model.opt.step()
                    
            # if i % 100 == 0:
                # print(f'Batch class/disc loss: {batch_class_loss} ::: {batch_disc_loss}')
        epoch_time = round((perf_counter() - epoch_start)/60)
        print(f'----------- EPOCH --------------\nEpoch finished in {epoch_time} minutes. class/disc loss: {epoch_class_loss} ::: {epoch_disc_loss}')        

    def train(self):
        for epoch in range(self.n_epochs):
            self.epoch(self.embedder)            


def printgradnorm(self, grad_input, grad_output):
    pass
    # if grad_input[0].norm() > 200000.:
        # print('grad_input norm:', grad_input[0].norm())

class GatedNet(torch.nn.Module):
    def __init__(self, embed_size, layers):
        super().__init__()
        self.conver = nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0, bias=False),
            nn.Sigmoid()
        )

        self.conver.register_backward_hook(printgradnorm)

    def forward(self, x):        
        convs = self.conver(x)
        out = torch.matmul(x, torch.t(convs.max(1).values))
        return out / torch.norm(out)  

class ParallelFilters(nn.Module):
    def __init__(self, filters):
        super().__init__()
        for i,net in enumerate(filters):
            self.add_module(f'filter_{i}', net)

    def forward(self, x):
        return torch.cat([net(x) for net in self.children()], 1)    


class NormedSum(nn.Module):
    def forward(self, x):
        y = x.sum(2).reshape(1,-1,1)
        return y / torch.norm(y)

def _embedder(embed_size, layers, normed_sum = False, dropout = 0.5):
    filters = [
        nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=out_channels, kernel_size=kernel_size, groups=1, padding=kernel_size - 1),
            # nn.Sigmoid(),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1),
            nn.Dropout(p=dropout))
        for kernel_size,out_channels in layers]

    if normed_sum:
        filters = [NormedSum()] + filters

    net = nn.Sequential(
        ParallelFilters(filters)
    )

    net.register_backward_hook(printgradnorm)
    return net

def _embedder_single(embed_size, out_channels):
    net = nn.Sequential(
        nn.Conv1d(in_channels=embed_size, out_channels=out_channels, kernel_size=1, groups=1, padding=0),
        nn.ReLU(),
        nn.AdaptiveMaxPool1d(output_size=1),
        nn.Dropout(p=0.4)
    )

    net.register_backward_hook(printgradnorm)
    return net    

In [None]:
from adabound import AdaBound

@curry
def adam_opt(lr, net):
    return optim.Adam(net.parameters(), lr=lr, weight_decay=1.0)

@curry
def ab_opt(lr, wd, net):
    return AdaBound(net.parameters(), lr=lr, final_lr=0.01, weight_decay=wd)


def get_size(filters, normed_sum):
    s = np.sum([f[1] for f in filters])
    if normed_sum:
        s += 100
    return s

device = torch.device('cuda')
n_classes = y_train.unique().shape[0]
filters = [(1, 50)]
normed_sum = True

final_layer_size = get_size(filters, normed_sum)

print(final_layer_size)

embedder = Classifier(_embedder(100, filters, normed_sum=normed_sum, dropout=0.5), 
                      ab_opt(0.00002, 0.0),
                      device)

classifier = Classifier(nn.Sequential(nn.Linear(final_layer_size, n_classes)),
                        ab_opt(0.0001, 3.0), 
                        device,
                        nn.CrossEntropyLoss())

discriminator = Discriminator(nn.Sequential(nn.Linear(final_layer_size, 1)), 
                              ab_opt(0.0001, 1.0), 
                              device, 
                              nn.BCEWithLogitsLoss())

# discriminator = Distancer(1, 
#                           nn.Linear(final_layer_size, 1), 
#                           ab_opt(0.0001, 1.0), 
#                           device)

In [None]:
move_to_device = lambda x: [i.to(device=device) for i in x]

docs = move_to_device(docs)
labels = move_to_device(labels)
target = move_to_device(target)
weights = move_to_device(weights)

In [None]:
model = PlatonicNet(embedder, classifier, discriminator, n_epochs=25, grad_norm_clip=0.1, discriminator_mix= -0.05)
model.load_data(docs, labels, weights, target)

In [None]:
model.train()

In [None]:
torch.save(model, 'model-09-11-a.pt')

In [None]:
i = 90

def get_spread(d):
    vals = model.embedder.net.conver(d).max(1).values.detach().numpy()
    return vals.max() - vals.min()
    

np.mean([get_spread(d) for d in docs[:500]]), np.mean([get_spread(d) for d in target[:500]])

In [None]:
i = 44

idx = np.where(model.embedder.net.conver(target[i]).max(1).values.detach().numpy() < .4)[1]
np.array(df.content.iloc[i].split('\t'))[idx]

In [62]:
from validation.scoring import bubbleup_score
from validation.scoring import BubbleUpMixin

class BubbleUpLogisticRegression(BubbleUpMixin, LogisticRegression):
    pass

def simple_embed(doc):
    X = doc.sum(2).reshape(-1)
    return X / torch.norm(X)

def ss_embed(doc):
    d = embedding.embed_doc(doc).sum(0)
    return d / np.linalg.norm(d)

def _listify(a):
    if not hasattr(a, '__len__'):
        return [a]
    return a

def _is_hit(ys, preds):
    ys, preds = _listify(ys), _listify(preds)
    return len(set(ys) & set(preds)) > 0

def multi_score(y_test, preds):
    hits = [_is_hit(y, p) for y,p in zip(y_test, preds)] 
    return np.sum(hits) / len(hits)

get_soc_n_str = lambda x: ''.join(x.strip().split('.')[0].split('-'))[:3]

get_all_possibilities = lambda y: set([get_soc_n_str(i) for i in y])

y_possibilities = [[r.onet_soc_code] + r.occupationalCategory.split(',') for i,r in va_df.loc[idx, :].iterrows()]
y_possibilities = [get_all_possibilities(y) for y in y_possibilities]

In [69]:
Xe_train = [simple_embed(d).detach().cpu().numpy() for d in docs]

clf = BubbleUpLogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial').set_bubbles(soc_n=3, top_x=1)

clf.fit(Xe_train, y_train)
preds = clf.predict(Xe_train)

multi_score(y_train, [[int(p) for p in pred] for pred in preds])

# Just DOT, Soc 3
# 0.567

# Just DOT, no 519
# 0.527

0.5265022137887413

In [70]:
Xe_test = [simple_embed(d).detach().cpu().numpy() for d in target]
preds = clf.predict(Xe_test)
multi_score(y_possibilities, preds)


# Just DOT, Soc 3
# 0.307


# Just DOT, no 519
# 0.312

0.31368

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

confusion_matrix(y_train, preds)[3]

In [82]:
Xp_train = [model.embedder(d).detach().cpu().numpy() for d in docs]

clf = BubbleUpLogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial').set_bubbles(soc_n=3, top_x=1)

clf.fit(Xp_train, y_train)
preds = clf.predict(Xp_train)

multi_score(get_soc_n(y_train.map(str), 3), [[int(p) for p in pred] for pred in preds])

# ------ 0.39 baseline


# 0.5102
# 0.54275
# 0.5583

# --------- adversarial, SOC 6, bubbleup 3
# 0.3785
# 0.3887

# --------- adversarial, SOC 3, only dict

# 0.745 - a
# 0.786 - b

# --------- adversarial, SOC 3, weighted (9-03)
# 0.627 - a
# 0.657 - b

# --------- adversarial, SOC 3, weighted, no 519 (9-03)
# 0.850 - c
# 0.893 - d

# --------- non-adversarial, SOC 3, weighted, no 519, sigmoid
# 0.611 - 0.535

0.43605313092979125

In [83]:
Xp_test = [model.embedder(d).detach().cpu().numpy() for d in target]
preds = clf.predict(Xp_test)
multi_score(y_possibilities, preds)

# ------ 0.07 baseline

# 0.19544 - c
# 0.16636 - b
# 0.19374 - a

# -------- adversarial, SOC 6, bubbleup 3
# 0.14124
# 0.13644

# --------- adversarial, SOC 3, only dict
# 0.05
# 0.05

# -------- adversarial, SOC 3, weighted (9-03)
# 0.1013 - a 
# 0.1084 - b

# --------- adversarial, SOC 3, weighted, no 519 (9-03)
# 0.1597 - c
# 0.1520 - d


# -------- non-adversarial, SOC 3, weighted, no 519, sigmoid
# 0.15 - 0.17

0.08214

In [None]:
def interpret(model, target, X, ind):
    vals, indices = list(model.embedder.net[0].children())[1][0](target[ind]).max(2)
    numpize = lambda t: t.to(device='cpu').detach().numpy().reshape(-1) 
    vals, indices = numpize(vals), numpize(indices)
    _, words = embedding.embed_doc(X[ind], return_words=True)
    
#     vals, indices = zip(*sorted(list(zip(vals, indices)), key = lambda x: x[0], reverse=True))
    w = np.array(words)[list(indices)]
    return pd.DataFrame((w, vals)).T.rename(columns={0: 'word', 1: 'value'})

    

interpret(model, target, X_test[idx].reset_index(drop=True), 32)

In [30]:
%notebook -e platonic-embedding-output-9-02.ipynb

In [108]:
len(preds)

50000

In [103]:
pd.Series(y_possibilities)[target_idx]

341305            {, 291}
68059          {413, 414}
316045              {132}
639904    {119, 292, 291}
405625         {151, 271}
               ...       
430509            {333, }
596229         {412, 414}
123363    {435, 999, 412}
437148            {, 999}
412151         {353, 412}
Length: 50000, dtype: object

In [230]:
SAMPLE_SIZE=100000
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', SAMPLE_SIZE, 6)
X_train, y_train = dot_train_data(6)

In [268]:
Xe_test = [model.embedder(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [model.embedder(d).detach().numpy() for d in load_target(X_train)]

In [269]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [270]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.46860896376066846

In [251]:
Xe_test = [simple_embed(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [simple_embed(d).detach().numpy() for d in load_target(X_train)]

In [252]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')
clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [253]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.4752223066267483