In [18]:
%load_ext autoreload
%autoreload 2

import torch.nn as nn
import torch
import torch.optim as optim

import pandas as pd
import numpy as np
import seaborn as sns

from validation.data import dot_train_data, get_soc_n, get_dictionary, indeed_test_data, virginia_test_data
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from classification.embedding import PreEmbeddedVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier

pd.set_option('max_colwidth',50)
pd.set_option('display.width', 700)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 3

In [3]:
X_train, y_train = dot_train_data(SOC_LEVEL)

In [5]:
from gcsfs import GCSFileSystem

fs = GCSFileSystem(project='labor-market-data')
with fs.open('lmd-classify-dot/data/us/company-everything.csv') as f:
    df = pd.read_csv(f)

df['title'] = df.title.str.lower()

In [None]:
X_test, y_test, va_df = virginia_test_data('../data/va_job_posts.json', 6)

In [None]:
def _embed(embedding, d, sentences):
    if sentences == True:
        fn = embedding.embed_paragraph
    else:
        fn = embedding.embed_doc

    doc = fn(d).T.reshape(1, 100, -1)
    return torch.from_numpy(doc).float()

def load_source(embedding, X_train, y_train, sentences):
    for d,y in zip(X_train, y_train):
        doc = _embed(embedding, d, sentences)
        label = torch.tensor([y]).long()
        yield doc, label

def load_target(embedding, docs, sentences):
    for d in docs:
        yield _embed(embedding, d, sentences)

In [None]:
from classification.embedding import Embedding

embedding = Embedding('../glove-models/glove-va-100.txt', sep=' ')

In [7]:
label_lookup = {v:k for k,v in pd.Series(y_train.unique()).to_dict().items()}
y_train_idx = [label_lookup[y] for y in y_train]

In [8]:
docs, labels = zip(*list(load_source(embedding, X_train, y_train_idx, sentences = False)))

In [9]:
# df = df.sample(n=50000)
# target = list(load_target(embedding, df.content, sentences = False))

idx = np.random.choice(X_test.index, 100000, replace=False)
idx = [i for i in idx if X_test[i] is not None]

target = list(load_target(embedding, X_test[idx], sentences = False))

In [83]:
import random
from toolz import curry
from time import perf_counter
from math import ceil

class Classifier(nn.Module):
    def __init__(self, net, opt, criterion = None):
        super().__init__()
        self.net = net
        self.opt = opt(net)
        self.criterion = criterion
        self.net.register_backward_hook(printgradnorm)

    def forward(self, X):
        return self.net(X).view(-1)

    def evaluate(self, source, target, label):
        out = self.__call__(source)
        loss = self.criterion(out.reshape(1, -1), label)
        return loss


class Discriminator(Classifier):
    def evaluate(self, source, target, label):
        guess_s = self.__call__(source)
        guess_t = self.__call__(target)
        loss = self.criterion(guess_s, torch.tensor([1.]))
        loss += self.criterion(guess_t, torch.tensor([0.]))
        return loss
        

class PlatonicNet():
    def __init__(self, embedder, classifier, discriminator, batch_size=64, n_epochs=5, grad_norm_clip=0.25):
        self.discriminator = discriminator
        self.classifier = classifier
        self.embedder = embedder
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.grad_norm_clip = grad_norm_clip

    def load_data(self, docs, labels, target):
        self.docs, self.labels, self.target = docs, labels, target.copy()

    def batch(self, size):
        random.shuffle(self.target)

        dat = list(zip(self.docs, self.labels, self.target))
        random.shuffle(dat)

        out = []
        while dat:
            head,dat = dat[:size], dat[size:]
            out.append(head)
        return out

    
    def epoch(self, embedder):
        epoch_disc_loss = 0
        epoch_class_loss = 0
        epoch_start = perf_counter()

        for i,batch in enumerate(self.batch(self.batch_size)):
            batch_disc_loss = 0
            batch_class_loss = 0

            # run for each net, classifier and discriminator
            for net,sign in [(self.classifier, 1.)]:

                # due to pytorch updating, 
                # run twice, once for embedder, once for the other model
                for updating_model,sgn in [(embedder, sign), (net, 1.)]:
                    
                    updating_model.opt.zero_grad()
                    loss = 0
                    for source, label, target in batch:
                        loss += net.evaluate(embedder(source), embedder(target), label)
                        if torch.isnan(loss):
                            print(embedder(source))
                            print(loss)
                            raise Exception('LOSS/EMBEDDING IS NAN')

                    loss *= sign
                    
                    if sign < 0:
                        batch_disc_loss += loss
                        epoch_disc_loss += loss
                    else:
                        batch_class_loss += loss
                        epoch_class_loss += loss
                    loss.backward()
                    torch.nn.utils.clip_grad_value_(updating_model.net.parameters(), self.grad_norm_clip)
                    updating_model.opt.step()
                    
            if i % 100 == 0:
                print(f'Batch class/disc loss: {batch_class_loss} ::: {batch_disc_loss}')
        epoch_time = round((perf_counter() - epoch_start)/60)
        print(f'----------- EPOCH --------------\nEpoch finished in {epoch_time} minutes. class/disc loss: {epoch_class_loss} ::: {epoch_disc_loss}')        

    def train(self):
        for epoch in range(self.n_epochs):
            self.epoch(self.embedder)            


def printgradnorm(self, grad_input, grad_output):
    if grad_input[0].norm() > 10.:
        print('grad_input norm:', grad_input[0].norm())

class GatedNet(torch.nn.Module):
    def __init__(self, embed_size, layers):
        super().__init__()
        self.conver = nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0, bias=False),
            nn.Sigmoid()
        )

        self.conver.register_backward_hook(printgradnorm)

    def forward(self, x):        
        convs = self.conver(x)
        out = torch.matmul(x, torch.t(convs.max(1).values))
        return out / torch.norm(out)  

class ParallelFilters(nn.Module):
    def __init__(self, filters):
        super().__init__()
        for i,net in enumerate(filters):
            self.add_module(f'filter_{i}', net)

    def forward(self, x):
        return torch.cat([net(x) for net in self.children()], 1)    


def _embedder(embed_size, layers):
    filters = [
        nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=out_channels, kernel_size=kernel_size, groups=1, padding=kernel_size - 1),
            nn.ReLU(),
            nn.AdaptiveMaxPool1d(output_size=1))
        for kernel_size,out_channels in layers]

    net = nn.Sequential(
        ParallelFilters(filters),
        nn.Dropout(p=0.25)
    )

    net.register_backward_hook(printgradnorm)
    return net    

def _embedder_single(embed_size, out_channels):
    net = nn.Sequential(
        nn.Conv1d(in_channels=embed_size, out_channels=out_channels, kernel_size=1, groups=1, padding=0),
        nn.ReLU(),
        nn.AdaptiveMaxPool1d(output_size=1),
        nn.Dropout(p=0.25)
    )

    net.register_backward_hook(printgradnorm)
    return net    

In [84]:
from adabound import AdaBound

@curry
def adam_opt(lr, net):
    return optim.Adam(net.parameters(), lr=lr, weight_decay=1.0)

@curry
def ab_opt(lr, net):
    return AdaBound(net.parameters(), lr=lr, final_lr=0.01, weight_decay=1.0)

n_classes = y_train.unique().shape[0]

filters = [(1, 50), (2, 50), (3, 50), (4, 50)]
final_layer_size = np.sum([f[1] for f in filters])

print(final_layer_size)

embedder = Classifier(_embedder(100, filters), ab_opt(0.0001))
classifier = Classifier(nn.Sequential(nn.Linear(final_layer_size, n_classes)), ab_opt(0.0001), nn.CrossEntropyLoss())
discriminator = Discriminator(nn.Sequential(nn.Linear(final_layer_size, 1)), ab_opt(0.0001), nn.BCELoss())

200


In [85]:
model = PlatonicNet(embedder, classifier, discriminator, n_epochs=30, grad_norm_clip=0.1)
model.load_data(docs, labels, target)

In [None]:
model.train()

In [None]:
i = 90

def get_spread(d):
    vals = model.embedder.net.conver(d).max(1).values.detach().numpy()
    return vals.max() - vals.min()
    

np.mean([get_spread(d) for d in docs[:500]]), np.mean([get_spread(d) for d in target[:500]])

In [None]:
i = 44

idx = np.where(model.embedder.net.conver(target[i]).max(1).values.detach().numpy() < .4)[1]
np.array(df.content.iloc[i].split('\t'))[idx]

In [41]:
from validation.scoring import bubbleup_score

def simple_embed(doc):
    X = doc.sum(2).reshape(-1)
    return X / torch.norm(X)

def ss_embed(doc):
    d = embedding.embed_doc(doc).sum(0)
    return d / np.linalg.norm(d)

In [42]:
Xe_train = [simple_embed(d).detach().numpy() for d in docs]

In [46]:
clf = LogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xe_train, y_train)
preds = clf.predict(Xe_train)
accuracy_score(preds, y_train)

0.43469912354052465

In [None]:
ss_embed('manager of sales')

In [314]:
np.dot(ss_embed('manager of farm labour'), ss_embed('sales clerk'))

-0.1731868108753034

In [315]:
np.dot(ss_embed('manager of sales personel'), ss_embed('manager of farm labour'))

0.10387624967646984

In [None]:
np.dot(ss_embed('manager of sales personel'), ss_embed('sales clerk'))

0.2131845341814802

In [313]:
np.dot(ss_embed('sales personel'), ss_embed('sales clerk'))

0.3484722905719662

In [None]:
labels = sorted(y_train.unique())

sorted(labels)[3]

In [304]:
from sklearn.metrics import confusion_matrix
import seaborn as sns


confusion_matrix(y_train, preds)[3]

array([  6,   4,  44, 274,  49,  28,  22,   0,   5,  72,   6,  19,  16,
         9,   6,  19,   0,   1,   1,  52,  24,   1,   1,   1,   0,  13,
         4,   1,  38,   8,   0,   0,   0,   0,   4,   0,  13,  13,   5,
         2,   6,   0,   0,   2,   4,   3,   1,  14,  15,   1,   0,   0,
         6,   0,   3,   5,   0,   5,   7,   0,   1,  13,   6,   0,   1,
         4,  25,   0,   0,   6,  10,   0,   4,   3,   1,   1,   1,   3,
         5,   0,   1,   1,   0,   0,   0,   2,  17,   1,   0,   0,   0,
         2,   0,   1,   0,   0])

In [57]:
Xp_train = [model.embedder(d).detach().numpy() for d in docs]

In [58]:
clf = LogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xp_train, y_train)
preds = clf.predict(Xp_train)
accuracy_score(preds, y_train)

0.30146489516553626

In [230]:
SAMPLE_SIZE=100000
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', SAMPLE_SIZE, 6)
X_train, y_train = dot_train_data(6)

In [268]:
Xe_test = [model.embedder(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [model.embedder(d).detach().numpy() for d in load_target(X_train)]

In [269]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [270]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.46860896376066846

In [251]:
Xe_test = [simple_embed(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [simple_embed(d).detach().numpy() for d in load_target(X_train)]

In [252]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')
clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [253]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.4752223066267483