In [2]:
%load_ext autoreload
%autoreload 2

import torch.nn as nn
import torch
import torch.optim as optim

import pandas as pd
import numpy as np
import seaborn as sns

from validation.data import dot_train_data, get_soc_n, get_dictionary, indeed_test_data
from embed_software.preprocess import *
from embed_software.utils import get_embeddings, embed_docs
from classification.embedding import PreEmbeddedVectorizer


pd.set_option('max_colwidth',50)
pd.set_option('display.width', 700)

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [15, 10]

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [40]:
SAMPLE_SIZE = 100000
SOC_LEVEL = 3

In [41]:
X_train, y_train = dot_train_data(SOC_LEVEL)

In [5]:
from gcsfs import GCSFileSystem

fs = GCSFileSystem(project='labor-market-data')
with fs.open('lmd-classify-dot/data/us/company-everything.csv') as f:
    df = pd.read_csv(f)

df['title'] = df.title.str.lower()

In [6]:
def load_source(X_train, y_train):
    for d,y in zip(X_train, y_train):
        doc = embedding.embed_paragraph(d).T.reshape(1, 100, -1)
        doc = torch.from_numpy(doc).float()
        label = torch.tensor([y]).long()
        yield doc, label

def load_target(docs):
    for d in docs:
        doc = embedding.embed_paragraph(d).T.reshape(1, 100, -1)
        doc = torch.from_numpy(doc).float()
        yield doc

In [7]:
class Embedding():
    def __init__(self, path):
        embedding = pd.read_csv(path, sep='\t', header=None)
        keys = embedding.iloc[:,0]
        vals = embedding.iloc[:,1:].values
        self.lookup = {k:v for k,v in zip(keys, vals)}

    def embed_paragraph(self, doc):
        sents = doc.split('\t')
        vecs = [self.embed_sent(sent) for sent in sents]
        vecs = [v for v in vecs if v is not None] # check if sentence is empty
        return np.array(vecs)            

    def embed_sent(self, sent):
        vec = self.embed_doc(sent)
        if len(vec):
            return vec.sum(0) / np.linalg.norm(vec)
        else:
            return None

    def embed_doc(self, doc, return_words = False):
        words = []
        vecs = []
        for word in doc.split():
            try:
                vecs.append(self.lookup[word])
                words.append(word)
            except KeyError:
                pass
        if not return_words: 
            return np.array(vecs)
        return np.array(vecs), words


embedding = Embedding('../indeed-embeds/model.tsv')

In [42]:
label_lookup = {v:k for k,v in pd.Series(y_train.unique()).to_dict().items()}
y_train_idx = [label_lookup[y] for y in y_train]

In [43]:
docs, labels = zip(*list(load_source(X_train, y_train_idx)))

In [10]:
df = df.sample(n=50000)
target = list(load_target(df.content))

In [195]:
import random
from toolz import curry

class Classifier():
    def __init__(self, net, opt, criterion = None):
        self.net = net
        self.opt = opt(net)
        self.criterion = criterion
        self.net.register_backward_hook(printgradnorm)

    def __call__(self, X):
        return self.net(X).view(-1)

    def evaluate(self, source, target, label):
        out = self.__call__(source)
        loss = self.criterion(out.reshape(1, -1), label)
        return loss


class Discriminator(Classifier):
    def evaluate(self, source, target, label):
        guess_s = self.__call__(source)
        guess_t = self.__call__(target)
        loss = self.criterion(guess_s, torch.tensor([1.]))
        loss += self.criterion(guess_t, torch.tensor([-1.]))
        return loss
        

class PlatonicNet():
    def __init__(self, embedder, classifier, discriminator, batch_size=64, n_epochs=5):
        self.discriminator = discriminator
        self.classifier = classifier
        self.embedder = embedder
        self.batch_size = batch_size
        self.n_epochs = n_epochs
        self.grad_norm_clip = 0.025

    def load_data(self, docs, labels, target):
        self.docs, self.labels, self.target = docs, labels, target.copy()

    def batch(self, size):
        random.shuffle(self.target)

        dat = list(zip(self.docs, self.labels, self.target))
        random.shuffle(dat)

        out = []
        while dat:
            head,dat = dat[:size], dat[size:]
            out.append(head)
        return out

    def epoch(self, embedder):
        epoch_disc_loss = 0
        epoch_class_loss = 0

        for i,batch in enumerate(self.batch(self.batch_size)):
            batch_disc_loss = 0
            batch_class_loss = 0

            # run for each net, classifier and discriminator
            for net,sign in [(self.classifier, 1), (self.discriminator, -1)]:

                # due to pytorch updating, 
                # run twice, once for embedder, once for the other model
                for updating_model,sgn in [(embedder, sign), (net, 1)]:
                    
                    updating_model.opt.zero_grad()
                    loss = 0
                    for source, label, target in batch:
                        loss += net.evaluate(embedder(source), embedder(target), label)
                        if torch.isnan(loss):
                            print(embedder(source))
                            raise Exception('LOSS/EMBEDDING IS NAN')

                    loss *= sign
                    
                    if sign < 0:
                        batch_disc_loss += loss
                        epoch_disc_loss += loss
                    else:
                        batch_class_loss += loss
                        epoch_class_loss += loss
                    loss.backward()

                    torch.nn.utils.clip_grad_value_(updating_model.net.parameters(), self.grad_norm_clip)
                    updating_model.opt.step()
                    
            if i % 100 == 0:
                print(f'Batch class/disc loss: {batch_class_loss} ::: {batch_disc_loss}')
        print(f'----------- EPOCH --------------\nEpoch class/disc loss: {epoch_class_loss} ::: {epoch_disc_loss}')        

    def train(self):
        for epoch in range(self.n_epochs):
            self.epoch(self.embedder)            


def printgradnorm(self, grad_input, grad_output):
    if grad_input[0].norm() > 600.:
        print('grad_input norm:', grad_input[0].norm())

class GatedNet(torch.nn.Module):
    def __init__(self, embed_size, layers):
        super().__init__()
        self.conver = nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0, bias=False),
            nn.Sigmoid()
        )

        self.conver.register_backward_hook(printgradnorm)

    def forward(self, x):        
        convs = self.conver(x)
        out = torch.matmul(x, torch.t(convs.max(1).values))
        return out / torch.norm(out)  


def _embedder(embed_size, layers):
    net = nn.Sequential(
        nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0),
        nn.ReLU(),
        nn.AdaptiveMaxPool1d(output_size=1),
        nn.Dropout(p=0.25)
    )

    net.register_backward_hook(printgradnorm)
    return net    

In [196]:
from adabound import AdaBound

@curry
def adam_opt(lr, net):
    return optim.Adam(net.parameters(), lr=lr)

@curry
def ab_opt(lr, net):
    return AdaBound(net.parameters(), lr=lr, final_lr=0.01)

n_classes = y_train.unique().shape[0]
classifier = Classifier(nn.Sequential(nn.Linear(100, n_classes)), ab_opt(0.001), nn.CrossEntropyLoss())

discriminator = Discriminator(nn.Sequential(nn.Linear(100, 1)), ab_opt(0.01), nn.SoftMarginLoss())

embedder = Classifier(GatedNet(100, 50), ab_opt(0.0001))

model = PlatonicNet(embedder, classifier, discriminator, n_epochs=10)

model.load_data(docs, labels, target)

model.train()

Batch class/disc loss: 584.4098510742188 ::: -178.2403564453125


Batch class/disc loss: 552.3291015625 ::: -265.2200927734375


Batch class/disc loss: 522.257080078125 ::: -308.1689453125


Batch class/disc loss: 492.631591796875 ::: -406.66265869140625


Batch class/disc loss: 457.7840881347656 ::: -423.6307678222656


----------- EPOCH --------------
Epoch class/disc loss: 260234.65625 ::: -168984.140625


Batch class/disc loss: 459.88751220703125 ::: -410.251953125


Batch class/disc loss: 441.5174865722656 ::: -478.87567138671875


Batch class/disc loss: 449.98187255859375 ::: -424.98779296875


Batch class/disc loss: 437.3743896484375 ::: -447.0699157714844


Batch class/disc loss: 414.9876403808594 ::: -479.4339904785156


Batch class/disc loss: 447.04302978515625 ::: -426.9431457519531


----------- EPOCH --------------
Epoch class/disc loss: 226395.015625 ::: -232739.625


Batch class/disc loss: 391.5748291015625 ::: -508.047607421875


Batch class/disc loss: 450.5005798339844 ::: -465.57647705078125


Batch class/disc loss: 432.8442077636719 ::: -599.3541870117188


Batch class/disc loss: 435.601318359375 ::: -576.580810546875


Batch class/disc loss: 429.64154052734375 ::: -543.239013671875


----------- EPOCH --------------
Epoch class/disc loss: 216763.84375 ::: -273864.375


Batch class/disc loss: 405.45147705078125 ::: -567.2535400390625


Batch class/disc loss: 392.1707763671875 ::: -557.09619140625


Batch class/disc loss: 427.6141357421875 ::: -523.069091796875


Batch class/disc loss: 395.3511047363281 ::: -670.39599609375


Batch class/disc loss: 421.0580139160156 ::: -615.4805908203125


Batch class/disc loss: 448.6578369140625 ::: -611.7037353515625


----------- EPOCH --------------
Epoch class/disc loss: 211017.796875 ::: -308732.46875


Batch class/disc loss: 431.4724426269531 ::: -656.7446899414062


Batch class/disc loss: 422.68731689453125 ::: -628.1716918945312


Batch class/disc loss: 413.0703125 ::: -620.9329223632812


Batch class/disc loss: 424.67730712890625 ::: -682.7667236328125


Batch class/disc loss: 428.7297058105469 ::: -637.4623413085938


Batch class/disc loss: 410.93511962890625 ::: -717.5020141601562


----------- EPOCH --------------
Epoch class/disc loss: 206739.25 ::: -341525.84375


Batch class/disc loss: 443.2616882324219 ::: -717.1974487304688


Batch class/disc loss: 442.01470947265625 ::: -800.630615234375


Batch class/disc loss: 429.6741638183594 ::: -598.8499145507812


Batch class/disc loss: 406.7196044921875 ::: -776.4276123046875


Batch class/disc loss: 403.8837890625 ::: -759.2978515625


Batch class/disc loss: 446.21392822265625 ::: -717.7222900390625


----------- EPOCH --------------
Epoch class/disc loss: 203315.078125 ::: -371526.9375


Batch class/disc loss: 436.582763671875 ::: -808.15087890625


Batch class/disc loss: 408.59527587890625 ::: -784.536376953125


Batch class/disc loss: 408.20733642578125 ::: -777.6864624023438


Batch class/disc loss: 404.8863525390625 ::: -766.2387084960938


Batch class/disc loss: 435.57501220703125 ::: -703.7081909179688


Batch class/disc loss: 410.7972412109375 ::: -757.63134765625


----------- EPOCH --------------
Epoch class/disc loss: 200458.375 ::: -403072.625


Batch class/disc loss: 410.3885803222656 ::: -842.0288696289062


Batch class/disc loss: 379.1239013671875 ::: -801.5884399414062


Batch class/disc loss: 399.40625 ::: -988.7857055664062


Batch class/disc loss: 435.48345947265625 ::: -871.29052734375


Batch class/disc loss: 399.74017333984375 ::: -856.109130859375


Batch class/disc loss: 338.1229553222656 ::: -924.7833251953125


----------- EPOCH --------------
Epoch class/disc loss: 198033.5625 ::: -432626.5625


Batch class/disc loss: 389.07373046875 ::: -894.3946533203125


Batch class/disc loss: 397.10302734375 ::: -967.978271484375


Batch class/disc loss: 421.25537109375 ::: -902.5859375


Batch class/disc loss: 402.982421875 ::: -892.080322265625


Batch class/disc loss: 386.44769287109375 ::: -945.206787109375


Batch class/disc loss: 358.2181396484375 ::: -1074.4908447265625


----------- EPOCH --------------
Epoch class/disc loss: 195903.65625 ::: -461215.71875


Batch class/disc loss: 355.67181396484375 ::: -848.8766479492188


Batch class/disc loss: 334.46270751953125 ::: -1090.366943359375


Batch class/disc loss: 341.178955078125 ::: -1016.0640258789062


Batch class/disc loss: 395.705322265625 ::: -940.0366821289062


Batch class/disc loss: 382.79364013671875 ::: -1103.3408203125


Batch class/disc loss: 408.870361328125 ::: -1005.6273193359375


----------- EPOCH --------------
Epoch class/disc loss: 194025.625 ::: -490319.90625


In [None]:
model.train()

Batch class/disc loss: 381.31427001953125 ::: -786.635009765625


Batch class/disc loss: 379.6686706542969 ::: -1002.591064453125


Batch class/disc loss: 397.80194091796875 ::: -902.412353515625


Batch class/disc loss: 402.3179016113281 ::: -1021.63134765625


Batch class/disc loss: 336.9195251464844 ::: -996.4227294921875


Batch class/disc loss: 431.1322937011719 ::: -1054.1578369140625


In [None]:
model.embedder(docs[0])

In [None]:
torch.save(model, 'model-28.pt')

In [258]:
torch.save(model.embedder.net.state_dict(), 'embedder-state-dict-28.pt')

In [None]:
list(model.embedder.net.parameters())

In [229]:
i = 90

def get_spread(d):
    vals = model.embedder.net.conver(d).max(1).values.detach().numpy()
    return vals.max() - vals.min()
    

np.mean([get_spread(d) for d in docs[:500]]), np.mean([get_spread(d) for d in target[:500]])

(0.15397196, 0.3561607)

In [None]:
i = 44

idx = np.where(model.embedder.net.conver(target[i]).max(1).values.detach().numpy() > .5)[1]
np.array(df.content.iloc[i].split('\t'))[idx]

In [217]:
model.embedder.net.conver(target[12]).min(1)

torch.return_types.min(
values=tensor([[0.4257, 0.5064, 0.4322, 0.4226, 0.5467, 0.4889, 0.3775, 0.3027, 0.4938,
         0.4459, 0.1576, 0.3468, 0.4140, 0.4285]], grad_fn=<MinBackward0>),
indices=tensor([[37, 37, 20, 37, 41, 17,  5,  6, 37,  6,  6,  6,  6, 37]]))

In [None]:
from validation.scoring import bubbleup_score

In [218]:
def simple_embed(doc):
    X = doc.sum(2).reshape(-1)
    return X / torch.norm(X)

In [219]:
Xe_train = [simple_embed(d).detach().numpy() for d in docs]

In [220]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict, StratifiedKFold, KFold
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


clf = LogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xe_train, y_train)
preds = clf.predict(Xe_train)
accuracy_score(preds, y_train)

0.4852736225959305

In [221]:
Xp_train = [model.embedder(d).detach().numpy() for d in docs]

In [222]:
clf = LogisticRegression(C=1., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xp_train, y_train)
preds = clf.predict(Xp_train)
accuracy_score(preds, y_train)

0.486419523676794

In [230]:
SAMPLE_SIZE=100000
X_test, y_test, ids = indeed_test_data('../data/us/everything.csv', SAMPLE_SIZE, 6)
X_train, y_train = dot_train_data(6)

In [234]:
Xe_test = [model.embedder(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [model.embedder(d).detach().numpy() for d in load_target(X_train)]

In [247]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')

clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [249]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.47129004870637653

In [251]:
Xe_test = [simple_embed(d).detach().numpy() for d in load_target(X_test)]
Xe_train = [simple_embed(d).detach().numpy() for d in load_target(X_train)]

In [252]:
clf = LogisticRegression(C=5., n_jobs=-1, solver='lbfgs', multi_class='multinomial')
clf.fit(Xe_train, y_train)

LogisticRegression(C=5.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [253]:
bubbleup_score(y_train, Xe_test, y_test, clf)

0.4752223066267483