In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import numpy as np
import re

In [2]:
class Embedding():
    def __init__(self, path):
        embedding = pd.read_csv(path, sep='\t', header=None)
        keys = embedding.iloc[:,0]
        vals = embedding.iloc[:,1:].values
        self.lookup = {k:v for k,v in zip(keys, vals)}

    def embed_doc(self, doc, return_words = False):
        words = []
        vecs = []
        for word in doc.split():
            try:
                vecs.append(self.lookup[word])
                words.append(word)
            except KeyError:
                pass
        if not return_words: 
            return np.array(vecs)
        return np.array(vecs), words

In [3]:
import torch.nn as nn
import torch
import torch.optim as optim

In [4]:
embedding = Embedding('../indeed-embeds/model.tsv')

In [None]:
from gcsfs import GCSFileSystem

fs = GCSFileSystem(project='labor-market-data')
with fs.open('lmd-classify-dot/data/us/company-everything.csv') as f:
    df = pd.read_csv(f)

df['title'] = df.title.str.lower()

In [None]:
KEY_A = 'category'

overlapping = (df.sort_values(KEY_A)
               .groupby(KEY_A)
               .filter(lambda df: df.shape[0] > 15 and df.company.unique().shape[0] > 5)
               .groupby('company')
               .filter(lambda df: df.shape[0] > 15 and df[KEY_A].unique().shape[0] > 5)
               .groupby(KEY_A)
               .filter(lambda df: df.shape[0] > 10 and df.company.unique().shape[0] > 3)
               .groupby('company')
               .filter(lambda df: df.shape[0] > 10 and df[KEY_A].unique().shape[0] > 3))

top_companies = overlapping.company.value_counts().index.values[:10]
top_titles = overlapping[KEY_A].value_counts().index.values[:10]
overlapping = overlapping[(overlapping[KEY_A].isin(top_titles)) & (overlapping.company.isin(top_companies))]

overlapping = (overlapping
               .merge(pd.DataFrame(list(enumerate(overlapping[KEY_A].unique())), columns = ['title_class', KEY_A]),
                      how = 'left', on=KEY_A)
               .merge(pd.DataFrame(list(enumerate(overlapping.company.unique())), columns = ['company_class', 'company']),
                      how = 'left', on='company'))[['company_class', KEY_A, 'content', 'company', 'title_class']]

In [None]:
def load_data(df):
    for i,d in df.iterrows():
        doc = embedding.embed_doc(d.content.lower()).T.reshape(1, 100, -1)
        doc = torch.from_numpy(doc).float()
        class_distinct, class_ignore = torch.from_numpy(np.array(d.title_class)).long(), torch.from_numpy(np.array(d.company_class)).long()
        yield doc, [class_distinct, class_ignore]

In [None]:
docs, labels = zip(*load_data(overlapping))

In [None]:
from toolz import curry
import attr
import random

class Classifier():
    def __init__(self, net, opt, criterion = None):
        self.net = net
        self.opt = opt(net)
        self.criterion = criterion

    def __call__(self, X):
        return self.net(X).view(-1)


class Model():
    def __init__(self, Embedder, Classifier, Discriminator, aspects, batch_size=64, n_epochs=5):
        self.Discriminator = Discriminator
        self.Classifier = Classifier
        self.embedders = [Embedder() for _ in range(aspects)]
        self.aspects = aspects
        self.batch_size = batch_size
        self.n_epochs = n_epochs


    def load_data(self, docs, labels):
        self.docs, self.labels = docs, labels

    def batch(self, docs, labels, size):
        dat = list(zip(docs, labels))
        random.shuffle(dat)
        out = []
        while dat:
            head,dat = dat[:size], dat[size:]
            out.append(head)
        return out

    def epoch(self, embedder, nets, aspect):
        epoch_loss = 0

        for batch in self.batch(self.docs, self.labels, self.batch_size):

            # run for each aspect, classifier and discriminators
            for net,sign in nets:

                # due to pytorch updating, 
                # run twice, once for embedder, once for the other model
                for updating_model,sgn in [(embedder, sign), (net, 1)]:
                    updating_model.opt.zero_grad()
                    for doc,labels in batch:
                        label = labels[aspect].reshape(1)
                        doc_em = embedder(doc)

                        # embed title
                        out = net(doc_em).reshape(1, -1)

                        # pass title embed and doc embed to criterion
                        loss = net.criterion(out, label)
                        loss *= sign
                        loss.backward()
                        epoch_loss += loss
                    updating_model.opt.step()
        print(epoch_loss)

    def train(self):
        for aspect,embedder in enumerate(self.embedders):
            classifier = self.Classifier()
            discriminators = [self.Discriminator() for i in range(self.aspects - 1)]

            # insert classifier 
            nets = [(d, -1) for d in discriminators]
            nets.insert(aspect, (classifier, 1))

            for epoch in range(self.n_epochs):
                self.epoch(embedder, nets, aspect)            

class GatedNet(torch.nn.Module):
    def __init__(self, embed_size, layers):
        super().__init__()
        self.conver = nn.Sequential(
            nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0),
            nn.Sigmoid()
        )


    def forward(self, x):        
        convs = self.conver(x)
        out = torch.matmul(x, torch.t(convs.max(1).values))
        return out / torch.norm(out)


def _gated_embedder(embed_size, layers):
    return GatedNet(embed_size, layers)

def _embedder(embed_size, layers):
    return nn.Sequential(
        nn.Conv1d(in_channels=embed_size, out_channels=layers, kernel_size=1, groups=1, padding=0),
        nn.ReLU(),
        nn.AdaptiveMaxPool1d(output_size=1),
        nn.Dropout(p=0.25)
    )



In [None]:
criterion = nn.CrossEntropyLoss()

logistic_regression = lambda P: nn.Sequential(nn.Linear(P, 10))
sgd = lambda net: optim.Adam(net.parameters(), lr=0.01)

classifier = lambda: Classifier(logistic_regression(100), sgd, criterion)
discriminator = lambda: Classifier(logistic_regression(100), lambda net: optim.Adam(net.parameters(), lr=0.01), criterion)
embedder = lambda: Classifier(_gated_embedder(100, 40), sgd)

model = Model(embedder, classifier, discriminator, 2, n_epochs=10)
model.load_data(docs, labels)
model.train()

tensor(-28623.2852, grad_fn=<AddBackward0>)


tensor(-105733.5703, grad_fn=<AddBackward0>)


tensor(-200111.4844, grad_fn=<AddBackward0>)


tensor(-296963.7812, grad_fn=<AddBackward0>)


tensor(-391554.0625, grad_fn=<AddBackward0>)


tensor(-484005.4375, grad_fn=<AddBackward0>)


tensor(-574920.7500, grad_fn=<AddBackward0>)


tensor(-664975.5625, grad_fn=<AddBackward0>)


tensor(-754278.3125, grad_fn=<AddBackward0>)


tensor(-843107.7500, grad_fn=<AddBackward0>)


tensor(-47610.7695, grad_fn=<AddBackward0>)


tensor(-150088.4062, grad_fn=<AddBackward0>)


tensor(-270993.6875, grad_fn=<AddBackward0>)


tensor(-391928.2812, grad_fn=<AddBackward0>)


tensor(-509017.8750, grad_fn=<AddBackward0>)


tensor(-624099.1250, grad_fn=<AddBackward0>)


tensor(-738028.1875, grad_fn=<AddBackward0>)


tensor(-850889.8125, grad_fn=<AddBackward0>)


tensor(-962848.8125, grad_fn=<AddBackward0>)


tensor(-1074132.7500, grad_fn=<AddBackward0>)


In [None]:


# add = pd.DataFrame(list(zip(overlapping.title.unique(), [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0])), columns = ['title', 'soft'])
# oo = overlapping.merge(add, how='left', on='title')

sns.scatterplot(x = 'x', y = 'y', hue='label', data = pd.DataFrame(MDS(2).fit_transform(X.astype(np.float64)), columns = ['x', 'y']).assign(label = overlapping.category))