In [1]:
import torch, torchtext, nltk
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence
from sklearn.metrics import accuracy_score
import numpy as np
import pandas as pd
import re, string
from tqdm import tqdm
from tqdm import notebook

# Read Data

In [2]:
'''
This data file has been ranked already, using a keyword multiplier system I used as a rudimentary start to the
idea of ranking job listings. As such, we can expect that any neural network that tries to learn from this data
will most likely relearn the particular words and their multipliers, and only tangentially learn other things.
Thus, we need a method to allow for active learning from user input, ideally in the form of job listing pairs
that are shown to the user for binary ranking, which then feeds the learning system somehow. Otherwise this
network cannot learn anything that does not already appear in a curated dataset, which defeats the point of the
neural network in the first place as a replacement for keyword multiplier ranking.
'''

# Load the already-ranked job listings into a DataFrame.
Listings = pd.read_csv(filepath_or_buffer='Data/Listings.csv')

# Clean all fields

In [225]:
# Normalise every free-text column of the listings.
# Each column is rebuilt with one explicit assignment instead of chained
# `inplace=True` calls on a column selection: those operate on a temporary
# under pandas Copy-on-Write and would leave `Listings` unchanged.
for column_to_clean in ['Position', 'Company', 'Location', 'Salary', 'Summary']:
    Listings[column_to_clean] = (
        Listings[column_to_clean]
        .replace('\n', ' ', regex=True)  # replace all newline characters with spaces
        .fillna('')                      # replace all NaNs with blank strings
        .str.lower()                     # lowercase everything to reduce dictionary size
        # Insert a space after any period that is immediately followed by a
        # letter (or underscore), so "end.Next" becomes "end. next" and the
        # tokenizer sees two sentences instead of one glued token.
        .str.replace(r'\.(?=[^ \W\d])', '. ', regex=True)
    )

# Naively apply a ranking system to the job listings

In [226]:
# Overwrite the stored ratings with evenly spaced scores that depend only on
# row position: the first row gets 1.0 and the last row gets 0.0.
Listings['Rating'] = np.linspace(start=1, stop=0, num=len(Listings['Rating']))

# Create an N×2 dataframe (N rows, 2 columns): [all textual data as one string, rating]

In [227]:
# Join the text columns, in this order and separated by single spaces, into
# one description string per listing.
text_columns = ['Position', 'Company', 'Location', 'Salary', 'Summary']
description = Listings[text_columns[0]]
for column_name in text_columns[1:]:
    description = description + ' ' + Listings[column_name]

DescriptionAndRank = pd.DataFrame({'Description': description, 'Rating': Listings['Rating']})

# Written to disk so torchtext can read it back with TabularDataset (the
# author reports trouble with the alternatives to going through a CSV file).
DescriptionAndRank.to_csv('Data/DescriptionAndRank.csv', index=False)

# Create a torchtext format dataset and split it into training and validation

In [432]:
# Text column: an ordered sequence of tokens.
DescriptionField = torchtext.data.Field(
    sequential=True,                       # words have order, sequence matters
    # With include_lengths=True each batch attribute is an indexable pair; the
    # training code reads the token-id tensor from element [0].
    # (presumably element [1] holds the unpadded lengths - confirm against the torchtext docs)
    include_lengths=True,
    # Use NLTK's word tokenizer to break the long description string into
    # individual tokens; the default would have been a plain split().
    tokenize=nltk.tokenize.word_tokenize,
    use_vocab=True,                        # tokens are mapped to integers through the field's vocabulary
    batch_first=True,                      # batch dimension comes first in the tensor
)

# Options shared by the two scalar (non-text) columns. They are not tokenised
# and bypass the vocabulary.
_scalar_field_options = dict(
    sequential=False,
    tokenize=None,
    include_lengths=None,
    use_vocab=None,
    batch_first=True,
    # default is torch.long, fine for integer representations of words but not 0-1 ranking
    dtype=torch.float,
)
RankField = torchtext.data.Field(**_scalar_field_options)
SortField = torchtext.data.Field(**_scalar_field_options)

Fields = [('Description', DescriptionField), ('Rank', RankField), ('SortKey', SortField)]


def _load_csv_dataset(csv_path, fields):
    """Read a CSV file (header row skipped) into a torchtext TabularDataset."""
    return torchtext.data.TabularDataset(csv_path, 'CSV', skip_header=True, fields=fields)


# The full listing file has only two columns (no SortKey).
dataset = _load_csv_dataset('Data/DescriptionAndRank.csv',
                            [('Description', DescriptionField), ('Rank', RankField)])
# Train / validation files carry the additional SortKey column.
trainset = _load_csv_dataset('Data/FinalRankedPairsTraining.csv', Fields)
validset = _load_csv_dataset('Data/FinalRankedPairsValidation.csv', Fields)

#trainset, validset = dataset.split() #automatically splits train/validation into 0.7/0.3

#trainset, validset = dataset.split() #automatically splits train/validation into 0.7/0.3

# Vectorize the words into a 50-dimensional format

In [433]:
'''
The beauty of the GloVe vocabulary model is that it has been trained to "group" certain words with other words
so that meaning is approximated in numerical format. We are using the 50-dimensional version, which means there
are 50 dimensions in which one word can be "similar" to any other word. So for instance, if the word 'dog' were
represented in its 34th dimension with a floating point of 0.893833, you would expect to find that the
34th dimension of the word 'puppy' was close to that numerical value. In this way, the neural network can learn to
approximate meaning, without having to define absurdly complex almost step-wise functions for randomly-assigned
word vectors.
'''
# Build the vocabulary from the full listings dataset, capped at 30,000
# words/tokens, and attach the 50-dimensional GloVe vectors to it.
DescriptionField.build_vocab(dataset, vectors='glove.6B.50d', max_size=30000)

description_vocab = DescriptionField.vocab
print('Unique tokens in Description vocabulary: {}'.format(len(description_vocab)))
# The first two entries are the "unknown" and "padding" tokens, so skip them
# and print the 100 most popular real tokens.
print(description_vocab.itos[2:102])

Unique tokens in Description vocabulary: 30002
[',', 'and', '.', 'to', 'of', 'the', 'in', 'a', 'with', 'for', ':', 'or', 'experience', ')', '(', 'is', 'engineering', 'design', 'as', 'work', 'our', 'be', 'will', 'you', 'engineer', 'we', 'on', 'an', 'systems', 'are', 'that', 'team', 'required', 'this', '’', 'development', 'skills', 'technical', 'ability', 'support', 'years', 'manufacturing', 'at', 'requirements', 'other', 'all', 'process', 'test', 'project', 's', ';', 'by', 'including', 'knowledge', '&', 'system', 'equipment', 'mechanical', 'have', 'product', 'position', 'job', 'new', 'from', 'projects', 'analysis', 'degree', 'must', 'working', 'management', 'your', 'environment', 'company', 'related', 'products', 'software', 'provide', 'data', 'electrical', 'field', 'develop', 'preferred', 'strong', 'responsibilities', 'solutions', 'qualifications', "'s", '-', 'status', 'opportunity', 'and/or', 'quality', 'may', 'customer', 'perform', 'processes', 'testing', 'more', 'communication', 'co

# Define the neural network model

In [434]:
class CNN(nn.Module):
    """Multi-branch dilated text CNN that maps a token-id batch to `output_dim` scores.

    Each branch is a Conv2d whose kernel spans `filter_sizes[n]` tokens (spaced
    `dilation_sizes[n]` apart) and the full embedding width. Every branch is
    ReLU-activated and max-pooled over the sequence axis; the pooled features of
    all branches are concatenated, passed through dropout and a linear layer.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: width of each token embedding.
        n_filters: output channels per convolution branch.
        filter_sizes: kernel height (in tokens) of each branch.
        dilation_sizes: dilation along the token axis of each branch; must have
            the same length as `filter_sizes`.
        output_dim: size of the final linear output.
        dropout: dropout probability applied to the concatenated features.
        pad_idx: embedding index treated as padding.

    Raises:
        ValueError: if `filter_sizes` is empty or its length differs from
            `dilation_sizes`.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, dilation_sizes, output_dim, 
                 dropout, pad_idx):
        
        super().__init__()

        # Materialise so generators are accepted and can be measured.
        filter_sizes = list(filter_sizes)
        dilation_sizes = list(dilation_sizes)

        # zip() would silently drop the surplus entries of the longer list while
        # the linear layer below is sized from len(filter_sizes); the mismatch
        # would only surface as a shape error inside forward().
        if len(filter_sizes) != len(dilation_sizes):
            raise ValueError(
                'filter_sizes and dilation_sizes must have the same length, got {} and {}'.format(
                    len(filter_sizes), len(dilation_sizes)))
        if not filter_sizes:
            raise ValueError('at least one filter size is required')
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx = pad_idx)

        self.convs = nn.ModuleList([
                                    nn.Conv2d(in_channels = 1, 
                                              out_channels = n_filters, 
                                              kernel_size = (fs, embedding_dim),
                                              dilation = (ds,1),
                                              # pad the token axis on both sides so that even
                                              # widely dilated kernels fit on short inputs
                                              padding = (int((fs*ds)/2),0)
                                             )
                                    
                                    for (fs,ds) in zip(filter_sizes, dilation_sizes)
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):
        """Score a batch of token ids.

        Args:
            text: integer tensor of shape [batch size, sent len].

        Returns:
            Tensor of shape [batch size, output_dim].
        """
        
        #text = [batch size, sent len]
        
        embedded = self.embedding(text)
                
        #embedded = [batch size, sent len, emb dim]
        
        embedded = embedded.unsqueeze(1)
        
        #embedded = [batch size, 1, sent len, emb dim]
        
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
            
        #conved_n = [batch size, n_filters, out len_n], where (stride 1)
        #out len_n = sent len + 2*padding_n - dilation_n*(filter_sizes[n] - 1)
                
        # global max over the whole token axis of each branch
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        
        #pooled_n = [batch size, n_filters]
        
        cat = self.dropout(torch.cat(pooled, dim = 1))

        #cat = [batch size, n_filters * len(filter_sizes)]
            
        return self.fc(cat)

In [435]:
# Counterintuitively, increasing the batch size much beyond this seems to make
# performance suffer. The author's guess is padding: the bigger the batch, the
# bigger the spread in job description lengths, the more padding is added.
# NOTE(review): same_order_loss only compares elements 0 and 1 of each batch,
# so examples beyond the first two would not contribute to the ordering loss -
# that may be the real reason larger batches hurt; confirm.
BATCH_SIZE = 2

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def _sort_key_column(example):
    """Order train/validation examples by their integer SortKey column."""
    return int(example.SortKey)


def _description_length(example):
    """Length of an example's tokenised description."""
    return len(example.Description)


# Train / validation: sorted by the SortKey column (NOT by description length;
# the length-based key is kept below only as a disabled alternative).
train_iterator, valid_iterator = torchtext.data.Iterator.splits(
    (trainset, validset),
    batch_size=BATCH_SIZE,
    device=device,
    sort=True,
    sort_key=_sort_key_column,
    #sort_key=lambda x: len(x.Description)
)

# Full listing set used for final scoring: keyed by description length so that
# similar-length job descriptions can be grouped together, avoiding too much
# padding on the ends.
test_iterator = torchtext.data.Iterator(
    dataset,
    batch_size=20,
    device=device,
    sort_key=_description_length,
)

In [436]:

INPUT_DIM = len(DescriptionField.vocab)
EMBEDDING_DIM = 50  # length of the vector each token was turned into (GloVe 50d)
N_FILTERS = 200     # how many features each convolution learns

# The original example only used kernel lengths 2, 3 and 4. These texts are
# very large (thousand-word job listings), so plain bi/tri/quad-grams seemed
# far too small a focus; longer representations might help sections with more
# context stand out.
FILTER_SIZES = [2, 3, 4] + [5] * 7                    # [2,3,4,5,5,5,5,5,5,5]

# The extra length-5 kernels widen their reach through dilation rather than
# kernel size: without increasing the stride, larger kernels start to really
# slow things down.
DILATION_SIZES = [1, 1, 1] + [2 ** k for k in range(7)]  # [1,1,1,1,2,4,8,16,32,64]

OUTPUT_DIM = 1
DROPOUT = 0.5
PAD_IDX = DescriptionField.vocab.stoi[DescriptionField.pad_token]

model = CNN(
    vocab_size=INPUT_DIM,
    embedding_dim=EMBEDDING_DIM,
    n_filters=N_FILTERS,
    filter_sizes=FILTER_SIZES,
    dilation_sizes=DILATION_SIZES,
    output_dim=OUTPUT_DIM,
    dropout=DROPOUT,
    pad_idx=PAD_IDX,
)

In [437]:
# Seed the embedding table with the vectors attached to the vocabulary.
pretrained_embeddings = DescriptionField.vocab.vectors
model.embedding.weight.data.copy_(pretrained_embeddings)

UNK_IDX = DescriptionField.vocab.stoi[DescriptionField.unk_token]

# The unknown and padding tokens start out as all-zero vectors.
for special_idx in (UNK_IDX, PAD_IDX):
    model.embedding.weight.data[special_idx] = torch.zeros(EMBEDDING_DIM)

In [438]:
import torch.optim as optim

# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(params=model.parameters())

model = model.to(device)

# SmoothL1Loss was chosen over BCEWithLogitsLoss because learning a rank is
# closer to regression than classification. It is handed to train()/evaluate()
# but the loss actually optimised there is same_order_loss + criterion1 term.
criterion = nn.SmoothL1Loss().to(device)

# Used inside train()/evaluate() as the "centered normal" term.
criterion1 = nn.KLDivLoss()
# Passed around but not called by train()/evaluate()/test().
criterion2 = nn.Softplus()
def same_order_loss(output,target):
    """Pairwise ordering loss over the first two elements of a batch.

    Only `output[0]`, `output[1]`, `target[0]` and `target[1]` are read.
    The product of the two differences is positive when prediction and target
    agree on the order; it is negated, passed through SiLU (x * sigmoid(x)) and
    cubed. The result is therefore large and positive for a wrong order and
    slightly negative for a correct one.
    """
    predicted_gap = output[0] - output[1]
    true_gap = target[0] - target[1]
    disagreement = -(predicted_gap * true_gap)  # > 0 if wrong order, < 0 if correct order
    silu = disagreement * torch.sigmoid(disagreement)
    return silu.pow(3)
    
# Move the auxiliary loss modules to the same device as the model.
criterion1.to(device)
# Left as the final expression of the cell, so the notebook displays the
# returned module's repr as the cell output.
criterion2.to(device)

Softplus(beta=1, threshold=20)

In [439]:
import time

def epoch_time(start_time, end_time):
    """Split the span between two timestamps (in seconds) into whole minutes and seconds.

    Returns:
        (minutes, seconds) as integers, both truncated toward zero.
    """
    duration = end_time - start_time
    whole_minutes = int(duration / 60)
    leftover_seconds = int(duration - 60 * whole_minutes)
    return whole_minutes, leftover_seconds

In [440]:
def binary_accuracy(preds, y):
    """
    Fraction of a batch whose rounded prediction equals the rounded truth,
    i.e. 8 right out of 10 returns 0.8, NOT 8.

    This is a regression problem, but the number still shows whether the
    rankings land in the correct (upper or lower) half of the 0-1 range.
    Anything above 0.5, a random coin flip, means we're on the right track.
    """
    # round both sides to the closest integer, then compare element-wise
    matches = torch.eq(torch.round(preds), torch.round(y))
    n_correct = matches.float().sum()  # float so the division below is not integer division
    return n_correct / len(matches)

In [441]:
def sentiment_accuracy(preds, y):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8

    Each prediction is reduced to its sign (positive -> 1, negative -> 0, exact
    zeros are left as 0) and compared element-wise with the truth rounded to
    the closest integer.

    The author's intent: because pairs are drawn at random from the underlying
    dataset, about half of them come from the same end of the ranking and half
    from opposite ends, so this number is expected to hover around 0.5; a value
    far from 0.5 suggests something is going wrong.

    The input tensors are NOT modified. A previous version aliased `preds` and
    overwrote the caller's predictions in place.
    """
    # Compute both masks from the untouched predictions first.
    positive = preds > 0
    negative = preds < 0

    # Work on a detached copy so neither the caller's tensor nor the autograd
    # graph it belongs to is touched by the masked writes below.
    rounded_preds = preds.detach().clone()
    rounded_preds[positive] = 1
    rounded_preds[negative] = 0
    
    rounded_truth = torch.round(y)
    correct = (rounded_preds == rounded_truth).float() #convert into float for division 
    acc = correct.sum() / len(correct)
    return acc

In [442]:
def train(model, iterator, optimizer, criterion, criterion1, criterion2):
    """Run one optimisation pass over `iterator` and return the epoch averages.

    For every batch the loss is `same_order_loss(scores, batch.Rank)` plus the
    squared value of `criterion1(scores, noise)`, where `noise` is a fresh
    standard-normal sample shaped like the scores.

    Args:
        model: network called on `batch.Description[0]` (the token ids).
        iterator: batch iterator; `len(iterator)` is the averaging divisor.
        optimizer: optimiser stepped once per batch.
        criterion: accepted for signature compatibility; not used here.
        criterion1: loss module producing the "centered normal" term.
        criterion2: accepted for signature compatibility; not used here.

    Returns:
        (mean total loss, mean order loss, mean centered-normal loss, mean accuracy)

    NOTE(review): at the call site criterion1 is an nn.KLDivLoss, yet it is fed
    raw scores and N(0, 1) samples rather than log-probabilities and
    probabilities - confirm this is intended.
    """
    total_loss = 0
    total_order_loss = 0
    total_centered_loss = 0
    total_acc = 0

    model.train()

    for batch in iterator:
        optimizer.zero_grad()

        token_ids = batch.Description[0]
        scores = model(token_ids).squeeze(1)

        order_term = same_order_loss(scores, batch.Rank)
        noise_target = torch.randn_like(scores)
        centered_term = criterion1(scores, noise_target).pow(2)
        batch_loss = order_term + centered_term

        batch_loss.backward()
        optimizer.step()

        # measured after the parameter update, on the pre-update scores
        batch_acc = sentiment_accuracy(scores, batch.Rank)

        total_order_loss += order_term.item()
        total_centered_loss += centered_term.item()
        total_loss += batch_loss.item()
        total_acc += batch_acc.item()

    n_batches = len(iterator)
    return (total_loss / n_batches,
            total_order_loss / n_batches,
            total_centered_loss / n_batches,
            total_acc / n_batches)

In [443]:
def evaluate(model, iterator, criterion, criterion1,criterion2):
    """Compute the epoch-average losses and accuracy over `iterator` without training.

    The model is put in eval mode and gradients are disabled. The per-batch
    loss mirrors train(): `same_order_loss(predictions, batch.Rank)` plus the
    squared value of `criterion1(predictions, noise)` with fresh
    standard-normal noise, so the centered-normal term is itself random from
    call to call.

    Args:
        model: network called on `batch.Description[0]` (the token ids).
        iterator: batch iterator; `len(iterator)` is the averaging divisor.
        criterion: accepted for signature compatibility; not used here.
        criterion1: loss module producing the "centered normal" term.
        criterion2: accepted for signature compatibility; not used here.

    Returns:
        (mean total loss, mean order loss, mean centered-normal loss, mean accuracy)
    """
    
    epoch_loss = 0
    epoch_order_loss = 0
    epoch_centered_normal_loss = 0
    epoch_acc = 0
    
    model.eval()

    with torch.no_grad():
    
        for batch in iterator:

            predictions = model(batch.Description[0]).squeeze(1)
            order_loss = same_order_loss(predictions,batch.Rank)
            centered_normal_loss = criterion1(predictions,torch.randn_like(predictions)).pow(2)
            loss = order_loss+centered_normal_loss
            acc = sentiment_accuracy(predictions, batch.Rank)

            # (a per-batch `end_time = time.time()` used to live here; it was
            # never read, so it has been removed)
            epoch_order_loss += order_loss.item()
            epoch_centered_normal_loss += centered_normal_loss.item()
            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_order_loss / len(iterator), epoch_centered_normal_loss / len(iterator), epoch_acc / len(iterator)

In [444]:
def test(model, iterator, criterion, criterion1,criterion2):
    """Score every example in `iterator` and return the predictions.

    The model is put in eval mode and gradients are disabled. A notebook
    progress bar advances once per batch and is always closed, even if scoring
    raises.

    Args:
        model: network called on `batch.Description[0]` (the token ids).
        iterator: batch iterator over the examples to score.
        criterion, criterion1, criterion2: accepted for signature compatibility
            with train()/evaluate(); not used here.

    Returns:
        A list with one `[token_id_tensor, predicted_score]` entry per example,
        where `token_id_tensor` is that example's row of the (padded) batch and
        `predicted_score` is a Python float.
    """
    test_result = []
    model.eval()
    pbar = notebook.tqdm(total=len(iterator),position=1,dynamic_ncols=True)
    try:
        with torch.no_grad():

            for batch in iterator:
                token_ids = batch.Description[0]
                predictions = model(token_ids).squeeze(1)
                for descript,predict in zip(token_ids,predictions):
                    test_result.append([descript,predict.item()])

                pbar.update(1) #update tqdm progress bar
    finally:
        # The original wrote `pbar.close` without parentheses, which only
        # looked the method up and never closed the bar.
        pbar.close()
    return test_result

In [None]:
N_EPOCHS = 500

# Only needed by the best-checkpoint logic, which is currently disabled.
best_valid_loss = float('inf')

loss_modules = (criterion, criterion1, criterion2)
header_template = 'Epoch: {} | Epoch Time: {}m {}s'
train_template = '\tTrain Loss: {0:.8f} | Order Loss: {1:.8e} | Normal Dist. Loss: {2:.8f} | Train Acc: {3:.1f}%'
valid_template = '\tVal. Loss: {0:.8f}  | Order Loss: {1:.8e} | Normal Dist. Loss: {2:.8f} |  Val. Acc: {3:.1f}%\n'

for epoch in range(N_EPOCHS):

    start_time = time.time()

    (train_loss, train_order_loss,
     train_centered_loss, train_acc) = train(model, train_iterator, optimizer, *loss_modules)
    (valid_loss, valid_order_loss,
     valid_centered_loss, valid_acc) = evaluate(model, valid_iterator, *loss_modules)

    end_time = time.time()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    # The weights are saved after EVERY epoch, overwriting the previous file.
    # Disabled alternative that kept only the best validation loss:
    #if valid_loss < best_valid_loss:
    #    best_valid_loss = valid_loss
    #    torch.save(model.state_dict(), 'tut4-model.pt')
    torch.save(model.state_dict(), 'RankPrediction-model.pkl')

    print(header_template.format(epoch + 1, epoch_mins, epoch_secs))
    print(train_template.format(train_loss, train_order_loss, train_centered_loss, train_acc * 100))
    print(valid_template.format(valid_loss, valid_order_loss, valid_centered_loss, valid_acc * 100))

# Score the full listing set once training has finished.
test_result = test(model, test_iterator, *loss_modules)

Epoch: 1 | Epoch Time: 0m 46s
	Train Loss: 6.53778092 | Order Loss: 5.92707312e+00 | Normal Dist. Loss: 0.61070782 | Train Acc: 36.2%
	Val. Loss: 2.03402956  | Order Loss: 1.74138175e+00 | Normal Dist. Loss: 0.29264782 |  Val. Acc: 49.6%

Epoch: 2 | Epoch Time: 0m 39s
	Train Loss: 55.31670288 | Order Loss: 4.83566810e+01 | Normal Dist. Loss: 6.96002147 | Train Acc: 40.5%
	Val. Loss: 82.06934034  | Order Loss: 8.18908823e+01 | Normal Dist. Loss: 0.17845922 |  Val. Acc: 41.0%

Epoch: 3 | Epoch Time: 0m 34s
	Train Loss: 304.77432207 | Order Loss: 2.99209883e+02 | Normal Dist. Loss: 5.56444082 | Train Acc: 42.2%
	Val. Loss: 190.75785267  | Order Loss: 1.89503711e+02 | Normal Dist. Loss: 1.25413925 |  Val. Acc: 42.5%

Epoch: 4 | Epoch Time: 0m 34s
	Train Loss: 185.65229025 | Order Loss: 1.70730829e+02 | Normal Dist. Loss: 14.92146065 | Train Acc: 43.5%
	Val. Loss: 110.23407589  | Order Loss: 1.02625849e+02 | Normal Dist. Loss: 7.60822578 |  Val. Acc: 1.9%

Epoch: 5 | Epoch Time: 0m 36s
	Tra

Epoch: 35 | Epoch Time: 0m 33s
	Train Loss: 29096.63581468 | Order Loss: 2.83403788e+04 | Normal Dist. Loss: 756.25798114 | Train Acc: 50.0%
	Val. Loss: 18852.24565101  | Order Loss: 1.86174118e+04 | Normal Dist. Loss: 234.83397168 |  Val. Acc: 50.0%

Epoch: 36 | Epoch Time: 0m 35s
	Train Loss: 10274.73467264 | Order Loss: 9.54576246e+03 | Normal Dist. Loss: 728.97216154 | Train Acc: 49.7%
	Val. Loss: 9926.88951124  | Order Loss: 9.77661461e+03 | Normal Dist. Loss: 150.27510010 |  Val. Acc: 50.0%

Epoch: 37 | Epoch Time: 0m 35s
	Train Loss: 7140.14824983 | Order Loss: 6.39737738e+03 | Normal Dist. Loss: 742.77079075 | Train Acc: 50.0%
	Val. Loss: 7798.80982803  | Order Loss: 7.71733657e+03 | Normal Dist. Loss: 81.47322418 |  Val. Acc: 49.6%

Epoch: 38 | Epoch Time: 0m 33s
	Train Loss: 16139.61817626 | Order Loss: 1.56240742e+04 | Normal Dist. Loss: 515.54393362 | Train Acc: 49.7%
	Val. Loss: 8615.77657991  | Order Loss: 8.55920850e+03 | Normal Dist. Loss: 56.56818111 |  Val. Acc: 49.3%

In [429]:
# Turn each scored example back into readable text: map every token id to its
# vocabulary string, drop the padding tokens, and join the rest with spaces.
# Each entry is [plaintext description, predicted rank].
PredictedPlaintextListingsandRanks = [
    [
        ' '.join(
            token
            for token in (DescriptionField.vocab.itos[token_id] for token_id in token_ids)
            if token != '<pad>'
        ),
        predicted_rank,
    ]
    for token_ids, predicted_rank in test_result
]
    

In [430]:
# Persist the [description, predicted rank] rows. No column names are given,
# so the CSV header holds pandas' default integer labels.
pd.DataFrame(data=PredictedPlaintextListingsandRanks).to_csv(path_or_buf='Data/AIRanked.csv', index=False)

In [428]:
# Score the full listing set with the current model weights. The cells that
# convert and save `test_result` need this to have been run first.
test_result = test(
    model=model,
    iterator=test_iterator,
    criterion=criterion,
    criterion1=criterion1,
    criterion2=criterion2,
)

HBox(children=(FloatProgress(value=0.0, layout=Layout(flex='2'), max=173.0), HTML(value='')), layout=Layout(di…