In [1]:
import random as rn

import fasttext
import text_preprocessing as tp

import numpy as np
from numpy import newaxis

from collections import Counter
import sklearn
from imblearn.over_sampling import RandomOverSampler

import matplotlib.pyplot as plt

import torch
# Detect whether CUDA is available, then deliberately override the result so the
# whole notebook runs on the CPU. Remove the override line to allow GPU training.
train_on_gpu = torch.cuda.is_available()
train_on_gpu = False
if train_on_gpu:
    # Unreachable while the flag is forced to False; the return value is discarded.
    torch.cuda.current_device()

from torch.utils.data import TensorDataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\3naza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Load the raw vacancy texts and their labels, one entry per line.
# Iterating a text file yields the same lines as readlines(), so each entry
# still carries its line terminator.
with open("vacancies.txt", mode='r', encoding='utf-8') as vacancies_file:
    vacancies = list(vacancies_file)

with open("labels.txt", mode='r', encoding='utf-8') as labels_file:
    labels = list(labels_file)

In [3]:
# Build fastText supervised-format lines: "<text> __label__<Label>".
# The entries of `vacancies` and `labels` come from readlines() and still end
# with a line terminator. Without stripping, the "__label__" suffix would land
# on a separate line from its text and stray blank lines would be written.
labeled_data = [
    vacancy.strip() + " __label__" + label.strip().replace(' ', '')
    for vacancy, label in zip(vacancies, labels)
]

# Shuffle, then keep 75% of the examples for training and 25% for validation.
rn.shuffle(labeled_data)
split_index = int(len(labeled_data) * .75)
train_data = '\n'.join(labeled_data[:split_index])
valid_data = '\n'.join(labeled_data[split_index:])

In [4]:
# Persist both fastText splits as UTF-8 text files.
for split_path, split_text in (("data.train", train_data), ("data.valid", valid_data)):
    with open(split_path, mode='w', encoding='utf-8') as split_file:
        split_file.write(split_text)

In [5]:
# model = fasttext.train_supervised(input='data.train', epoch=30)

In [6]:
# model.save_model("model_vacancies.bin")

In [7]:
# model.test("data.valid")

In [8]:
# model.predict('10 Jul 2017 Published 4 days ago Product Manager (mobile apps) Digital Screens, LLC IT - software development Region: Kiev Website: digitalscreens.ua/ full time Job description Responsibilities: Formation of a roadmap product development plan (short-term and long-term plans); Project team management Definition and up-to-date support of the product development strategy for mobile applications Release planning; Writing TK; Identification of priority tasks, coordination of requirements for new tasks; Supervising UX / UI design processes, usability testing; Market research of competitors. Mobile product lifecycle management (from concept development to release); Requirements: Experience in launching successful iOS, Android applications (having a portfolio of completed projects that you can be proud of is an additional advantage) Experience in managing a mobile development team of 4 people or more. Understanding the mobile application development cycle Understanding the key trends and development trends of mobile applications; Experience with analytics systems (Google Analytics, etc.); Experience with Redmine, MS Excel, Project Understanding the principles of Agile methodologies Experience as a product manager of mobile products from 3 years; Experience with Fabric and Firebase')

In [9]:
# Tokenise every vacancy with the project-local `text_preprocessing` helper and preview the first document.
# NOTE(review): compared with tp.tokenize, the preview lacks 'at', so this presumably also removes
# stop words — confirm in text_preprocessing. `cleared` is not used again in the visible cells.
cleared = tp.clear_and_tokenize(vacancies)
cleared[0][:10]

['computer',
 'system',
 'sql',
 'trainee',
 'analyst',
 'company',
 'consulting',
 'region',
 'kiev',
 'company']

In [10]:
# Tokenise every vacancy (one token list per vacancy) and preview the first ten tokens of the first one.
# These token lists are what gets embedded with fastText further down.
tokens = tp.tokenize(vacancies)
tokens[0][:10]

['computer',
 'system',
 'sql',
 'trainee',
 'analyst',
 'company',
 'at',
 'consulting',
 'region',
 'kiev']

In [11]:
# Collect the words of all token lists into one sequence and preview the first ten.
# NOTE(review): presumably a plain flattening of `tokens` in document order — confirm in text_preprocessing.
words = tp.get_all_words(tokens)
words[:10]

['computer',
 'system',
 'sql',
 'trainee',
 'analyst',
 'company',
 'at',
 'consulting',
 'region',
 'kiev']

In [12]:
# Join all words into one space-separated string and write it to corpus.txt,
# the file referenced by the (commented-out) unsupervised fastText training call.
corpus = ' '.join(words)
with open('corpus.txt', mode='w', encoding='utf-8') as cf:
    cf.write(corpus)

In [13]:
# Unsupervised training is disabled; a previously saved embedding model is loaded from disk instead.
# To retrain the 128-dimensional embeddings from corpus.txt, re-enable the next line.
# model = fasttext.train_unsupervised('corpus.txt', dim=128)
model = fasttext.load_model('fasttext_model')




In [14]:
# Write the embedding model to the same path it was loaded from above;
# this only changes anything when the model has just been retrained.
model.save_model("fasttext_model")

In [15]:
# Preview the first ten entries of the embedding model's vocabulary.
model.words[:10]

['and',
 'of',
 'the',
 'in',
 'a',
 'to',
 'with',
 'experience',
 'for',
 'development']

In [16]:
# Sanity check: display the embedding vector of a single word.
model.get_word_vector('computer')

array([-0.22876108,  0.07642055,  0.1859511 ,  0.41626796,  0.11182241,
       -0.28945684, -0.66903186, -0.21917312, -0.14527364,  0.06512249,
        0.26689827, -0.0183398 , -0.02297962,  0.718341  , -0.349835  ,
        0.06574977,  0.08183116,  0.88873255, -0.17869489, -0.4375649 ,
       -0.13618167, -0.0016635 ,  0.10165705, -0.50262934, -0.0774672 ,
       -0.0982023 , -0.44890982,  0.16257028, -0.14164343,  0.5974602 ,
       -0.0784402 ,  0.54712874,  0.12620166,  0.09744629,  0.16470037,
       -0.283907  , -0.13685536, -0.51129705, -0.2705507 ,  0.34872207,
       -0.29827207,  0.6564145 ,  0.36645138,  0.24717124,  0.07709025,
       -0.30961856,  0.21496625, -0.30768776,  0.02405895,  0.2402943 ,
        0.43621793, -0.24740571,  0.32947266,  0.0898559 , -0.2837158 ,
       -0.06052554, -0.43978968,  0.38489348, -0.25158736,  0.15060748,
       -0.20507613,  0.21857153,  0.41095582, -0.3411489 ,  0.16295013,
       -0.0472776 , -0.35010478,  0.25204396, -0.08301147,  0.05

In [17]:
def pad_feature(vector, size):
    """Fit a sequence of feature rows to exactly ``size`` rows.

    Inputs with at least ``size`` rows are truncated to their first ``size``
    rows. Shorter inputs are padded cyclically: the sequence is replayed from
    its start until ``size`` rows are filled, so every original row is kept
    and the row order is preserved.

    The previous implementation used ``ndarray.repeat``, which duplicates each
    row in place (a, a, b, b, ...). After truncation this dropped the tail of
    short sequences and mangled inputs that already had exactly ``size`` rows.
    It also raised ``ZeroDivisionError`` on empty input.

    Args:
        vector: NumPy array whose first axis is the sequence axis.
        size: Number of rows required in the result.

    Returns:
        An array with ``size`` rows and the same trailing shape and dtype as
        ``vector``. An empty input yields an all-zero array.
    """
    length = len(vector)
    if length >= size:
        return vector[:size]
    if length == 0:
        # Nothing to replay: fall back to zero padding.
        return np.zeros((size,) + vector.shape[1:], dtype=vector.dtype)
    # Cyclic row indices: 0, 1, ..., length - 1, 0, 1, ...
    return vector[np.arange(size) % length]

In [18]:
# Turn every tokenised vacancy into a fixed-size (256, 128) matrix:
# one 128-dimensional fastText vector per token, fitted to 256 rows by pad_feature.
data = np.empty([len(vacancies), 256, 128])
for row, vacancy_tokens in enumerate(tokens):
    embedded = np.empty([len(vacancy_tokens), 128])
    for position, token in enumerate(vacancy_tokens):
        embedded[position] = model.get_word_vector(token)
    data[row] = pad_feature(embedded, 256)

In [19]:
# Expected layout: (number of vacancies, 256 token slots, 128 embedding dimensions).
data.shape

(3004, 256, 128)

In [20]:
# Encode labels as integers ranked by frequency (0 = most frequent label).
# The label strings come straight from readlines(), so they still include
# their line terminator; they are used here exactly as read.
counts_labels = Counter(labels)
label_vocab = [label for label, _ in counts_labels.most_common()]
label_vocab_to_int = {label: ii for ii, label in enumerate(label_vocab)}

labels_ints = np.array([label_vocab_to_int[label] for label in labels])

In [21]:
# Flatten each (256, 128) embedding matrix into one feature row, because the
# over-sampler works on 2-D input. The row count is taken from `data` instead
# of being hard-coded, so the cell keeps working when the dataset size changes.
# reshape() never modifies `data`, so no defensive copy of the array is needed.
view_data = data.reshape(len(data), -1)
view_data.shape

(3004, 32768)

In [22]:
# Balance the classes by randomly over-sampling until every class is as large as the biggest one.
# NOTE(review): over-sampling happens before the train/validation/test split further down,
# so copies of the same vacancy can land on both sides of the split and inflate the
# validation/test scores. Consider splitting first and over-sampling the training part only.
print('Before:', Counter(labels_ints))
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(view_data, labels_ints)
print('After:', Counter(y_resampled))

Before: Counter({0: 679, 1: 353, 2: 349, 3: 329, 4: 242, 5: 164, 6: 154, 7: 144, 8: 129, 9: 119, 10: 81, 11: 56, 12: 48, 13: 28, 14: 28, 15: 26, 16: 23, 17: 12, 18: 10, 19: 9, 20: 8, 21: 6, 22: 4, 23: 2, 24: 1})
After: Counter({4: 679, 2: 679, 3: 679, 21: 679, 16: 679, 10: 679, 9: 679, 1: 679, 0: 679, 12: 679, 17: 679, 13: 679, 8: 679, 15: 679, 19: 679, 6: 679, 11: 679, 14: 679, 23: 679, 7: 679, 5: 679, 18: 679, 20: 679, 22: 679, 24: 679})


In [23]:
# Shuffle features and labels in unison with one shared random permutation.
# The previous approach packed (feature_row, label) tuples into a NumPy array,
# which recent NumPy versions reject as an inhomogeneous array, and which
# otherwise creates a slow object array with several full copies of the data.
shuffled_order = np.random.permutation(len(X_resampled))
X_resampled = X_resampled[shuffled_order]
y_resampled = y_resampled[shuffled_order]

In [24]:
# Still flattened at this point: (number of resampled rows, 256 * 128 features).
X_resampled.shape

(16975, 32768)

In [25]:
# Restore the (token slots, embedding dimensions) layout of every sample.
# The sample count is inferred with -1 rather than hard-coded, so the cell
# stays valid when the over-sampled dataset has a different size.
X_resampled = X_resampled.reshape(-1, 256, 128)
X_resampled.shape

(16975, 256, 128)

In [26]:
# Keep the first 80% of the shuffled samples for training, then cut the
# remaining 20% in half to obtain the validation and test sets.
split_frac = 0.8
split_idx = int(len(X_resampled)*split_frac)

train_x, remaining_x = np.split(X_resampled, [split_idx])
train_y, remaining_y = np.split(y_resampled, [split_idx])

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = np.split(remaining_x, [test_idx])
val_y, test_y = np.split(remaining_y, [test_idx])

# Report the feature shapes first, then the label shapes, in the same layout.
print("\t\t\tFeature Shapes:")
for train_part, val_part, test_part in ((train_x, val_x, test_x),
                                        (train_y, val_y, test_y)):
    print("Train set: \t\t{}".format(train_part.shape),
          "\nValidation set: \t{}".format(val_part.shape),
          "\nTest set: \t\t{}".format(test_part.shape))

Feature Shapes:
Train set: 		(13580, 256, 128) 
Validation set: 	(1697, 256, 128) 
Test set: 		(1698, 256, 128)
Train set: 		(13580,) 
Validation set: 	(1697,) 
Test set: 		(1698,)


In [27]:
# Append a length-1 axis at position 3 to each split: (N, 256, 128) -> (N, 256, 128, 1).
# Note: this cell is not idempotent; running it twice adds a second axis.
train_x, val_x, test_x = (
    np.expand_dims(split, axis=3) for split in (train_x, val_x, test_x)
)

In [28]:
# Bring the length-1 axis forward so the arrays follow the (N, C, H, W) layout
# that nn.Conv2d expects: (N, 256, 128, 1) -> (N, 1, 256, 128).
# Moving axis 3 to position 1 equals swapping axes (2, 3) and then (1, 2).
train_x, val_x, test_x = (
    np.moveaxis(split, 3, 1) for split in (train_x, val_x, test_x)
)

In [29]:
# Expected layout after the axis shuffling: (samples, 1 channel, 256 token slots, 128 embedding dimensions).
train_x.shape

(13580, 1, 256, 128)

In [30]:
batch_size = 35

# Wrap each (features, labels) pair of NumPy arrays in a TensorDataset.
train_data, valid_data, test_data = (
    TensorDataset(torch.from_numpy(features), torch.from_numpy(targets))
    for features, targets in ((train_x, train_y), (val_x, val_y), (test_x, test_y))
)

# Unused alternative: carve the validation set out of the training set with samplers.
# num_train = len(train_data)
# indices = list(range(num_train))
# np.random.shuffle(indices)
# split = int(np.floor(.2 * num_train))
# train_idx, valid_idx = indices[split:], indices[:split]

# train_sampler = SubsetRandomSampler(train_idx)
# valid_sampler = SubsetRandomSampler(valid_idx)

# One shuffling loader per split, all sharing the same batch size.
train_loader, valid_loader, test_loader = (
    DataLoader(dataset, shuffle=True, batch_size=batch_size)
    for dataset in (train_data, valid_data, test_data)
)

In [31]:
# Pull one batch from the training loader to sanity-check shapes and values.
# The built-in next() is used because DataLoader iterators in current PyTorch
# releases no longer provide a .next() method.
dataiter = iter(train_loader)
sample_x, sample_y = next(dataiter)

print('Sample input size: ', sample_x.size()) # batch_size, channels, token slots, embedding dims
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([35, 1, 256, 128])
Sample input: 
 tensor([[[[-4.6494e-02, -7.9022e-02,  3.7492e-01,  ...,  5.0884e-01,
           -6.9551e-02,  9.5529e-02],
          [ 2.3321e-01, -6.7624e-02,  6.7799e-01,  ..., -6.8913e-02,
           -3.2566e-01,  2.2537e-01],
          [ 1.2325e-01, -2.2405e-01,  2.8481e-01,  ..., -1.0864e-01,
            8.9693e-02, -2.7178e-01],
          ...,
          [-3.8367e-01, -3.1472e-01,  1.8253e-01,  ...,  7.1883e-02,
            5.1735e-02,  1.4164e-01],
          [-1.7500e-01, -4.8123e-01, -6.0737e-04,  ...,  2.3758e-01,
           -3.6451e-02, -7.3372e-03],
          [ 2.5164e-02, -3.4555e-01, -3.7766e-01,  ...,  1.3967e-01,
           -3.6276e-01, -1.7457e-01]]],


        [[[ 4.1980e-02, -2.9792e-03,  6.3573e-01,  ...,  4.1209e-01,
           -1.8550e-01,  8.0194e-02],
          [ 3.2760e-01, -2.3437e-01,  4.0599e-01,  ...,  3.2539e-01,
            4.4850e-01,  7.9860e-02],
          [-1.2529e-01, -1.1289e-01,  5.3903e-01,  ...,  2.

# CNN

In [32]:
class Net(nn.Module):
    """Convolutional classifier for single-channel 256 x 128 embedding matrices.

    Three conv + ReLU + 2x2 max-pool stages reduce a (N, 1, 256, 128) input to
    (N, 16, 32, 16). The flattened features then pass through three fully
    connected layers, with dropout applied before each of them, and end in
    25 raw class scores (logits).
    """

    def __init__(self):
        super().__init__()

        # Convolutional stages: 1 -> 4 -> 8 -> 16 channels, spatial size kept by padding.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, padding=1)
        self.conv2 = nn.Conv2d(in_channels=4, out_channels=8, kernel_size=3, padding=1)
        self.conv3 = nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, padding=1)

        # Shared pooling layer; every application halves height and width.
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

        # Classifier head: 16 channels * 32 * 16 positions -> 4096 -> 512 -> 25 classes.
        self.fc1 = nn.Linear(in_features=16 * 32 * 16, out_features=4096)
        self.fc2 = nn.Linear(in_features=4096, out_features=512)
        self.fc3 = nn.Linear(in_features=512, out_features=25)

        self.dropout = nn.Dropout(p=0.25)

    def forward(self, x):
        """Return the class logits for a batch shaped (N, 1, 256, 128)."""
        for conv in (self.conv1, self.conv2, self.conv3):
            x = self.pool(F.relu(conv(x)))

        # Flatten to (N, 8192), the input width of the first linear layer.
        flat = x.view(-1, self.fc1.in_features)

        hidden = F.relu(self.fc1(self.dropout(flat)))
        hidden = F.relu(self.fc2(self.dropout(hidden)))
        return self.fc3(self.dropout(hidden))

# Instantiate the CNN and show its layers.
# Note: this rebinds `model`, which until now referred to the fastText embedding model;
# cells that need word vectors must be re-run from the fastText load cell first.
model = Net()
print(model)

# Move the parameters to the GPU only when GPU training is enabled.
if train_on_gpu:
    model.cuda()

Net(
  (conv1): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(4, 8, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv3): Conv2d(8, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (fc1): Linear(in_features=8192, out_features=4096, bias=True)
  (fc2): Linear(in_features=4096, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=25, bias=True)
  (dropout): Dropout(p=0.25)
)


In [33]:
# Cross-entropy loss over the raw class scores returned by the network,
# optimised with plain SGD at a fixed learning rate of 0.01.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

In [34]:
n_epochs = 30

# Lowest validation loss seen so far; a checkpoint is written whenever it is matched or beaten.
# (np.inf is used because the np.Inf alias was removed in NumPy 2.0.)
valid_loss_min = np.inf

# The input tensors are created from float64 NumPy arrays, so the weights are cast to float64 too.
model.double()

for epoch in range(1, n_epochs+1):

    train_loss = 0.0
    valid_loss = 0.0

    # ---- training pass ----
    # The batch variables are named inputs/targets so that the notebook-level
    # `data` array holding the embedded vacancies is not overwritten.
    model.train()
    for inputs, targets in train_loader:

        if train_on_gpu:
            inputs, targets = inputs.cuda(), targets.cuda()

        optimizer.zero_grad()
        output = model(inputs)
        loss = criterion(output, targets.long())
        loss.backward()
        optimizer.step()
        # criterion returns the batch mean; weight it by the batch size to accumulate a sum.
        train_loss += loss.item()*inputs.size(0)

    # ---- validation pass ----
    # Every batch is evaluated, including a smaller final one, so the sum matches
    # the sample count used for averaging below. Previously the partial batch was
    # skipped while the sum was still divided by the full validation-set size.
    # Gradients are not needed here, so tracking is switched off.
    model.eval()
    with torch.no_grad():
        for inputs, targets in valid_loader:
            if train_on_gpu:
                inputs, targets = inputs.cuda(), targets.cuda()
            output = model(inputs)
            loss = criterion(output, targets.long())
            valid_loss += loss.item()*inputs.size(0)

    # Convert the accumulated sums into per-sample averages.
    train_loss = train_loss/len(train_loader.sampler)
    valid_loss = valid_loss/len(valid_loader.sampler)

    print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
        epoch, train_loss, valid_loss))

    # Keep only the weights of the best epoch according to the validation loss.
    if valid_loss <= valid_loss_min:
        print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
        valid_loss_min,
        valid_loss))
        torch.save(model.state_dict(), 'model_cnn2.pt')
        valid_loss_min = valid_loss

KeyboardInterrupt: 

In [0]:
# Restore the weights saved by the training loop at its lowest validation loss.
# NOTE(review): a checkpoint written from a GPU run may need map_location= to load on a CPU-only machine — confirm.
model.load_state_dict(torch.load('model_cnn2.pt'))

In [0]:
test_loss = 0.0

# Class names in the order of their integer codes; the per-class counters are
# sized from this list instead of a hard-coded class count.
classes = label_vocab[:]
num_classes = len(classes)
class_correct = [0.] * num_classes
class_total = [0.] * num_classes

model.eval()
# Evaluation needs no gradients, so tracking is switched off.
with torch.no_grad():
    for data, target in test_loader:
        # Every batch is evaluated, including a smaller final one. Previously that
        # batch was dropped while the loss was still divided by the full test-set size.
        if train_on_gpu:
            data, target = data.cuda(), target.cuda()
        target = target.long()

        output = model(data)
        loss = criterion(output, target)
        # criterion returns the batch mean; weight it by the batch size to accumulate a sum.
        test_loss += loss.item()*data.size(0)

        # Predicted class = index of the highest score per sample.
        _, pred = torch.max(output, 1)
        # Kept 1-D (no squeeze) so that a single-sample batch can still be indexed.
        correct = pred.eq(target.view_as(pred)).cpu().numpy()

        # Iterate over the actual batch length rather than the nominal batch_size.
        for i in range(target.size(0)):
            label = target[i].item()
            class_correct[label] += correct[i].item()
            class_total[label] += 1

test_loss = test_loss/len(test_loader.dataset)
print('Test Loss: {:.6f}\n'.format(test_loss))

for i in range(num_classes):
    if class_total[i] > 0:
        print('Test Accuracy of %5s: %2d%% (%2d/%2d)' % (
            classes[i], 100 * class_correct[i] / class_total[i],
            np.sum(class_correct[i]), np.sum(class_total[i])))
    else:
        print('Test Accuracy of %5s: N/A (no training examples)' % (classes[i]))

print('\nTest Accuracy (Overall): %2d%% (%2d/%2d)' % (
    100. * np.sum(class_correct) / np.sum(class_total),
    np.sum(class_correct), np.sum(class_total)))