In [0]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import torch

In [0]:
data = pd.read_csv('jobs_data.csv')

In [0]:
data = data.drop('Unnamed: 0',axis=1)

In [10]:
data[:5]

Unnamed: 0,title,jobFunction,industry
0,Full Stack PHP Developer,"['Engineering - Telecom/Technology', 'IT/Softw...","['Computer Software', 'Marketing and Advertisi..."
1,CISCO Collaboration Specialist Engineer,"['Installation/Maintenance/Repair', 'IT/Softwa...",['Information Technology Services']
2,Senior Back End-PHP Developer,"['Engineering - Telecom/Technology', 'IT/Softw...","['Computer Software', 'Computer Networking']"
3,UX Designer,"['Creative/Design/Art', 'IT/Software Developme...","['Computer Software', 'Information Technology ..."
4,Java Technical Lead,"['Engineering - Telecom/Technology', 'IT/Softw...","['Computer Software', 'Information Technology ..."


# processing input Text data

In [0]:
# separating words of titles
titles=[]
for title in data['title']:
    titles.append(re.sub("[^\w]", " ",  title).split())

In [13]:
titles[:5]

[['Full', 'Stack', 'PHP', 'Developer'],
 ['CISCO', 'Collaboration', 'Specialist', 'Engineer'],
 ['Senior', 'Back', 'End', 'PHP', 'Developer'],
 ['UX', 'Designer'],
 ['Java', 'Technical', 'Lead']]

In [0]:
# list of total words in dataset 
vocab=[]
for mystr in data['title']:
    vocab += re.sub("[^\w]", " ",  mystr).split()
for i,word in enumerate(vocab):
    vocab[i]=word.lower()

In [15]:
len(vocab)

36570

In [16]:
#unique words(no duplicates)
len(set(vocab)) 

1373

In [0]:
# frequency of each word
counts = Counter(vocab) 

In [0]:
# sort words by no. of occurrences
sorted_vocab = sorted(counts, key=counts.get, reverse=True)

In [21]:
for word in sorted_vocab[:10]:
    print(word,': ', counts[word])

developer :  2082
senior :  1930
engineer :  1388
sales :  1305
manager :  1225
specialist :  1129
software :  716
marketing :  539
executive :  520
designer :  518


  #                vocab cleaning   ================================

In [22]:
# Remove one-characters
for word in sorted_vocab:
    if len(word)==1:
        print(word,': ', counts[word])

a :  194
e :  36
r :  27
d :  26
c :  19
1 :  9
4 :  7
7 :  7
s :  6
و :  5
2 :  5
6 :  4
i :  2
م :  2
v :  1
8 :  1
t :  1


In [23]:
print(len(sorted_vocab))
for word in sorted_vocab:
    if len(word)==1:
        sorted_vocab.remove(word)
print(len(sorted_vocab))

1373
1356


In [0]:
for word in sorted_vocab:
    if len(word)==1:
        print(word,': ', counts[word])

# remove Non-English words

In [0]:
def NotEnglish(s):
    try:
        s.encode(encoding='utf-8').decode('ascii')
    except UnicodeDecodeError:
        return True
    else:
        return False

In [26]:
NotEnglish('يىهسنينثهصثخ')

True

In [27]:
for word in sorted_vocab:
    if NotEnglish(word):
        print(word,': ', counts[word])
        sorted_vocab.remove(word)

مسئول :  8
مندوب :  7
لغة :  5
إداري :  5
مدير :  5
موظفة :  5
سوشيال :  5
اونلاين :  5
خارجي :  5
معلم :  3
الشئون :  3
français :  3
انتاج :  2
مجال :  2
والاعلان :  2
امين :  2
امن :  2
درسة :  2
أطفال :  2
دكتور :  1
مشغل :  1
طوب :  1
العلمين :  1
مخازن :  1
مشرف :  1
خراسانات :  1
أخصائي :  1
إلكتروني :  1
pédagogique :  1


In [28]:
print(len(sorted_vocab))
for word in sorted_vocab:
    if NotEnglish(word):
        sorted_vocab.remove(word)
print(len(sorted_vocab))

1327
1316


In [29]:
print(len(sorted_vocab))
for word in sorted_vocab:
    if NotEnglish(word):
        sorted_vocab.remove(word)
print(len(sorted_vocab))

1316
1312


In [0]:
for word in sorted_vocab:
    if NotEnglish(word):
        print(word,': ', counts[word])

In [31]:
print(len(sorted_vocab))

1312


# remove places and cities  

In [0]:
places = ['alexandria','cairo','tunisia','german','indonesia','saudi','tanta','egypt','egypt',
          'dubai','hurghada','dokki','monufya','brazil','mohandessin','damietta',
          'ukraine','turkey','maadi','mansoura','heliopolis','riyadh','dubai','qalubia','mohandessin',
          'gunsberg','jeddah','dakahlia','kuwait','shorouk','helwan','of','the','mahala','khaima','suez','sheikh',
         'beheira','benha','5th','qatar','abbassia','sokhna','ain','damanhour','mahla','17','mohandesin','mohandessein','faisal'] 

In [33]:
print(len(sorted_vocab))
for word in places:
    if word in sorted_vocab:
        sorted_vocab.remove(word)
len(sorted_vocab)

1312


1265

# convert titles to numeric data


In [0]:
# mapping dictionary for words to integers
vocab_to_int = {word: ii for ii, word in enumerate(sorted_vocab, 1)} 

In [36]:
for k,v in vocab_to_int.items():
    if v<10:
        print(k,': ',v)

developer :  1
senior :  2
engineer :  3
sales :  4
manager :  5
specialist :  6
software :  7
marketing :  8
executive :  9


In [0]:
# mapping dictionary for integers to words
int_to_vocab = {v: k for k, v in vocab_to_int.items()}

In [38]:
titles=[]
for title in data['title']:
    titles.append(re.sub("[^\w]", " ",  title).split())
titles[:10]

[['Full', 'Stack', 'PHP', 'Developer'],
 ['CISCO', 'Collaboration', 'Specialist', 'Engineer'],
 ['Senior', 'Back', 'End', 'PHP', 'Developer'],
 ['UX', 'Designer'],
 ['Java', 'Technical', 'Lead'],
 ['Technical', 'Support', 'Engineer'],
 ['Senior', 'iOS', 'Developer'],
 ['Mechanical', 'Engineer'],
 ['Real', 'Estate', 'Sales', 'Specialist', '10th', 'of', 'Ramadan'],
 ['School', 'Principal']]

In [0]:
# building a list contains job titles as integers
titles_as_ints = titles[:]
for j,title in enumerate(titles_as_ints):
    for k,word in enumerate(title):
        if word.lower() in vocab_to_int.keys():
            titles_as_ints[j][k] = vocab_to_int[word.lower()] 
        else:
            titles_as_ints[j][k] = 0

In [0]:
# remove '0' words (the previously cleaned words)
for j,title in enumerate(titles_as_ints):
    for k,word in enumerate(title):
        if word == 0: 
            titles_as_ints[j].remove(word)

In [42]:
titles_as_ints[:10]

[[25, 19, 36, 1],
 [429, 519, 6, 3],
 [2, 59, 12, 36, 1],
 [40, 10],
 [56, 13, 57],
 [13, 29, 3],
 [2, 46, 1],
 [72, 3],
 [53, 55, 4, 6, 296, 297],
 [165, 306]]

#   Padding

In [0]:
def pad_features(titles_ints, seq_length):

    features = np.zeros((len(titles_ints), seq_length), dtype=int)

    for i, row in enumerate(titles_ints):
        if len(row)<6:
          features[i, -len(row):] = np.array(row)[:seq_length]
        else:
          features[i, :] = np.array(row)[:seq_length]
    
    return features

In [0]:
seq_length = 6

# final input array
features = pad_features(titles_as_ints, seq_length=seq_length)

#  Extract Classes

In [45]:
for i,mystr in enumerate(data['jobFunction'][:5]):
    print(mystr)

['Engineering - Telecom/Technology', 'IT/Software Development']
['Installation/Maintenance/Repair', 'IT/Software Development', 'Engineering - Telecom/Technology']
['Engineering - Telecom/Technology', 'IT/Software Development']
['Creative/Design/Art', 'IT/Software Development']
['Engineering - Telecom/Technology', 'IT/Software Development']


In [46]:
for i,mystr in enumerate(data['jobFunction'][:5]):
    print(re.sub('\'', "", mystr)[1:-1])


Engineering - Telecom/Technology, IT/Software Development
Installation/Maintenance/Repair, IT/Software Development, Engineering - Telecom/Technology
Engineering - Telecom/Technology, IT/Software Development
Creative/Design/Art, IT/Software Development
Engineering - Telecom/Technology, IT/Software Development


In [0]:
# building a list contains all classes in all samples
classes=[]
for mystr in data['jobFunction']:
    ccc = re.sub('\'', "", mystr)[1:-1].split(', ')
    #print(ccc)
    for element in ccc:
        #print(type(element))
        classes.append(element.lower())

In [48]:
classes[:10]

['engineering - telecom/technology',
 'it/software development',
 'installation/maintenance/repair',
 'it/software development',
 'engineering - telecom/technology',
 'engineering - telecom/technology',
 'it/software development',
 'creative/design/art',
 'it/software development',
 'engineering - telecom/technology']

In [49]:
len(classes)

20681

In [0]:
class_count = Counter(classes)


In [51]:
class_count['engineering - telecom/technology']

3886

In [0]:
# mapping dictionary for classes to its frequency
class_to_count = sorted(class_count, key=class_count.get, reverse=True)

In [53]:
len(set(classes))

38

In [0]:
unique_classes = list(set(classes))


In [0]:
# mapping dictionary for classes to its integers
class_to_int = {word: ii for ii, word in enumerate(sorted(unique_classes))}

In [57]:
for k,v in class_to_int.items():
    if v<10:
        print(k,': ',v)

accounting/finance :  0
administration :  1
analyst/research :  2
banking :  3
business development :  4
c-level executive/gm/director :  5
creative/design/art :  6
customer service/support :  7
education/teaching :  8
engineering - construction/civil/architecture :  9


In [0]:
# mapping dictionary for integers to its classes
int_to_class = {v: k for k, v in class_to_int.items()}

In [0]:
# final array for labels to all samples
targets = np.zeros([len(data),len(unique_classes)])

In [0]:
for j, j_f in enumerate(data['jobFunction']):
    row = re.sub('\'', "", j_f)[1:-1].split(', ')
    for label in row:
        idx_to_fill = class_to_int[label.lower()]
        targets[j][idx_to_fill] = 1.0


In [62]:
nn=888
for word in titles_as_ints[nn]:
    print(int_to_vocab[word], end=' ')
print(' : ',end='')
for i,classs in enumerate(targets[nn]):
    if classs==1:
        print(int_to_class[i],end=', ')

senior account manager  : customer service/support, marketing/pr/advertising, 

# Dataloaders and batchinng

In [63]:
split_frac = 0.8

split_idx = int(len(features)*split_frac)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = targets[:split_idx], targets[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(8696, 6) 
Validation set: 	(1087, 6) 
Test set: 		(1087, 6)


In [0]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 64
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)


# Batching for test
valid_loader = DataLoader(valid_data, shuffle=False, batch_size=len(valid_data))
test_loader = DataLoader(test_data, shuffle=False, batch_size=len(test_data))

# Training and Testing Functions

In [65]:
train_on_gpu = torch.cuda.is_available()

if not train_on_gpu:
    print('CUDA is not available.  Training on CPU ...')
else:
    print('CUDA is available!  Training on GPU ...')

CUDA is available!  Training on GPU ...


In [0]:
def train_model(num_epochs, model, trainLoader, validLoader, criterion, optimizer, saving_path):   
   
    min_valid_loss = np.inf

    for e in range(num_epochs):
        running_training_loss = 0
        running_validation_loss = 0
        model.train()
        for inputs, labels in trainLoader:
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            logits = model(inputs.float())
            loss = criterion(logits, labels.float())        
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_training_loss += loss.item()

        model.eval()
        for inputs, labels in validLoader:
            if train_on_gpu:
                inputs, labels = inputs.cuda(), labels.cuda()
            logits = model(inputs.float())
            loss = criterion(logits, labels.float())        
            running_validation_loss += loss.item()
        train_loss = running_training_loss/len(trainLoader)
        valid_loss = running_validation_loss/len(validLoader)

        print(e,' Training loss: ',round(train_loss,6),'  validation loss: ',round(valid_loss,6))
        if valid_loss < min_valid_loss:
            print('validation loss decreased!...Model saved')
            torch.save(model.state_dict(), saving_path)
            min_valid_loss = valid_loss


In [0]:
from sklearn.metrics import f1_score, fbeta_score, precision_score, recall_score
from torch import nn

segmoid = nn.Sigmoid()

In [0]:
def test_model(model, prob_threshold, testLoader):    
    # prop_threshold  :  propability threshold to consider a class as positive

    for inputs, labels in testLoader:
        if train_on_gpu:
          inputs, labels = inputs.cuda(), labels.cuda()
        logits = model(inputs.float())
        break
    out = segmoid(logits) 

    allout = out > prob_threshold

    ypred = allout.cpu().numpy().astype(np.float64)
    ytrue = labels.cpu().numpy().astype(np.float64)

    recall = recall_score(ytrue, ypred, average='weighted')
    precesion = precision_score(ytrue, ypred, average='weighted')
    fbeta = fbeta_score(ytrue, ypred, average='weighted',beta=1)

    print('#######################\nOVERALL SCORES:')
    print('recall    :',round(recall,2))
    print('precesion:',round(precesion,2))
    print('f1_score    :',round(fbeta,2))
    print('\n#######################\n')

    recall_per_class     = recall_score(ytrue, ypred, average=None)
    precesion_per_class = precision_score(ytrue, ypred, average=None)
    fbeta_per_class     = fbeta_score(ytrue, ypred, average=None ,beta=1)

    print('SCORES PER CLASS:\n')
    print('\t\t\t\t recal-precision-f1_score \t no. of occurrences in dataset')
    for i in range(len(recall_per_class)):
      print(i,int_to_class[i], '\n\t\t\t\t', round(recall_per_class[i],2),'\t', round(precesion_per_class[i],2), '\t',
                                              round(fbeta_per_class[i],2),'\t\t',class_count[int_to_class[i]])
      print('---------------------------------------------------------------------------')

# Building first model

In [0]:
from torch import nn, optim
import torch.nn.functional as F

In [0]:
class neural_network1(nn.Module):
    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, output_size)
    def forward(self, x):
        
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

In [0]:
model1 = neural_network1( input_size=seq_length, output_size=len(unique_classes))
if train_on_gpu:
    model1.cuda()

In [0]:
from torch import nn, optim

criterion = nn.BCEWithLogitsLoss()  #pos_weight= loss_weights.cuda()
optimizer1 = optim.Adam(model1.parameters(), lr=0.0001)

In [73]:
train_model(200, model1, train_loader, valid_loader, criterion=criterion, optimizer=optimizer1, saving_path='JobFunctionModel.pt')

0  Training loss:  0.486782   validation loss:  0.353814
validation loss decreased!...Model saved
1  Training loss:  0.320124   validation loss:  0.326842
validation loss decreased!...Model saved
2  Training loss:  0.303012   validation loss:  0.316884
validation loss decreased!...Model saved
3  Training loss:  0.295442   validation loss:  0.311665
validation loss decreased!...Model saved
4  Training loss:  0.289881   validation loss:  0.307058
validation loss decreased!...Model saved
5  Training loss:  0.284721   validation loss:  0.303206
validation loss decreased!...Model saved
6  Training loss:  0.279125   validation loss:  0.294439
validation loss decreased!...Model saved
7  Training loss:  0.272804   validation loss:  0.287727
validation loss decreased!...Model saved
8  Training loss:  0.26507   validation loss:  0.278911
validation loss decreased!...Model saved
9  Training loss:  0.255193   validation loss:  0.267648
validation loss decreased!...Model saved
10  Training loss:  0

In [74]:
model1.load_state_dict(torch.load('JobFunctionModel.pt'))

<All keys matched successfully>

In [75]:
test_model(model1, prob_threshold= 0.35, testLoader=test_loader)

#######################
OVERALL SCORES:
recall    : 0.52
precesion: 0.64
f1_score    : 0.53

#######################

SCORES PER CLASS:

				 recal-precision-f1_score 	 no. of occurrences in dataset
0 accounting/finance 
				 0.33 	 0.73 	 0.46 		 477
---------------------------------------------------------------------------
1 administration 
				 0.24 	 1.0 	 0.38 		 656
---------------------------------------------------------------------------
2 analyst/research 
				 0.39 	 0.73 	 0.51 		 272
---------------------------------------------------------------------------
3 banking 
				 0.0 	 0.0 	 0.0 		 10
---------------------------------------------------------------------------
4 business development 
				 0.35 	 0.64 	 0.46 		 445
---------------------------------------------------------------------------
5 c-level executive/gm/director 
				 0.0 	 0.0 	 0.0 		 3
---------------------------------------------------------------------------
6 creative/design/art 
				 0.59 	 0.87 	

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# second model (adding embedding layer)

In [0]:
class neural_network2(nn.Module):
    def __init__(self, input_size, vocab_size, output_size, embedding_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(input_size*embedding_dim, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 64)
        self.fc4 = nn.Linear(64, output_size)
    def forward(self, x):
        x = self.embedding(x.long())
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

In [0]:
embed_dim = 400
model2 = neural_network2( input_size=seq_length,vocab_size=len(vocab_to_int), output_size=len(unique_classes), embedding_dim=embed_dim)
if train_on_gpu:
    model2.cuda()
optimizer2 = optim.Adam(model2.parameters(), lr=0.0001)

In [78]:
train_model(200, model2, train_loader, valid_loader, criterion=criterion, optimizer=optimizer2, saving_path='JobFunctionModel.pt')

0  Training loss:  0.25288   validation loss:  0.128546
validation loss decreased!...Model saved
1  Training loss:  0.110384   validation loss:  0.098358
validation loss decreased!...Model saved
2  Training loss:  0.087073   validation loss:  0.08274
validation loss decreased!...Model saved
3  Training loss:  0.073421   validation loss:  0.073646
validation loss decreased!...Model saved
4  Training loss:  0.064119   validation loss:  0.066339
validation loss decreased!...Model saved
5  Training loss:  0.057229   validation loss:  0.061372
validation loss decreased!...Model saved
6  Training loss:  0.05189   validation loss:  0.057932
validation loss decreased!...Model saved
7  Training loss:  0.047672   validation loss:  0.054904
validation loss decreased!...Model saved
8  Training loss:  0.043919   validation loss:  0.052144
validation loss decreased!...Model saved
9  Training loss:  0.040473   validation loss:  0.051356
validation loss decreased!...Model saved
10  Training loss:  0.0

In [79]:
model2.load_state_dict(torch.load('JobFunctionModel.pt'))

<All keys matched successfully>

In [80]:
test_model(model2, prob_threshold= 0.35, testLoader = test_loader)

#######################
OVERALL SCORES:
recall    : 0.88
precesion: 0.89
f1_score    : 0.88

#######################

SCORES PER CLASS:

				 recal-precision-f1_score 	 no. of occurrences in dataset
0 accounting/finance 
				 0.94 	 0.94 	 0.94 		 477
---------------------------------------------------------------------------
1 administration 
				 0.85 	 0.87 	 0.86 		 656
---------------------------------------------------------------------------
2 analyst/research 
				 0.86 	 1.0 	 0.92 		 272
---------------------------------------------------------------------------
3 banking 
				 0.0 	 0.0 	 0.0 		 10
---------------------------------------------------------------------------
4 business development 
				 0.88 	 0.71 	 0.79 		 445
---------------------------------------------------------------------------
5 c-level executive/gm/director 
				 0.0 	 0.0 	 0.0 		 3
---------------------------------------------------------------------------
6 creative/design/art 
				 0.87 	 0.97 	

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


# 3rd model (adding dropout and incrasing layers dims)

In [0]:
class neural_network3(nn.Module):
    def __init__(self, input_size, vocab_size, output_size, embedding_dim):
        super().__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.fc1 = nn.Linear(input_size*embedding_dim, 512)
        self.fc2 = nn.Linear(512, 512)
        self.fc3 = nn.Linear(512, 512)
        self.fc4 = nn.Linear(512, output_size)
        self.dropout = nn.Dropout(0.25)
    def forward(self, x):
        x = self.embedding(x.long())
        x = x.view(x.shape[0], -1)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = F.relu(self.fc2(x))
        x = self.dropout(x)
        x = F.relu(self.fc3(x))
        x = self.fc4(x)
        
        return x

embed_dim = 400
model3 = neural_network3( input_size=seq_length,vocab_size=len(vocab_to_int), output_size=len(unique_classes), embedding_dim=embed_dim)
if train_on_gpu:
    model3.cuda()

optimizer3 = optim.Adam(model3.parameters(), lr=0.0001)

In [82]:
model3

neural_network3(
  (embedding): Embedding(1265, 400)
  (fc1): Linear(in_features=2400, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=512, bias=True)
  (fc3): Linear(in_features=512, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=38, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [83]:
train_model(150, model3, train_loader, valid_loader, criterion=criterion, optimizer=optimizer3, 
                                                      saving_path='JobFunctionModel.pt')

0  Training loss:  0.214492   validation loss:  0.118877
validation loss decreased!...Model saved
1  Training loss:  0.104317   validation loss:  0.089178
validation loss decreased!...Model saved
2  Training loss:  0.080387   validation loss:  0.073103
validation loss decreased!...Model saved
3  Training loss:  0.066725   validation loss:  0.063754
validation loss decreased!...Model saved
4  Training loss:  0.057995   validation loss:  0.058275
validation loss decreased!...Model saved
5  Training loss:  0.051915   validation loss:  0.053571
validation loss decreased!...Model saved
6  Training loss:  0.047165   validation loss:  0.051078
validation loss decreased!...Model saved
7  Training loss:  0.043381   validation loss:  0.048745
validation loss decreased!...Model saved
8  Training loss:  0.040223   validation loss:  0.04643
validation loss decreased!...Model saved
9  Training loss:  0.037587   validation loss:  0.045311
validation loss decreased!...Model saved
10  Training loss:  0

In [85]:
model3.load_state_dict(torch.load('JobFunctionModel.pt'))

<All keys matched successfully>

In [87]:
test_model(model3, prob_threshold= 0.4, testLoader = test_loader)

#######################
OVERALL SCORES:
recall    : 0.89
precesion: 0.9
f1_score    : 0.89

#######################

SCORES PER CLASS:

				 recal-precision-f1_score 	 no. of occurrences in dataset
0 accounting/finance 
				 0.94 	 0.92 	 0.93 		 477
---------------------------------------------------------------------------
1 administration 
				 0.86 	 0.91 	 0.89 		 656
---------------------------------------------------------------------------
2 analyst/research 
				 0.86 	 1.0 	 0.92 		 272
---------------------------------------------------------------------------
3 banking 
				 0.0 	 0.0 	 0.0 		 10
---------------------------------------------------------------------------
4 business development 
				 0.8 	 1.0 	 0.89 		 445
---------------------------------------------------------------------------
5 c-level executive/gm/director 
				 0.0 	 0.0 	 0.0 		 3
---------------------------------------------------------------------------
6 creative/design/art 
				 0.87 	 1.0 	 0.9

  'recall', 'true', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)
