In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [None]:
train = pd.read_csv('train_set.csv')
train.head()

Unnamed: 0,Job_offer,Label
0,"Openjobmetis SpA ricerca, per importante azien...",Java Developer
1,"La persona prescelta, diplomata o laureata in ...",Software Engineer
2,Sei un informatico o matematico con la passion...,Software Engineer
3,"Ti occuperai della progettazione, realizzazion...",Programmer
4,Stiamo cercando uno\una sviluppatore\sviluppat...,Programmer


In [None]:
#one hot encoding of train labels then add to train dataframe
train_labels = pd.get_dummies(train['Label'])
train = pd.concat([train, train_labels], axis=1)
train.head()

Unnamed: 0,Job_offer,Label,Java Developer,Programmer,Software Engineer,System Analyst,Web Developer
0,"Openjobmetis SpA ricerca, per importante azien...",Java Developer,1,0,0,0,0
1,"La persona prescelta, diplomata o laureata in ...",Software Engineer,0,0,1,0,0
2,Sei un informatico o matematico con la passion...,Software Engineer,0,0,1,0,0
3,"Ti occuperai della progettazione, realizzazion...",Programmer,0,1,0,0,0
4,Stiamo cercando uno\una sviluppatore\sviluppat...,Programmer,0,1,0,0,0


In [None]:
#define X and Y as numpy arrays where Job_offer is the X and the one hot encoded labels are the Y
X = np.array(train['Job_offer'])
Y = np.array(train[['Java Developer', 'Software Engineer', 'Programmer', 'Web Developer', 'System Analyst']])
X.shape, Y.shape

((1752,), (1752, 5))

In [None]:
#define a keras text vectorization layer and apply on X and apply a filter to remove punctuations
vectorize_layer = tf.keras.layers.experimental.preprocessing.TextVectorization(
    output_mode='int',
    standardize='lower_and_strip_punctuation',
    split='whitespace',
)
vectorize_layer.adapt(X)
X = vectorize_layer(X)
#get word counts then mean length and use it to perform padding
word_counts = vectorize_layer.get_vocabulary()
print('Vocabulary size: {}'.format(len(word_counts)))
mean_len = int(np.mean([len(x) for x in X]))
print('Mean document length: {}'.format(mean_len))
X = tf.keras.preprocessing.sequence.pad_sequences(X, maxlen=mean_len, padding='post')
X.shape


Vocabulary size: 9706
Mean document length: 1134


(1752, 1134)

In [None]:
#define a keras sequential model
model=tf.keras.models.Sequential()
model.add(tf.keras.layers.Embedding(len(word_counts), 64))
model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)))
model.add(tf.keras.layers.Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f8fe00db640>

In [None]:
test = pd.read_csv('test_set.csv')
test.head()

Unnamed: 0,Job_offer,Label
0,Siamo alla ricerca di figure di Full Stack Dev...,Web Developer
1,"Nell'ambito del potenziamento della struttura,...",Java Developer
2,"* Gestione dell'infrastruttura informatica, de...",System Analyst
3,La risorsa è il responsabile dell'installazion...,System Analyst
4,"La risorsa, inserita nell'area Innovation Cent...",Java Developer


In [None]:
test_labels = pd.get_dummies(test['Label'])
test = pd.concat([test, test_labels], axis=1)
test.head()

Unnamed: 0,Job_offer,Label,Java Developer,Programmer,Software Engineer,System Analyst,Web Developer
0,Siamo alla ricerca di figure di Full Stack Dev...,Web Developer,0,0,0,0,1
1,"Nell'ambito del potenziamento della struttura,...",Java Developer,1,0,0,0,0
2,"* Gestione dell'infrastruttura informatica, de...",System Analyst,0,0,0,1,0
3,La risorsa è il responsabile dell'installazion...,System Analyst,0,0,0,1,0
4,"La risorsa, inserita nell'area Innovation Cent...",Java Developer,1,0,0,0,0


In [None]:
X_test = np.array(test['Job_offer'])
Y_test = np.array(test[['Java Developer', 'Software Engineer', 'Programmer', 'Web Developer', 'System Analyst']])

In [None]:
X_test = vectorize_layer(X_test)
X_test = tf.keras.preprocessing.sequence.pad_sequences(X_test, maxlen=mean_len, padding='post')
X_test.shape

(439, 1134)

In [None]:
#evaluate the model on test set
model.evaluate(X_test, Y_test)

#predict on test set
pred = model.predict(X_test)



array([[8.9085667e-04, 9.6178126e-01, 9.6599543e-03, 2.7398035e-02,
        2.6993040e-04],
       [9.7435093e-01, 2.5323415e-03, 1.7077142e-02, 4.8291399e-03,
        1.2104172e-03],
       [1.6562518e-03, 3.0357869e-05, 7.2166737e-04, 3.7490006e-04,
        9.9721682e-01],
       ...,
       [5.5975495e-05, 8.4472289e-05, 1.9657999e-05, 9.9983346e-01,
        6.4570931e-06],
       [1.1229526e-03, 5.8801932e-04, 1.8023876e-04, 9.9797744e-01,
        1.3138972e-04],
       [2.9729351e-03, 6.1571278e-04, 3.9341636e-02, 2.1094619e-03,
        9.5496029e-01]], dtype=float32)

In [None]:
#convert the predictions to labels
pred = np.argmax(pred, axis=1)
pred

array([1, 0, 4, 4, 0, 1, 0, 3, 1, 1, 0, 1, 2, 1, 2, 0, 1, 1, 4, 2, 2, 0,
       1, 1, 0, 0, 4, 4, 3, 1, 2, 3, 4, 2, 0, 3, 2, 3, 4, 2, 1, 1, 0, 1,
       0, 2, 0, 1, 3, 4, 1, 3, 0, 2, 1, 0, 1, 3, 2, 2, 2, 3, 2, 4, 2, 4,
       1, 3, 0, 1, 0, 2, 3, 1, 0, 4, 3, 1, 2, 4, 1, 1, 2, 0, 4, 2, 0, 2,
       3, 3, 3, 3, 3, 1, 1, 2, 0, 0, 2, 2, 2, 2, 0, 3, 4, 1, 3, 2, 3, 4,
       4, 2, 1, 0, 3, 0, 0, 3, 0, 4, 4, 2, 0, 2, 3, 4, 1, 0, 3, 2, 2, 4,
       0, 0, 3, 1, 2, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 2, 4, 4, 2, 0, 0, 2,
       3, 3, 4, 1, 0, 2, 2, 3, 1, 1, 2, 0, 2, 2, 3, 2, 0, 0, 0, 2, 4, 1,
       2, 3, 3, 0, 2, 2, 4, 3, 3, 3, 4, 0, 3, 2, 2, 3, 1, 4, 4, 2, 1, 4,
       3, 4, 2, 1, 1, 1, 1, 0, 2, 1, 0, 3, 1, 3, 4, 1, 1, 2, 1, 0, 3, 2,
       3, 1, 2, 1, 2, 1, 3, 0, 1, 1, 0, 4, 3, 3, 3, 3, 4, 3, 3, 1, 1, 3,
       4, 3, 4, 3, 2, 0, 0, 4, 0, 2, 1, 3, 0, 1, 0, 1, 0, 3, 0, 3, 0, 1,
       3, 3, 3, 0, 3, 0, 3, 0, 3, 3, 2, 3, 4, 4, 1, 3, 0, 3, 2, 2, 2, 0,
       1, 4, 3, 0, 2, 1, 0, 2, 4, 4, 0, 3, 0, 3, 2,

In [None]:
#convert the one hot encoded labels to labels
Y_test = np.argmax(Y_test, axis=1)
Y_test

array([3, 0, 4, 4, 0, 0, 0, 3, 4, 1, 0, 1, 3, 1, 2, 0, 1, 1, 4, 2, 2, 3,
       3, 1, 0, 0, 4, 2, 3, 1, 2, 0, 4, 2, 0, 3, 1, 0, 4, 0, 2, 1, 0, 1,
       0, 2, 4, 1, 2, 4, 3, 3, 0, 2, 1, 0, 0, 3, 2, 2, 2, 3, 3, 4, 3, 4,
       1, 3, 1, 1, 0, 2, 3, 2, 0, 4, 3, 1, 2, 4, 3, 1, 2, 0, 4, 2, 0, 2,
       3, 3, 3, 3, 3, 1, 2, 0, 0, 2, 2, 2, 2, 2, 0, 3, 4, 1, 3, 2, 3, 4,
       4, 3, 1, 3, 3, 0, 0, 3, 0, 4, 4, 2, 1, 2, 2, 4, 1, 0, 3, 2, 1, 4,
       0, 0, 1, 2, 2, 3, 4, 1, 2, 0, 1, 2, 0, 1, 1, 4, 4, 4, 1, 2, 2, 0,
       3, 3, 2, 2, 0, 2, 1, 0, 0, 1, 2, 0, 2, 0, 3, 2, 0, 0, 0, 2, 4, 3,
       0, 3, 3, 2, 2, 2, 4, 3, 0, 3, 4, 0, 0, 2, 2, 3, 1, 4, 4, 3, 3, 4,
       3, 4, 2, 2, 1, 1, 1, 0, 2, 2, 0, 3, 2, 3, 4, 1, 1, 0, 1, 0, 3, 1,
       4, 3, 2, 1, 2, 1, 2, 3, 3, 1, 0, 4, 3, 3, 3, 3, 4, 3, 3, 1, 1, 3,
       4, 3, 4, 4, 0, 0, 0, 4, 0, 2, 1, 3, 3, 2, 0, 1, 3, 3, 0, 0, 0, 1,
       1, 3, 3, 0, 3, 1, 3, 0, 3, 3, 0, 3, 4, 4, 2, 3, 0, 3, 2, 3, 2, 0,
       1, 4, 2, 0, 2, 1, 0, 2, 4, 4, 4, 3, 0, 3, 4,

In [None]:
#calculate the F1 score
from sklearn.metrics import f1_score
f1_score(Y_test, pred, average='macro')

0.754125086741681

In [None]:
#save output to csv as follows: 'Job_offer', 'Label_true', 'Label_pred'
output = pd.DataFrame({'Job_offer': test['Job_offer'], 'Label_true': Y_test, 'Label_pred': pred})
output['Label_true'] = output['Label_true'].replace({0: 'Java Developer', 1: 'Software Engineer', 2: 'Programmer', 3: 'Web Developer', 4: 'System Analyst'})
output['Label_pred'] = output['Label_pred'].replace({0: 'Java Developer', 1: 'Software Engineer', 2: 'Programmer', 3: 'Web Developer', 4: 'System Analyst'})
output.to_csv('output.csv', index=False)
output.head()

In [None]:
#save the model
model.save('model.h5')