In [1]:
import numpy as np

np.random.seed(42)
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

from keras.models import Model
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, concatenate
from keras.layers import GRU, Bidirectional, GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.preprocessing import text, sequence
from keras.callbacks import Callback

import warnings

warnings.filterwarnings('ignore')

import os

os.environ['OMP_NUM_THREADS'] = '4'

EMBEDDING_FILE = 'data/crawl-300d-2M.vec'

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/submit_sample.csv')

X_train = train["description"].fillna("fillna").values
y_train = train["jobflag"].values
X_test = test["description"].fillna("fillna").values

max_features = 30000
maxlen = 256
embed_size = 300

tokenizer = text.Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(list(X_train) + list(X_test))
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)
x_train = sequence.pad_sequences(X_train, maxlen=maxlen)
x_test = sequence.pad_sequences(X_test, maxlen=maxlen)


def get_coefs(word, *arr):
    return word, np.asarray(arr, dtype='float32')


embeddings_index = dict(get_coefs(*o.rstrip().rsplit(' ')) for o in open(EMBEDDING_FILE, encoding="utf-8"))

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [31]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from keras.utils import to_categorical

onehot_encoder = OneHotEncoder(sparse=False)
reshaped = y_train.reshape(len(y_train), 1)
onehot = onehot_encoder.fit_transform(reshaped)

In [41]:
onehot
label = argmax(onehot)


Tensor("ArgMax_3:0", shape=(2931,), dtype=int64)


In [2]:
word_index = tokenizer.word_index
nb_words = min(max_features, len(word_index))
print(nb_words)
embedding_matrix = np.zeros((nb_words, embed_size))


5563


In [4]:
for word, i in word_index.items():
    if i >= nb_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [6]:
embedding_matrix.shape

(5563, 300)

In [7]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.1081    ,  0.0191    ,  0.0354    , ...,  0.1104    ,
         0.0475    , -0.0599    ],
       [-0.0175    , -0.2189    ,  0.0353    , ..., -0.28459999,
         0.0509    ,  0.0229    ],
       ...,
       [-0.0936    ,  0.0364    , -0.98079997, ..., -0.1285    ,
        -0.26109999,  0.3678    ],
       [ 0.0421    ,  0.055     , -0.1261    , ..., -0.0253    ,
        -0.0318    ,  0.1617    ],
       [ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ]])

In [38]:

from keras.backend import argmax
class Evaluation(Callback):
    def __init__(self, validation_data=(), interval=1):
        super(Callback, self).__init__()

        self.interval = interval
        self.X_val, self.y_val = validation_data

    def on_epoch_end(self, epoch, logs={}):
        if epoch % self.interval == 0:
            y_pred = self.model.predict(self.X_val, verbose=0)
            label = argmax(self.y_val, 0)
            predict = argmax(y_pred, 0)
            score = f1_score(label, predict)
            print("\n f1 score - epoch: %d - score: %.6f \n" % (epoch + 1, score))


In [29]:
def get_model():
    inp = Input(shape=(maxlen,))
    x = Embedding(nb_words, embed_size, weights=[embedding_matrix])(inp)
    x = SpatialDropout1D(0.2)(x)
    x = Bidirectional(GRU(80, return_sequences=True))(x)
    avg_pool = GlobalAveragePooling1D()(x)
    max_pool = GlobalMaxPooling1D()(x)
    conc = concatenate([avg_pool, max_pool])
    outp = Dense(4, activation="softmax")(conc)
    model = Model(inputs=inp, outputs=outp)
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model


model = get_model()
print(model.summary())

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 100)          0                                            
__________________________________________________________________________________________________
embedding_11 (Embedding)        (None, 100, 300)     1668900     input_11[0][0]                   
__________________________________________________________________________________________________
spatial_dropout1d_11 (SpatialDr (None, 100, 300)     0           embedding_11[0][0]               
__________________________________________________________________________________________________
bidirectional_11 (Bidirectional (None, 100, 160)     182880      spatial_dropout1d_11[0][0]       
__________________________________________________________________________________________________
global_ave

In [42]:
batch_size = 32
epochs = 10

X_tra, X_val, y_tra, y_val = train_test_split(x_train, onehot, train_size=0.95, random_state=233)
# eval_score = Evaluation(validation_data=(X_val, y_val), interval=1)

hist = model.fit(X_tra, y_tra, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val),
                  verbose=2)

y_pred = model.predict(x_test, batch_size=1024)
print(y_pred)
# submission[1] = y_pred
# submission.to_csv('submission.csv', index=False)


Train on 2784 samples, validate on 147 samples
Epoch 1/2
 - 21s - loss: 0.6389 - acc: 0.7636 - val_loss: 0.8550 - val_acc: 0.6531
Epoch 2/2
 - 21s - loss: 0.4878 - acc: 0.8233 - val_loss: 0.8801 - val_acc: 0.6327
[[7.2513208e-02 4.9611282e-02 2.4401283e-01 6.3386267e-01]
 [4.2035847e-04 1.1632243e-03 9.9298793e-01 5.4285135e-03]
 [2.2513341e-02 4.3581426e-02 8.0371791e-01 1.3018726e-01]
 ...
 [7.0712590e-01 2.6260829e-01 2.6763489e-02 3.5023335e-03]
 [1.7316408e-02 1.8928066e-02 9.1504467e-01 4.8710801e-02]
 [9.4991503e-03 2.6569374e-02 9.4981635e-01 1.4115120e-02]]


In [59]:
pred = np.argmax(y_pred,1)
len(pred)

1743

In [48]:
submission

Unnamed: 0,2931,3
0,2932,3
1,2933,3
2,2934,1
3,2935,3
4,2936,2
...,...,...
1737,4669,1
1738,4670,3
1739,4671,3
1740,4672,3


In [60]:
submission = pd.read_csv('data/submit_sample.csv', header=None)
submission.ix[:, 1] = pred
submission.to_csv('submission.csv', index=False)

In [67]:
submission.iloc[:, 1] = pred
pred

array([3, 2, 2, ..., 0, 2, 2], dtype=int64)

In [65]:
submission

Unnamed: 0,0,1
0,2931,3
1,2932,2
2,2933,2
3,2934,0
4,2935,2
...,...,...
1738,4669,0
1739,4670,2
1740,4671,0
1741,4672,2
