In [1]:
import numpy as np
import pandas as pd
from sklearn import preprocessing
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import KeyedVectors

from keras.layers import Dense, Dropout
from keras.layers.embeddings import Embedding
from keras.layers import LSTM
from keras.layers import Input, concatenate, Activation
from keras.models import Model



In [2]:
# One seed shared by every library that draws random numbers in this notebook.
SEED = 123
tf.random.set_seed(SEED)  # TensorFlow global seed
np.random.seed(SEED)      # NumPy legacy global RNG

### Load Data and Clean

In [3]:
# Directory holding the CSV splits, and the candidate file names per split.
BASE = './CIKM_Data/'
fins_train, fins_test = ['train.csv'], ['test10.csv']
# Index into the two file-name lists above; selects which pair is loaded.
track = 0

In [4]:
# Read the training split selected by `track` and preview its first rows.
df_train = pd.read_csv(f'{BASE}{fins_train[track]}')
df_train.head()

Unnamed: 0,label,sent
0,NOB,candidates will be notified later by e - mail ...
1,NOB,successful sales track record .
2,GENDER,she moved to the united states to train with o...
3,NOB,areas of focus analysts may be placed in the f...
4,NOB,.


In [5]:
# Encode the string labels as integer class ids and normalise the column
# layout to (num_label, label, text).
# The cell is written to be re-runnable: `rename` silently skips a 'sent'
# column that has already been renamed, and the final selection only needs
# columns that exist after the rename.
le = preprocessing.LabelEncoder()
df_train = (
    df_train
    .assign(num_label=le.fit_transform(df_train['label']))
    .rename(columns={'sent': 'text'})
    .loc[:, ['num_label', 'label', 'text']]
)
df_train.head(5)

Unnamed: 0,num_label,label,text
0,3,NOB,candidates will be notified later by e - mail ...
1,3,NOB,successful sales track record .
2,1,GENDER,she moved to the united states to train with o...
3,3,NOB,areas of focus analysts may be placed in the f...
4,3,NOB,.


In [6]:
# le.inverse_transform(num_labels)

In [7]:
#df_train.groupby('label').count()

In [8]:
# Currently disabled: the data is already preprocessed, so this non-ASCII filter is the only cleaning step we would apply.
# def cleanNonAscii(text):
#     '''
#     Remove Non ASCII characters from the dataset.
#     Arguments:
#         text: str
#     returns: 
#         text: str
#     '''
#     return ''.join(i for i in text if ord(i) < 128)

In [9]:
# df_train['text'] = df_train['text'].apply(cleanNonAscii)
# df_train.head(5)

In [10]:
# Raw training sentences and their integer class ids as NumPy arrays.
X_train = df_train['text'].values
y_train = df_train['num_label'].values

In [11]:
# Read the test split selected by `track` and preview its first rows.
df_test = pd.read_csv(f'{BASE}{fins_test[track]}')
df_test.head()

Unnamed: 0,label,sent
0,NOB,what you bring three years relevant experience...
1,NOB,<number> to <number> years of working experien...
2,GENDER,his lifetime work was the research and publica...
3,NOB,â company name : apollo sugar clinics ltd loca...
4,NOB,most klan sites contain a membership applicati...


In [12]:
# Encode the test labels with the encoder fitted on the training labels and
# normalise the column layout to (num_label, label, text).
# The cell is written to be re-runnable: `rename` silently skips a 'sent'
# column that has already been renamed, and the final selection only needs
# columns that exist after the rename.
df_test = (
    df_test
    .assign(num_label=le.transform(df_test['label']))
    .rename(columns={'sent': 'text'})
    .loc[:, ['num_label', 'label', 'text']]
)
df_test.head(5)

Unnamed: 0,num_label,label,text
0,3,NOB,what you bring three years relevant experience...
1,3,NOB,<number> to <number> years of working experien...
2,1,GENDER,his lifetime work was the research and publica...
3,3,NOB,â company name : apollo sugar clinics ltd loca...
4,3,NOB,most klan sites contain a membership applicati...


In [13]:
# df_test['text'] = df_test['text'].apply(cleanNonAscii)
# df_test.head(5)

In [14]:
# df_test.groupby('label').count()

In [15]:
# Raw test sentences and their integer class ids as NumPy arrays.
X_test = df_test['text'].values
y_test = df_test['num_label'].values

In [16]:
# Distinct class ids present in each split (train first, then test).
tuple(set(labels) for labels in (y_train, y_test))

({0, 1, 2, 3, 4}, {0, 1, 2, 3, 4})

### Transforming the data into a model-ready format

In [17]:
# Vocabulary cap handed to the tokenizer; also sizes the embedding layers.
num_words = 100000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(X_train)  # vocabulary comes from the training text only

# Turn every sentence into a list of integer word ids.
xtrain = tokenizer.texts_to_sequences(X_train)
xtest = tokenizer.texts_to_sequences(X_test)

# Pad both splits to the length of the longest training sequence.
maxlen = max(len(sequence) for sequence in xtrain)
xtrain = pad_sequences(xtrain, maxlen=maxlen)
xtest = pad_sequences(xtest, maxlen=maxlen)

In [18]:
from keras.utils import to_categorical
# One-hot encode the target column.
# num_classes is pinned to the number of classes known to the label encoder:
# left to its default, to_categorical sizes the output from the largest id it
# sees, so a split lacking the highest class would get a narrower matrix.
ytrain = to_categorical(y_train, num_classes=len(le.classes_))
ytest = to_categorical(y_test, num_classes=len(le.classes_))
ytrain[0]

array([0., 0., 0., 1., 0.], dtype=float32)

#### Load Word Embeddings

Description of the word vectors:

wiki-news-300d-1M.vec.zip: 1 million word vectors trained on Wikipedia 2017, UMBC webbase corpus and statmt.org news dataset (16B tokens).

Source Link: https://fasttext.cc/docs/en/english-vectors.html

In [19]:
import io

def load_vectors(fname='./word_emb_map/wiki-news-300d-1M.vec', vocab=None):
    '''
    Read a fastText ``.vec`` text file into a word -> vector dictionary.

    The first line of the file holds the number of words and the vector size;
    every following line holds a word followed by its space-separated values.

    Arguments:
        fname: str, path to the ``.vec`` file.
        vocab: optional container of words (anything supporting ``in``).
            When given, only vectors for these words are kept, which avoids
            holding the whole table in memory. ``None`` keeps every word.
    returns:
        data: dict mapping each word to a list of floats.
    '''
    data = {}
    # The context manager guarantees the file is closed, even if parsing fails.
    with open(fname, 'r', encoding='utf-8', newline='\n', errors='ignore') as fin:
        # Header values are not needed below; they are still parsed so that a
        # malformed header raises instead of being read as a word vector.
        _n_words, _dim = map(int, fin.readline().split())
        for line in fin:
            stripped = line.rstrip()
            if not stripped:
                # Blank line (e.g. trailing newline): nothing to parse.
                continue
            tokens = stripped.split(' ')  # Each value is space separated
            word = tokens[0]
            if vocab is not None and word not in vocab:
                continue
            data[word] = [float(value) for value in tokens[1:]]
    return data

# Parse the pretrained word vectors from the default path into a
# word -> vector dictionary.
w2v = load_vectors()

In [20]:
# Row i holds the pretrained 300-d vector of the word whose tokenizer index
# is i; rows for words without a pretrained vector stay all-zero.
# NOTE: this matrix is only consumed by the commented-out pretrained
# Embedding layer in create_lstm_model.
embedding_matrix = np.zeros((num_words, 300))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        # Index falls outside the matrix (beyond the vocabulary cap).
        continue
    # Single lookup; None means the word has no pretrained vector.
    embedding_vector = w2v.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [21]:
def create_lstm_model(embedding_dim=200, lstm_units=100, dropout_rate=0.5, num_classes=5):
    '''
    Build and compile the single-layer LSTM sentence classifier.

    The input length (``maxlen``) and vocabulary size (``num_words``) are read
    from the notebook's module-level variables. Calling the function without
    arguments builds the same architecture as before.

    Arguments:
        embedding_dim: int, size of the word embeddings, which are learned
            from scratch (no pretrained weights are passed to the layer).
        lstm_units: int, number of units in the LSTM layer.
        dropout_rate: float, dropout applied after the embedding layer and
            again after the LSTM.
        num_classes: int, width of the softmax output layer.
    returns:
        model: compiled Keras Model (categorical cross-entropy loss, Adam
            optimizer, accuracy metric).
    '''
    tweet_input = Input(shape=(maxlen,), dtype='int32')
    # Pretrained alternative (kept for reference, currently disabled):
    #tweet_encoder = Embedding(num_words, 300, weights=[embedding_matrix], input_length=maxlen, trainable=True)(tweet_input)
    tweet_encoder = Embedding(num_words, embedding_dim, input_length=maxlen)(tweet_input)
    tweet_encoder = Dropout(dropout_rate)(tweet_encoder)
    merged = LSTM(lstm_units)(tweet_encoder)
    merged = Dropout(dropout_rate)(merged)
    merged = Dense(num_classes)(merged)
    output = Activation('softmax')(merged)
    model = Model(inputs=[tweet_input], outputs=[output])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    #model.summary()
    return model

# Build and compile the classifier defined above.
lstm_model = create_lstm_model()

In [22]:
# Train on the padded training sequences and one-hot labels for 3 epochs.
# No validation data is passed to fit().
lstm_model.fit(xtrain, ytrain, epochs=3, batch_size=32, verbose=1)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fa8ec1ff898>

In [23]:
# Softmax outputs of the model for every padded test sequence.
predicted = lstm_model.predict(xtest)

In [24]:
# Predicted class id per test sentence: index of the largest score in each row.
pred = list(np.argmax(predicted, axis=1))

In [25]:
# Ground-truth integer class ids of the test split.
actual = y_test

### Performance

In [26]:
# Map the integer class ids back to their string labels for reporting.
# Both names are derived from the untouched sources (y_test / predicted)
# rather than from themselves, so re-running this cell does not try to
# inverse-transform labels that are already strings.
actual = le.inverse_transform(y_test)
pred = le.inverse_transform(np.argmax(predicted, axis=1))

In [27]:
from sklearn.metrics import classification_report
# Per-class precision / recall / F1 on the test split. The report is a
# plain string, so print() is used to keep its column layout.
print(classification_report(actual, pred))

              precision    recall  f1-score   support

         AGE       0.55      0.11      0.18        55
      GENDER       0.81      0.94      0.87      2069
        MNHA       0.59      0.38      0.46       155
         NOB       0.91      0.83      0.86      3030
        RACE       0.62      0.64      0.63       434

    accuracy                           0.84      5743
   macro avg       0.69      0.58      0.60      5743
weighted avg       0.84      0.84      0.83      5743



In [28]:
from sklearn.metrics import cohen_kappa_score

# Cohen's kappa between the predicted and the true string labels.
cks = cohen_kappa_score(pred, actual)
print('CKS', cks)

CKS 0.7207721788982584


In [29]:
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelBinarizer

def multiclass_roc_auc_score(y_test, y_pred, average="macro"):
    '''
    ROC AUC for multi-class labels, computed on binarized label vectors.

    Both inputs are label vectors (not probabilities): each is converted to
    an indicator matrix with a binarizer fitted on ``y_test`` before being
    passed to ``roc_auc_score``.

    Arguments:
        y_test: array-like of true labels.
        y_pred: array-like of predicted labels.
        average: str, averaging mode forwarded to ``roc_auc_score``.
    returns:
        the score returned by ``roc_auc_score``.
    '''
    binarizer = LabelBinarizer().fit(y_test)
    return roc_auc_score(
        binarizer.transform(y_test),
        binarizer.transform(y_pred),
        average=average,
    )

# Macro-averaged AUC. Note that `pred` holds hard label predictions here,
# not the model's probabilities.
auc = multiclass_roc_auc_score(actual, pred)
print('Average AUC', auc)

Average AUC 0.7639650869394052


In [30]:
from sklearn.metrics import confusion_matrix
# Rows are true labels, columns are predicted labels (scikit-learn
# convention for confusion_matrix(y_true, y_pred)).
confusion_matrix(actual, pred)

array([[   6,   26,    2,   20,    1],
       [   3, 1952,    7,   64,   43],
       [   0,   38,   59,   50,    8],
       [   2,  375,   31, 2504,  118],
       [   0,   32,    1,  122,  279]])