## Load the train and test data

In [6]:
import pandas as pd
import numpy
import os
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers
from keras.utils import to_categorical
from keras.models import model_from_json
from sklearn.metrics import f1_score

In [7]:
num_classes = 7

#load train data
X1 = pd.read_csv('dataset/hm_train.csv')

#drop empty rows
X1 = X1.dropna(axis=0, how='any', inplace=False)

train_sentences = X1['cleaned_hm']
labels_train = X1['predicted_category']

#load test data
X2 = pd.read_csv('dataset/hm_test.csv')
X2 = X2.dropna(axis=0, how='any', inplace=False)

test_sentences = X2['cleaned_hm']

X1.predicted_category.str.split(expand=True).stack().value_counts()


affection           20880
achievement         20274
bonding              6561
enjoy_the_moment     6508
leisure              4242
nature               1127
exercise              729
dtype: int64

## Convert train labels to categorical

In [8]:
labels = labels_train.values 
labels[labels == 'affection'] = 0
labels[labels == 'achievement'] = 1
labels[labels == 'bonding'] = 2
labels[labels == 'enjoy_the_moment'] = 3
labels[labels == 'leisure'] = 4
labels[labels == 'nature'] = 5
labels[labels == 'exercise'] = 6

y_train = to_categorical(labels, num_classes = num_classes)

## Vectorize the sentences using pre-trained word vectors

In [9]:
# load the pre-trained word-embedding vectors 
embeddings_index = {}
for i, line in enumerate(open('wiki-news-300d-1M.vec',encoding='utf8')):
    values = line.split()
    embeddings_index[values[0]] = numpy.asarray(values[1:], dtype='float32')

# create a tokenizer 
token = text.Tokenizer()
token.fit_on_texts(train_sentences)
word_index = token.word_index

# convert text to sequence of tokens and pad them to ensure equal length vectors 
train_seq_x = sequence.pad_sequences(token.texts_to_sequences(train_sentences), maxlen=70)
valid_seq_x = sequence.pad_sequences(token.texts_to_sequences(test_sentences), maxlen=70)

# create token-embedding mapping
embedding_matrix = numpy.zeros((len(word_index) + 1, 300))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

## Build and train a simple LSTM model 

In [10]:
# Add an Input Layer
input_layer = layers.Input((70, ))

# Add the word embedding Layer
embedding_layer = layers.Embedding(len(word_index) + 1, 300, weights=[embedding_matrix], trainable=False)(input_layer)
embedding_layer = layers.SpatialDropout1D(0.3)(embedding_layer)

# Add the LSTM Layer
lstm_layer = layers.LSTM(100)(embedding_layer)

# Add the output Layers
output_layer1 = layers.Dense(50, activation="relu")(lstm_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(num_classes, activation="softmax")(output_layer1)

# Compile the model
model = models.Model(inputs=input_layer, outputs=output_layer2)
model.compile(optimizer=optimizers.Adam(), loss='categorical_crossentropy',metrics=['accuracy'])

#Train the model
model.fit(train_seq_x, y_train, epochs=25, batch_size=256)

#Save the trained model

# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25
Saved model to disk


## Test the model and get predictions for test data

In [11]:
#Load the trained model

json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

#get predictions on train data and calculate F1-score with weighted mean
y_pred_train = model.predict(train_seq_x, batch_size=256,verbose=1)

print('F1-score with weighted average for training data is:', f1_score(y_train, y_pred_train.round(), average='weighted'))

# get prediction on test data
y_pred = model.predict(valid_seq_x, batch_size=256,verbose=1)
y_pred = numpy.argmax(y_pred, axis=1)

# get the id column of test data
hmid = X2['hmid']

#convert y_pred to real labels
preds = pd.DataFrame(y_pred, columns=['predicted_category'])

preds[preds.predicted_category == 0] = 'affection'
preds[preds.predicted_category == 1] = 'achievement'
preds[preds.predicted_category == 2] = 'bonding'
preds[preds.predicted_category == 3] = 'enjoy_the_moment'
preds[preds.predicted_category == 4] = 'leisure'
preds[preds.predicted_category == 5] = 'nature'
preds[preds.predicted_category == 6] = 'exercise'

hmid = hmid.values
preds = preds['predicted_category']
preds = preds.values

result = {'hmid':hmid, 'predicted_category': preds}
df = pd.DataFrame(result)
df.to_csv('submission-Nagarjun.csv', index=False)

Loaded model from disk
F1-score with weighted average for training data is: 0.9332379242744984
