In [None]:
import numpy as np, pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, Embedding, Dropout, Activation, GRU, Bidirectional, GlobalMaxPool1D, LSTM, Softmax, GlobalAveragePooling1D
from keras.models import Model, Sequential
from keras.utils import to_categorical
from keras.optimizers import Adam, SGD
import time

In [None]:
# Path of the pre-trained word-vector file that is parsed further down.
embedding_file = 'glove.6B.200d.txt'

# Both data sets are semicolon-separated, UTF-8 encoded CSV files.
csv_options = {'sep': ';', 'encoding': 'utf-8'}
train = pd.read_csv('train_clean.csv', **csv_options)
test = pd.read_csv('test_clean.csv', **csv_options)

In [None]:
# Remember the test rows whose text is the literal placeholder 'Not Available';
# their predictions are overwritten before the submission is written.
is_not_available = test['cleaned_string'].eq('Not Available')
test_nans = test[is_not_available]

In [None]:
# Hyper-parameters shared by the tokenizer, padding and embedding cells.
embeddings_dimensions = 200  # width of one embedding vector (columns of the embedding matrix)
embeddings_words = 10_000    # vocabulary cap handed to Tokenizer(num_words=...)
comment_length = 70          # every token sequence is padded/truncated to this length

In [None]:
# Raw text for both splits; missing strings are replaced by the "_na_" token
# so the tokenizer always receives a string.
train_data = train["cleaned_string"].fillna("_na_").values
test_data = test["cleaned_string"].fillna("_na_").values

# One-hot encode the training labels.
# NOTE(review): assumes Category holds integer class ids — confirm against train_clean.csv.
Y_train = to_categorical(train["Category"])

In [None]:
# Fit the vocabulary on the training text only, then reuse it for the test text.
tokenizer = Tokenizer(num_words=embeddings_words)
tokenizer.fit_on_texts(train_data.tolist())

# Training split: words -> integer ids -> fixed-length sequences.
train_tokenized = tokenizer.texts_to_sequences(train_data)
X_train = pad_sequences(train_tokenized, maxlen=comment_length)

# Test split goes through exactly the same transformation.
test_tokenized = tokenizer.texts_to_sequences(test_data)
X_test = pad_sequences(test_tokenized, maxlen=comment_length)

In [None]:
def get_coefs(word, *arr):
    """Turn one split embedding line into a ``(word, vector)`` pair.

    Args:
        word: The first field of the line, returned unchanged.
        *arr: The remaining fields (the vector components).

    Returns:
        A 2-tuple ``(word, vector)`` where ``vector`` is a float32 NumPy array
        built from ``arr``.
    """
    vector = np.asarray(arr, dtype='float32')
    return word, vector
# Build the word -> float32 vector lookup from the embedding file.
# Each line is split on whitespace; get_coefs treats the first field as the
# word and the remaining fields as the vector components.
# The context manager guarantees the file handle is closed once parsing is
# finished (the bare open() call used previously was never closed).
with open(embedding_file, encoding="utf8") as embeddings_fh:
    embeddings_index = dict(get_coefs(*line.strip().split()) for line in embeddings_fh)

In [None]:
# Global mean / std of all pre-trained vectors, used to initialise the rows of
# words that have no pre-trained embedding.
# np.stack needs a real sequence, so the dict view is materialised as a list
# first (recent NumPy versions reject non-sequence iterables here).
all_embeddings = np.stack(list(embeddings_index.values()))
embeddings_mean, embeddings_std = all_embeddings.mean(), all_embeddings.std()

word_index = tokenizer.word_index
# Tokenizer word indices start at 1 (0 is never assigned to a word), so the
# matrix needs len(word_index) + 1 rows to hold the highest index when the
# vocabulary is smaller than the embeddings_words cap.
how_many_words = min(embeddings_words, len(word_index) + 1)
embedding_matrix = np.random.normal(embeddings_mean, embeddings_std, (how_many_words, embeddings_dimensions))
for word, i in word_index.items():
    # Skip words whose index falls outside the matrix.
    if i >= how_many_words: continue
    embedding_vector = embeddings_index.get(word)
    # Words missing from the pre-trained file keep their random initialisation.
    if embedding_vector is not None: embedding_matrix[i] = embedding_vector

In [None]:
# Bidirectional-GRU text classifier on top of the pre-trained embedding matrix.
input_layer = Input(shape=(comment_length,))
# Take the vocabulary size from the matrix itself so the layer's input_dim
# always agrees with the shape of the weights being loaded.
emb_layer = Embedding(embedding_matrix.shape[0], embeddings_dimensions, weights=[embedding_matrix])(input_layer)
bidir_layer = Bidirectional(GRU(64, return_sequences=True, dropout=0.25, recurrent_dropout=0.05))(emb_layer)
# Collapse the time axis by keeping the maximum activation of every feature.
max_pool = GlobalMaxPool1D()(bidir_layer)
# The targets are one-hot vectors (to_categorical) and predictions are reduced
# with argmax, i.e. this is single-label 3-class classification. That calls
# for a softmax head with categorical cross-entropy; sigmoid +
# binary_crossentropy would train three independent binary outputs and make
# the reported 'accuracy' misleading.
dense_2 = Dense(3, activation="softmax")(max_pool)
model = Model(inputs=input_layer, outputs=dense_2)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the full training set; validation_split=0.0 means no rows are held
# out for validation.
training = model.fit(
    X_train,
    Y_train,
    batch_size=32,
    epochs=2,
    validation_split=0.0,
)

In [None]:
# Predict on the padded test sequences; the result is reduced to class ids
# with argmax further down.
Y_test = model.predict(
    [X_test],
    batch_size=32,
    verbose=1,
)

In [None]:
# Sanity check: show the dimensions of the prediction array.
Y_test.shape

In [None]:
# For every test row keep the index of the highest-scoring output unit.
Y_classes = Y_test.argmax(axis=1)
# Show the first 100 predicted class ids.
Y_classes[:100]

In [None]:
# Submission frame: the test ids plus the predicted class id of each row.
submission = test['Id'].to_frame().assign(Category=Y_classes)

In [None]:
# Preview the first 10 submission rows (Category holds the argmax class ids here).
submission.head(10)

In [None]:
# Class id -> label string.
# NOTE(review): the id order must match how Category was encoded in the
# training file — confirm.
mapping = {
    0: 'neutral',
    1: 'positive',
    2: 'negative',
}
# Swap the numeric ids in the Category column for their label strings.
submission = submission.assign(Category=submission['Category'].replace(mapping))

In [None]:
# Preview the first 10 submission rows again to inspect the Category column.
submission.head(10)

In [None]:
# Index labels of the test rows whose cleaned_string is 'Not Available'.
test_nans.index

In [None]:
# Rows whose text was 'Not Available' get a fixed 'positive' label instead of
# the model's prediction, set in one vectorised assignment.
# NOTE(review): the reason for choosing 'positive' is not shown here — confirm.
submission.loc[test_nans.index, 'Category'] = 'positive'

In [None]:
# Preview the first 10 rows of the submission frame before it is written out.
submission.head(10)

In [None]:
# Timestamped output name so repeated runs do not overwrite each other
# (local time at second resolution).
timestamp = time.strftime("%Y%m%d-%H%M%S")
filename = f'submission{timestamp}.csv'

In [None]:
# Write the submission as a comma-separated file without the DataFrame index.
submission.to_csv(
    filename,
    sep=',',
    index=False,
)

In [None]:
# Show the name of the file the submission was written to.
filename