In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow import keras
from tensorflow.keras.optimizers import Adam, RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split



In [None]:
# Load the training corpus and discard every row that contains a missing value.
df = pd.read_csv('corpus.csv').dropna()

# Plain-list copy of the message column; in this notebook only the disabled
# CountVectorizer cell refers to it.
corpus = list(df['messages'])

In [None]:
# Disabled alternative featurisation: a bag-of-words matrix built with CountVectorizer
# (capped at 20000 features) from `corpus`, with labels taken from df['target'].
# Kept for reference only; none of these lines execute.
# cv = CountVectorizer(max_features = 20000)
# X = cv.fit_transform(corpus).toarray()
# y = df['target'].values

In [None]:
# Model input is the raw message text; the label is the `target` column.
X = df['messages']
y = df['target']

In [None]:
# Hold out 15% of the rows for testing, stratified on `y` so both splits keep the
# same class ratio. The fixed random_state makes the split reproducible across
# kernel restarts; without it every run trains and evaluates on different rows.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, y, test_size=0.15, stratify=y, random_state=42
)

In [None]:
# Vocabulary cap passed to the tokenizer, and the fixed length every sequence is
# padded/truncated to.
max_words = 20000
max_len = 15000
# NOTE(review): the saved model summary in this notebook shows an input width of
# 150 -- confirm that 15000 is the intended sequence length.

# The tokenizer vocabulary is fitted on the training split (X_train).
tok = Tokenizer(num_words=max_words)
tok.fit_on_texts(X_train)

# Integer-encode the training texts, then pad them into one 2-D matrix.
sequences = tok.texts_to_sequences(X_train)
sequences_matrix = sequence.pad_sequences(sequences, maxlen=max_len)

print('sequences[2] length: ', len(sequences[2]))
print('sequences length: ', len(sequences))

sequences[2] length:  29
sequences length:  43752


In [None]:

def RNN(vocab_size=None, sequence_length=None):
    """Build the (uncompiled) LSTM binary classifier used in this notebook.

    Layer stack: Embedding(128) -> LSTM(64) -> three Dense + 'leaky_relu' +
    Dropout stages (FC1, FC2, FC3) -> Dense(1) with a sigmoid activation.

    Parameters
    ----------
    vocab_size : int, optional
        Input dimension of the embedding layer. When omitted, the notebook-level
        ``max_words`` is used, so a bare ``RNN()`` call behaves as before.
    sequence_length : int, optional
        Length of each input sequence. When omitted, the notebook-level
        ``max_len`` is used.

    Returns
    -------
    Model
        The assembled model; the caller is responsible for compiling it.
    """
    # Fall back to the notebook globals so existing `RNN()` calls keep working.
    if vocab_size is None:
        vocab_size = max_words
    if sequence_length is None:
        sequence_length = max_len

    inputs = Input(name='inputs', shape=[sequence_length])
    layer = Embedding(vocab_size, 128, input_length=sequence_length)(inputs)
    layer = LSTM(64)(layer)

    # Three fully connected stages: (units, layer name, dropout rate).
    dense_stages = (
        (128, 'FC1', 0.4),
        (64, 'FC2', 0.6),
        (64, 'FC3', 0.7),
    )
    for units, dense_name, drop_rate in dense_stages:
        layer = Dense(units, name=dense_name)(layer)
        layer = Activation('leaky_relu')(layer)
        layer = Dropout(drop_rate)(layer)

    # Single sigmoid unit for the binary output.
    layer = Dense(1, name='out_layer')(layer)
    layer = Activation('sigmoid')(layer)
    return Model(inputs=inputs, outputs=layer)

In [None]:
# Build the network, print its layer table, then attach loss/optimizer/metric.
model = RNN()
model.summary()
model.compile(
    optimizer=Adam(),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

Model: "model_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 150)]             0         
                                                                 
 embedding_8 (Embedding)     (None, 150, 128)          128000    
                                                                 
 lstm_12 (LSTM)              (None, 64)                49408     
                                                                 
 FC1 (Dense)                 (None, 128)               8320      
                                                                 
 activation_12 (Activation)  (None, 128)               0         
                                                                 
 dropout_8 (Dropout)         (None, 128)               0         
                                                                 
 FC2 (Dense)                 (None, 64)                8256

In [None]:
# Earlier configuration, kept for reference (batch size 128, up to 10 epochs,
# early stopping on val_loss):
# model.fit(sequences_matrix,Y_train,batch_size=128,epochs=10,
#           validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',min_delta=0.0001)])

# Active configuration: 3 epochs, batch size 64, 20% of the training rows used
# for validation.
model.fit(
    x=sequences_matrix,
    y=Y_train,
    batch_size=64,
    epochs=3,
    validation_split=0.2,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f61b454ac50>

In [None]:
# Encode the hold-out texts with the already-fitted tokenizer and pad them to the
# same length as the training matrix.
test_sequences = tok.texts_to_sequences(X_test)
test_sequences_matrix = sequence.pad_sequences(
    test_sequences,
    maxlen=max_len,
)

In [None]:
# Evaluate on the hold-out split; the next cell reads accr[0] as the loss and
# accr[1] as the accuracy.
accr = model.evaluate(x=test_sequences_matrix, y=Y_test)



In [None]:
# Report the first two evaluation results (loss, accuracy) to three decimals.
print(
    'Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(*accr[:2])
)

Test set
  Loss: 0.293
  Accuracy: 0.824


In [None]:
# Load the file that predictions are generated for in the cells below.
test = pd.read_csv(filepath_or_buffer='test.csv')

In [None]:
# Keep the identifiers aside before narrowing the frame.
# NOTE: `id` shadows the Python builtin; the name is kept because the submission
# cell further down reads it.
id = test['Id']

# .copy() yields an independent frame, so adding a column below does not write
# into a slice of the original (avoids pandas' SettingWithCopyWarning).
test = test[['Subject', 'Body']].copy()

# Concatenating a string with NaN gives NaN in pandas, so a row with a missing
# Subject *or* Body used to lose all of its text (and became the literal "nan"
# after the later astype(str)). Filling missing fields with '' keeps whatever
# text is present.
test['messages'] = test['Subject'].fillna('') + ' ' + test['Body'].fillna('')

In [None]:
# Encode the submission texts with the fitted tokenizer and pad to max_len.
# Note: this rebinds test_sequences / test_sequences_matrix, which previously
# held the hold-out split.
submission_texts = test['messages'].astype(str)
test_sequences = tok.texts_to_sequences(submission_texts)
test_sequences_matrix = sequence.pad_sequences(test_sequences, maxlen=max_len)

In [None]:
# Model outputs (sigmoid activations) for the submission rows.
test_prob = model.predict(x=test_sequences_matrix)

In [None]:
# Pair each Id with its flattened model output, then index the frame by Id.
submission = pd.DataFrame(
    {
        'Id': id,
        'Flag': test_prob.flatten(),
    }
).set_index('Id')


In [None]:
# Write the submission file; Id is the index, so it is written as the first column.
submission.to_csv(path_or_buf='submission.csv')