In [27]:
import keras
import pandas as pd
import os
import sys
import numpy as np
import re
import csv
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout
from keras.layers import Conv1D, MaxPooling1D, Embedding, LSTM, Conv2D, Bidirectional
from keras.models import Model, Sequential
from sklearn.model_selection import train_test_split

In [28]:
# Everything except ASCII letters, digits, whitespace, '?' and '!' is stripped.
# Raw string: avoids the invalid-escape warnings that '\s', '\?' and '\!'
# trigger inside a normal string literal.
_DISALLOWED_CHARS = re.compile(r'[^a-zA-Z0-9\s?!]+')

# Filler words dropped from every headline. 'is' and 'are' are deliberately
# NOT listed: their removal was disabled in the earlier version of this cell.
_DROPPED_WORDS = frozenset(('the', 'and', 'of'))


def word_split(mystring):
    """Turn an iterable of strings into a list of cleaned token lists.

    For each input string the function:
      1. lower-cases it;
      2. deletes every character that is not an ASCII letter, a digit,
         whitespace, '?' or '!';
      3. puts a space before each '!' and '?' so they become separate tokens;
      4. splits on whitespace and drops the words 'the', 'and' and 'of'.

    Args:
        mystring: iterable of ``str`` (e.g. a pandas Series of headlines).
            Every element must support ``.lower()``; non-string entries
            raise ``AttributeError``.

    Returns:
        list[list[str]]: one token list per input element, in input order.
    """
    str_split = []
    for tmp in mystring:
        tmp = _DISALLOWED_CHARS.sub('', tmp.lower())
        tmp = tmp.replace('!', ' !').replace('?', ' ?')
        # split() with no argument splits on any whitespace run and never
        # yields empty strings, so tabs/newlines kept by the regex can no
        # longer end up embedded inside a token.
        str_split.append([tok for tok in tmp.split() if tok not in _DROPPED_WORDS])
    return str_split

In [29]:
# --- Paths -----------------------------------------------------------------
# Directory that is joined with the GloVe file name when the vectors are read.
GLOVE_DIR = './'

# --- Text encoding ---------------------------------------------------------
# Passed to Tokenizer(num_words=...) and used as the index cut-off when the
# embedding matrix is filled.
MAX_NB_WORDS = 10000
# Passed as maxlen to pad_sequences for both train and test inputs.
MAX_SEQUENCE_LENGTH = 50
# Number of columns in the embedding matrix.
EMBEDDING_DIM = 100

# --- Model / training ------------------------------------------------------
# NOTE(review): neither value is referenced by the cells shown in this
# notebook; presumably used when the saved model was trained - confirm.
NUM_LSTM_UNITS = 512
VALIDATION_SPLIT = 0.2

In [30]:
# Build a word -> vector lookup from the GloVe text file.
# Each line is parsed as: first whitespace-separated field = the word,
# remaining fields = its float32 coefficients.
embeddings_index = {}
# The context manager closes the file even if parsing a line raises.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf8') as f:
    for line in f:
        values = line.split()
        if not values:
            # A blank line has no word field; skip it rather than fail on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [31]:
# Read the training CSV, pull out the headline text and its label column,
# then tokenize the headlines with word_split.
all_data = pd.read_csv("train.csv")
data, label = all_data['Headline'], all_data['Label']
my_split = word_split(data)

In [32]:
# Fit the vocabulary on the tokenized training headlines.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(my_split)
word_index = tokenizer.word_index

# Encode the same headlines as integer sequences and bring them all to
# MAX_SEQUENCE_LENGTH.
sequences = tokenizer.texts_to_sequences(my_split)
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [33]:
# Matrix height: the vocabulary size capped at MAX_NB_WORDS, plus one extra row
# (presumably so the largest index still fits alongside index 0 - confirm).
num_words = 1 + min(MAX_NB_WORDS, len(word_index))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

# Copy the GloVe vector of every in-range token into its row.
for token, idx in word_index.items():
    if idx < MAX_NB_WORDS:
        glove_vector = embeddings_index.get(token)
        if glove_vector is not None:
            embedding_matrix[idx] = glove_vector
        # Tokens with no GloVe entry keep their all-zero row.

In [34]:
# Embedding layer initialised from the GloVe matrix and excluded from training.
# NOTE(review): no later cell shown here uses this layer; the model is loaded
# from disk instead.
embedding_layer = Embedding(
    input_dim=num_words,
    output_dim=EMBEDDING_DIM,
    weights=[embedding_matrix],
    input_length=MAX_SEQUENCE_LENGTH,
    trainable=False,
)

In [35]:
# Load an already-saved model from the './mymodel_0.445/' directory.
# No model is built or trained in the cells shown here; every prediction
# below comes from this loaded model.
mm = keras.models.load_model('./mymodel_0.445/')

In [36]:
# Encode the test headlines with the tokenizer fitted on the training data
# and score them with the loaded model.
test_data = pd.read_csv("test.csv")
data, label = test_data['Headline'], test_data['Label']
test_split = word_split(data)
sequences = tokenizer.texts_to_sequences(test_split)
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
y_pre = mm.predict(x)
print(y_pre[0])

# Prepend a 1-based ID column so each row reads (ID, prediction).
n_rows = y_pre.shape[0]
b = np.arange(1, n_rows + 1).reshape(n_rows, 1).astype('int32')
y_pre = np.concatenate((b, y_pre), axis=1).astype(object)
# Joining the IDs with the float predictions turns them into floats;
# convert each ID back to a plain int.
for row in y_pre:
    row[0] = int(row[0])

[2.3372777]


In [37]:
# Write the (ID, prediction) rows to output.csv under an 'ID,Label' header.
header = ['ID', 'Label']
with open('output.csv', 'w', newline='') as out_file:
    csv_out = csv.writer(out_file)
    csv_out.writerow(header)
    for record in y_pre:
        csv_out.writerow(record)

In [38]:
# Re-save the loaded model under the file name "my_model.h5" in the working directory.
mm.save("my_model.h5")

In [40]:
# Round-trip check: reload the model written to my_model.h5 and score the
# test set with the reloaded copy.
mn = keras.models.load_model('my_model.h5')
test_data = pd.read_csv("test.csv")
data = test_data['Headline']
label = test_data['Label']
test_split = word_split(data)
sequences = tokenizer.texts_to_sequences(test_split)
x = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# Predict with the reloaded model `mn`; predicting with `mm` here would
# never exercise the saved file.
y_pre = mn.predict(x)
print(y_pre[0])
# Prepend a 1-based ID column so each row reads (ID, prediction).
b = np.arange(1, y_pre.shape[0]+1).reshape(y_pre.shape[0], 1).astype('int32')
y_pre = np.append(b, y_pre, axis=1).astype(object)
# Joining the IDs with the float predictions turns them into floats;
# convert each ID back to a plain int.
for i in range(len(y_pre)):
    y_pre[i][0] = int(y_pre[i][0])

[2.3372777]


In [21]:
# NOTE(review): `d` is not assigned in any cell shown in this notebook, and this
# cell's execution count (21) is lower than the cells above it. Unless `d` is
# defined elsewhere, this line fails with NameError on a fresh kernel - confirm
# where `d` comes from (the recorded output has 'ID' and 'Label' columns).
print(d)

      ID     Label
0      1  2.337278
1      2  2.964703
2      3  2.684707
3      4  2.922065
4      5  2.455397
..   ...       ...
222  223  3.060324
223  224  2.522440
224  225  2.865470
225  226  2.824675
226  227  2.668736

[227 rows x 2 columns]


In [14]:
# NOTE(review): prints the whole y_pre array. The recorded output (execution
# count 14) shows the same value on every row, unlike the predictions printed
# by the cells above, so it looks like a leftover from an earlier run - confirm
# before relying on it.
print((y_pre))

[[1 2.4667890071868896]
 [2 2.4667890071868896]
 [3 2.4667890071868896]
 [4 2.4667890071868896]
 [5 2.4667890071868896]
 [6 2.4667890071868896]
 [7 2.4667890071868896]
 [8 2.4667890071868896]
 [9 2.4667890071868896]
 [10 2.4667890071868896]
 [11 2.4667890071868896]
 [12 2.4667890071868896]
 [13 2.4667890071868896]
 [14 2.4667890071868896]
 [15 2.4667890071868896]
 [16 2.4667890071868896]
 [17 2.4667890071868896]
 [18 2.4667890071868896]
 [19 2.4667890071868896]
 [20 2.4667890071868896]
 [21 2.4667890071868896]
 [22 2.4667890071868896]
 [23 2.4667890071868896]
 [24 2.4667890071868896]
 [25 2.4667890071868896]
 [26 2.4667890071868896]
 [27 2.4667890071868896]
 [28 2.4667890071868896]
 [29 2.4667890071868896]
 [30 2.4667890071868896]
 [31 2.4667890071868896]
 [32 2.4667890071868896]
 [33 2.4667890071868896]
 [34 2.4667890071868896]
 [35 2.4667890071868896]
 [36 2.4667890071868896]
 [37 2.4667890071868896]
 [38 2.4667890071868896]
 [39 2.4667890071868896]
 [40 2.4667890071868896]
 [41 2.46