In [8]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
import numpy as np
import re
from keras import Input
from keras.layers import Embedding, LSTM, concatenate, Dense
from keras.models import Model

In [57]:
def word_split(mystring):
    """Tokenize every string in ``mystring`` into a list of words.

    All characters other than ASCII letters, digits and whitespace are
    stripped, then the remaining text is split on runs of whitespace.

    Args:
        mystring: Iterable of strings (for example a NumPy array of
            headlines).

    Returns:
        A list with one entry per input string, in the same order; each
        entry is the list of word tokens for that string (possibly empty).
    """
    str_split = []
    for tmp in mystring:
        # Raw string: '\s' inside a normal literal is an invalid escape
        # sequence and triggers a warning on recent Python versions.
        tmp = re.sub(r'[^a-zA-Z0-9\s]+', '', tmp)
        # split() with no argument splits on any run of whitespace (spaces,
        # tabs, newlines) and never yields empty strings, so tokens such as
        # 'a\tb' are separated correctly and no clean-up loop is needed.
        str_split.append(tmp.split())
    return str_split

In [58]:
# Read the training CSV and extract the headline text and label columns as arrays.
all_data = pd.read_csv("train.csv")
data, label = (all_data[column].to_numpy() for column in ('Headline', 'Label'))

In [60]:
# Tokenize every training headline into a list of cleaned words.
my_split = word_split(data)

In [61]:
# Vocabulary size limit: passed to the tokenizer as num_words and reused
# later as the input dimension of the embedding layer.
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

In [62]:
# Fit the tokenizer on the tokenized training headlines only.
tokenizer.fit_on_texts(my_split)

In [77]:
# Encode each tokenized headline as a sequence of integer ids, then bring
# every sequence to the same fixed length so they stack into one 2-D array.
x = tokenizer.texts_to_sequences(my_split)
# Longest encoded headline; not referenced again in the visible cells.
max_seq_len = max(map(len, x))
MAX_SEQUENCE_LENGTH = 25
x = keras.preprocessing.sequence.pad_sequences(
    x,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [87]:
# Hold out 10% of the encoded headlines; random_state fixes the shuffle so the split is repeatable.
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.1, random_state = 42)

In [88]:
# Width of each word-embedding vector.
NUM_EMBEDDING_DIM = 128

# Dimensionality of the LSTM output vector
NUM_LSTM_UNITS = 128
# Input: one padded sequence of MAX_SEQUENCE_LENGTH integer token ids per sample.
top_input = Input(shape=(MAX_SEQUENCE_LENGTH, ), dtype='int32')
# Embedding table sized to the same MAX_NUM_WORDS limit given to the tokenizer.
embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded = embedding_layer(top_input)
# The LSTM reduces the embedded sequence to a single NUM_LSTM_UNITS-wide vector.
shared_lstm = LSTM(NUM_LSTM_UNITS)
top_output = shared_lstm(top_embedded)
# Single linear output unit: the model emits one unbounded real value per headline.
dense =  Dense(units=1, activation='linear')
predictions = dense(top_output)
model = Model(inputs=top_input, outputs=predictions)

In [89]:
# The network ends in a single linear unit trained with MSE, i.e. it is a
# regressor. Classification accuracy is not meaningful for continuous
# predictions, so mean absolute error is tracked instead.
model.compile(optimizer='Adam', loss='MSE', metrics=['mae'])
model.summary()

Model: "functional_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding_7 (Embedding)      (None, 25, 128)           1280000   
_________________________________________________________________
lstm_7 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 129       
Total params: 1,411,713
Trainable params: 1,411,713
Non-trainable params: 0
_________________________________________________________________


In [90]:
# Train for 20 epochs in mini-batches of 32; the held-out split is passed as
# validation data so its loss is reported after every epoch.
history = model.fit(x=x_train, y=y_train, validation_data=(x_test,y_test), batch_size=32,epochs=20,shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [91]:
# Score the trained model on the held-out split (the same data that was
# supplied as validation_data during training).
results = model.evaluate(x_test, y_test, batch_size=128)



In [92]:
# Read the test CSV and extract its headline and label columns.
# NOTE: this rebinds `data` and `label`, which previously held the training columns.
test_data = pd.read_csv("test.csv")
data, label = (test_data[column].to_numpy() for column in ('Headline', 'Label'))

In [93]:
# Run the test headlines through the same pipeline as the training data:
# word-split, encode with the already-fitted tokenizer, pad to fixed length.
# NOTE: this rebinds `x_test`, which previously held the held-out training split.
test_split = word_split(data)
x_test = tokenizer.texts_to_sequences(test_split)
# Longest encoded test headline; not referenced again in the visible cells.
max_seq_len = max(map(len, x_test))
MAX_SEQUENCE_LENGTH = 25
x_test = keras.preprocessing.sequence.pad_sequences(
    x_test,
    maxlen=MAX_SEQUENCE_LENGTH,
)

In [94]:
# Predict one value per padded test headline.
y_pre = model.predict(x_test)

In [95]:
# Dump the raw predictions for a quick visual check.
print(y_pre)

[[3.0768807]
 [2.706529 ]
 [2.9559648]
 [2.6946697]
 [3.0924385]
 [2.9758039]
 [2.316033 ]
 [3.2757995]
 [3.2431397]
 [2.2609587]
 [3.2479632]
 [3.5005813]
 [2.931682 ]
 [2.9758236]
 [2.7231138]
 [2.54774  ]
 [3.8070252]
 [2.8423195]
 [2.5946152]
 [2.116478 ]
 [2.6708167]
 [3.1830525]
 [3.067257 ]
 [2.8596241]
 [3.5902002]
 [3.1898968]
 [2.885566 ]
 [2.5494807]
 [3.2189722]
 [2.7376943]
 [2.688157 ]
 [2.7162097]
 [2.5561676]
 [2.307369 ]
 [2.9364307]
 [2.195771 ]
 [2.943375 ]
 [2.179076 ]
 [3.3726056]
 [2.5766423]
 [3.6232762]
 [2.0030375]
 [3.6917448]
 [2.952646 ]
 [2.7745183]
 [2.4672794]
 [2.854521 ]
 [2.7661312]
 [2.223356 ]
 [2.9804606]
 [3.115323 ]
 [3.1343863]
 [2.6460316]
 [3.0401967]
 [2.9399722]
 [3.1987314]
 [3.2446373]
 [2.6765547]
 [2.6526473]
 [2.9142327]
 [2.1462362]
 [3.0120554]
 [2.4441245]
 [3.7603626]
 [3.054699 ]
 [4.1713915]
 [2.4301994]
 [2.2992213]
 [3.2063558]
 [3.4448621]
 [3.642676 ]
 [3.4453027]
 [3.8319292]
 [2.9381044]
 [2.7748148]
 [2.880195 ]
 [3.0953238]

In [96]:
import csv

# Save the predictions to output.csv: one 'Label' header column followed by
# one prediction per row.
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # writerow() expects a sequence of fields. Passing a bare string makes the
    # writer emit one column per character, so the header must be a list.
    writer.writerow(['Label'])
    writer.writerows(y_pre)