In [189]:
import tensorflow as tf
import pandas as pd
from sklearn.model_selection import train_test_split
import keras
import numpy as np
import re
from keras import Input
from keras.layers import Embedding, LSTM, concatenate, Dense
from keras.models import Model

In [190]:
def word_split(mystring):
    str_split = []
    for tmp in mystring:
        tmp = tmp.translate(str.maketrans('','',string.punctuation))
        str_split.append(tmp.split(' '))
    return str_split

In [191]:
all_data = pd.read_csv("train.csv")
data = all_data['Headline'].to_numpy()
label = all_data['Label'].to_numpy()

In [192]:
my_split = word_split(data)
print(my_split[0])

['Golden', 'gaudy', 'and', 'glorious', 'Dubai', 'has', 'the', 'worlds', 'tallest', 'building', 'and', 'biggest', 'airport', 'is', 'it', 'about', 'to', 'overtake', 'London', 'as', 'the', 'most', 'visited', 'city']


In [193]:
MAX_NUM_WORDS = 10000
tokenizer = keras.preprocessing.text.Tokenizer(num_words=MAX_NUM_WORDS)

In [194]:
tokenizer.fit_on_texts(my_split)

In [195]:
x= tokenizer.texts_to_sequences(my_split)
max_seq_len = max([len(seq) for seq in x])
MAX_SEQUENCE_LENGTH = 25
x = keras.preprocessing.sequence.pad_sequences(x,maxlen=MAX_SEQUENCE_LENGTH)

In [196]:
x_train, x_test, y_train, y_test = train_test_split(x, label, test_size=0.2, random_state = 42)

In [199]:
NUM_EMBEDDING_DIM = 256

# LSTM 輸出的向量維度
NUM_LSTM_UNITS = 128
top_input = Input(shape=(MAX_SEQUENCE_LENGTH), dtype='int32')
embedding_layer = Embedding(MAX_NUM_WORDS, NUM_EMBEDDING_DIM)
top_embedded = embedding_layer(top_input)
shared_lstm = LSTM(NUM_LSTM_UNITS)
top_output = shared_lstm(top_embedded)
dense =  Dense(units=1, activation='linear')
predictions = dense(top_output)
model = Model(inputs=top_input, outputs=predictions)

In [200]:
model.compile(optimizer='Adam',loss='MSE',metrics=['accuracy'])
model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         [(None, 25)]              0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 25, 256)           2560000   
_________________________________________________________________
lstm_5 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 129       
Total params: 2,757,249
Trainable params: 2,757,249
Non-trainable params: 0
_________________________________________________________________


In [170]:
history = model.fit(x=x_train, y=y_train, batch_size=32,epochs=20,shuffle=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [176]:
test_data = pd.read_csv("test.csv")
data = test_data['Headline'].to_numpy()
label = test_data['Label'].to_numpy()

In [178]:
test_split = word_split(data)
x_test= tokenizer.texts_to_sequences(test_split)
max_seq_len = max([len(seq) for seq in x_test])
MAX_SEQUENCE_LENGTH = 25
x_test = keras.preprocessing.sequence.pad_sequences(x_test,maxlen=MAX_SEQUENCE_LENGTH)

In [179]:
y_pre = model.predict(x_test)

In [180]:
print(y_pre)

[[3.2112126]
 [2.675583 ]
 [3.2613273]
 [2.762611 ]
 [3.2430542]
 [2.8119648]
 [2.5411115]
 [3.5116854]
 [3.5975657]
 [2.3732586]
 [3.3292513]
 [3.5874605]
 [3.0024996]
 [2.8647442]
 [3.162531 ]
 [2.8913069]
 [3.4873295]
 [2.9547029]
 [2.9362454]
 [2.2778797]
 [3.1528711]
 [3.2247005]
 [3.1562376]
 [2.9896576]
 [3.7697232]
 [3.1820736]
 [3.0225334]
 [2.5728889]
 [3.181754 ]
 [2.8465614]
 [2.6414533]
 [2.9618392]
 [2.6001103]
 [2.6645052]
 [3.293181 ]
 [2.258183 ]
 [3.260087 ]
 [1.9893703]
 [3.602384 ]
 [2.6933537]
 [3.7011135]
 [2.459227 ]
 [3.5577097]
 [3.0057938]
 [3.1078582]
 [2.5298536]
 [3.129984 ]
 [3.0644422]
 [2.181725 ]
 [3.1899507]
 [3.5729313]
 [3.2478766]
 [2.7284098]
 [3.5614235]
 [3.2797556]
 [3.4039278]
 [3.7088947]
 [2.98175  ]
 [2.77464  ]
 [2.6999855]
 [2.3528705]
 [3.449797 ]
 [2.6205564]
 [3.929274 ]
 [2.9964685]
 [4.190185 ]
 [2.5919788]
 [2.1790853]
 [3.423702 ]
 [2.980987 ]
 [3.7585979]
 [3.652658 ]
 [3.8588424]
 [3.3168614]
 [3.2222981]
 [2.7701306]
 [3.0201082]

In [188]:
import csv
with open('output.csv', 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow('Label')
    writer.writerows(y_pre)