In [35]:
import numpy as np
import pandas as pd
import pickle
from collections import defaultdict
import re
from bs4 import BeautifulSoup
import sys
import os
os.environ['KERAS_BACKEND']='tensorflow'
from keras.preprocessing.text import Tokenizer,text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional, TimeDistributed
from keras.models import Model
from keras.callbacks import ModelCheckpoint
import matplotlib.pyplot as plt
plt.switch_backend('agg')
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from keras import initializers
%matplotlib inline

In [36]:
def clean_str(string):
    string = re.sub(r"\\", "", string)
    string = re.sub(r"\'", "", string)
    string = re.sub(r"\"", "", string)
    return string.strip().lower()

In [37]:
MAX_SENT_LENGTH = 100
MAX_SENTS = 15
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

In [38]:
# reading data
df = pd.read_csv('data/fake_or_real_news.csv')
df = df.dropna()
df = df.reset_index(drop=True)
print('Shape of dataset ',df.shape)
print(df.columns)
print('No. of unique classes',len(set(df['label'])))

Shape of dataset  (6335, 5)
Index(['Unnamed: 0', 'title', 'text', 'label', 'title_vectors'], dtype='object')
No. of unique classes 2


In [39]:
import nltk
from nltk import tokenize

reviews = []
labels = []
texts = []


In [40]:
macronum=sorted(set(df['label']))
macro_to_id = dict((note, number) for number, note in enumerate(macronum))

In [41]:
def fun(i):
    return macro_to_id[i]

df['label']=df['label'].apply(fun)

In [42]:
data_train = df['text'].values
label_train = df['label'].values
for i in range(len(data_train)):
    text = BeautifulSoup(data_train[i])
    text=clean_str(str(text.get_text().encode()).lower())
    texts.append(text)
    sentences = tokenize.sent_tokenize(text)
    reviews.append(sentences)


for i in df['label']:
    labels.append(i)



 BeautifulSoup([your markup])

to this:

 BeautifulSoup([your markup], "lxml")

  markup_type=markup_type))


In [43]:
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)

data_former = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')
data_latter = np.zeros((len(texts), MAX_SENTS, MAX_SENT_LENGTH), dtype='int32')

for i, sentences in enumerate(reviews):
    for j, sent in enumerate(sentences):
        if j< MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    data_former[i,j,k] = tokenizer.word_index[word]
                    k=k+1
        
        j_verse = len(sentences)-1-j
        
        if j_verse < MAX_SENTS:
            wordTokens = text_to_word_sequence(sent)
            k=0
            for _, word in enumerate(wordTokens):
                if k<MAX_SENT_LENGTH and tokenizer.word_index[word]<MAX_NB_WORDS:
                    data_latter[i,MAX_SENTS-1-j_verse,k] = tokenizer.word_index[word]
                    k=k+1

In [46]:
len(data_former)

6335

In [47]:
len(data_latter)

6335

In [48]:
word_index = tokenizer.word_index
print('No. of %s unique tokens.' % len(word_index))

No. of 110291 unique tokens.


In [49]:
labels = to_categorical(np.asarray(labels))
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)


indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

Shape of data tensor: (6335, 15, 100)
Shape of label tensor: (6335, 2)


In [50]:
x_train_former = data_former[:-nb_validation_samples]
x_train_latter = data_latter[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val_former = data_former[-nb_validation_samples:]
x_val_latter = data_latter[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [51]:
embeddings_index = {}
f = open('data/glove.6B.100d.txt',encoding='utf8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Total %s word vectors.' % len(embeddings_index))

Total 400000 word vectors.


In [52]:
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SENT_LENGTH,
                            trainable=True)

In [53]:
sentence_input = Input(shape=(MAX_SENT_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sentence_input)
l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
sentEncoder = Model(sentence_input, l_lstm)

review_input = Input(shape=(MAX_SENTS,MAX_SENT_LENGTH), dtype='int32')
review_encoder = TimeDistributed(sentEncoder)(review_input)
l_lstm_sent = Bidirectional(LSTM(100))(review_encoder)
preds = Dense(len(macronum), activation='softmax',name='predict_value')(l_lstm_sent)
model = Model(review_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

print("Hierachical LSTM")
model.summary()

Hierachical LSTM
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 15, 100)           0         
_________________________________________________________________
time_distributed_2 (TimeDist (None, 15, 200)           11190000  
_________________________________________________________________
bidirectional_4 (Bidirection (None, 200)               240800    
_________________________________________________________________
predict_value (Dense)        (None, 2)                 402       
Total params: 11,431,202
Trainable params: 11,431,202
Non-trainable params: 0
_________________________________________________________________


In [None]:
#cp=ModelCheckpoint('model_han_.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
history1=model.fit(x_train_former, y_train, validation_data=(x_val_former, y_val),
          epochs=1, batch_size=24)


output_model = Model(inputs=model.input,outputs=model.get_layer('predict_value').output)

pre_former = output_model.predict(x_val_former)

In [23]:
pre_former#former想要的值

In [None]:
history2=model.fit(x_train_latter, y_train, validation_data=(x_val_latter, y_val),
          epochs=1, batch_size=24)


output_model = Model(inputs=model.input,outputs=model.get_layer('predict_value').output)

pre_latter = output_model.predict(x_val_latter)

In [None]:
pre_latter

In [None]:
former=[]
latter=[]
label=[]
for i in range(len(pre_former)):
    former.append(1.65*pre_former[i][0]+pre_latter[i][0])#1.65这个是个参数可以随便调
    latter.append(1.65*pre_former[i][1]+pre_latter[i][1])
    if y_val[i][0]>y_val[i][1]:
        label.append(1)
    else:
        label.append(0)
            
combine_predict_label=[]
for i in range(len(former)):
    if former[i]>latter[i]:
        combine_predict_label.append(1)
    else:
        combine_predict_label.append(0)
        
count=0
for i in range(len(label)):
    if label[i]==combine_predict_label[i]:
        count+=1
print(count/len(label))

In [None]:
cp=ModelCheckpoint('model_han_.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
history=model.fit(x_train, y_train, validation_data=(x_val, y_val),
          epochs=15, batch_size=2,callbacks=[cp])

Train on 5068 samples, validate on 1267 samples
Epoch 1/15

Epoch 00001: val_acc improved from -inf to 0.96764, saving model to model_han_.hdf5
Epoch 2/15

Epoch 00002: val_acc improved from 0.96764 to 0.97395, saving model to model_han_.hdf5
Epoch 3/15

Epoch 00003: val_acc improved from 0.97395 to 0.98658, saving model to model_han_.hdf5
Epoch 4/15

Epoch 00004: val_acc did not improve from 0.98658
Epoch 5/15

Epoch 00005: val_acc did not improve from 0.98658
Epoch 6/15

Epoch 00006: val_acc did not improve from 0.98658
Epoch 7/15

Epoch 00007: val_acc did not improve from 0.98658
Epoch 8/15

Epoch 00008: val_acc did not improve from 0.98658
Epoch 9/15

Epoch 00009: val_acc did not improve from 0.98658
Epoch 10/15

Epoch 00010: val_acc did not improve from 0.98658
Epoch 11/15

Epoch 00011: val_acc did not improve from 0.98658
Epoch 12/15

Epoch 00012: val_acc did not improve from 0.98658
Epoch 13/15

Epoch 00013: val_acc did not improve from 0.98658
Epoch 14/15

Epoch 00014: val_acc 