# LSTM_News_Classifier.py
import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, LSTM, Dropout
from keras.layers import Embedding
from keras.models import Sequential
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import classification_report, accuracy_score
from NewsArticleDataset import NewsArticleDataset
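# NOTE: this script assumes standalone Keras with the keras.wrappers.scikit_learn wrapper
# (removed in later releases; SciKeras is the usual replacement), plus scikit-learn and NumPy.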
GLOVE_DIR = "./glove.6b/"
GLOVE_DIR = GLOVE_DIR.replace("./", sys.path[0] + "/")
GLOVE_TEXTFILE = os.path.join(GLOVE_DIR, "glove.6B.100d.txt")
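# Assumes the 100-dimensional GloVe vectors (from glove.6B.zip at
# https://nlp.stanford.edu/projects/glove/) have been unzipped into GLOVE_DIR.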
MAX_SEQUENCE_LENGTH = 200
MAX_NB_WORDS = 20000
EMBEDDING_DIM = 100
TRAINABLE = False
NUM_UNITS = 50
VALIDATION_SPLIT = 0.2
BATCH_SIZE = 32
NUM_EPOCHS = 10
DROPOUT_RATE = 0.5
lossFn = "binary_crossentropy"
optFn = "adam"
actFn = "sigmoid"
seed = 7
np.random.seed(seed)
bestEpochs = 1
bestBatchSize = 32
# Load the words features
print("Loading the training, testing, validation datasets...")
ArticleDataset = NewsArticleDataset("./credible_news_v5.csv", "./malicious_news_v5.csv")
# header = [label, #, URL, filepath, title, authors_attributed, num_characters, num_words, date, google_sentiment_score, google_sentiment_magnitude, msft_sentiment_score]
header = ArticleDataset.getHeader()
(trainFeatures, trainLabels, testFeatures, testLabels, tuneFeatures, tuneLabels) = ArticleDataset.getTrainTuneTestSets("10/13/2016", VALIDATION_SPLIT)
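# Binarize the labels: 1 (malicious) maps to 1, everything else (credible) maps to 0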
trainLabels = (trainLabels==1).astype(np.int32)
testLabels = (testLabels==1).astype(np.int32)
tuneLabels = (tuneLabels==1).astype(np.int32)
textfileIndex = header.index("filepath")
# Read all of the articles into memory
def loadDatasetArticles(dataset, fileIndex):
    """Read each article's text file (path taken from column fileIndex) into a list of strings."""
    filesList = dataset[:, fileIndex]
    texts = []
    for path in filesList:
        with open(path) as f:
            texts.append(f.read())
    return texts
print("Loading the articles...")
trainingTexts = loadDatasetArticles(trainFeatures, textfileIndex)
testingTexts = loadDatasetArticles(testFeatures, textfileIndex)
validationTexts = loadDatasetArticles(tuneFeatures, textfileIndex)
print("Found %d training texts, %d testing texts, %d validation texts" % (len(trainingTexts), len(testingTexts), len(validationTexts)))
# Vectorize the text samples into a 2D integer tensor
print("Fitting tokenizer on training articles...")
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(trainingTexts)
word_index = tokenizer.word_index
trainData = pad_sequences(tokenizer.texts_to_sequences(trainingTexts), maxlen=MAX_SEQUENCE_LENGTH)
testData = pad_sequences(tokenizer.texts_to_sequences(testingTexts), maxlen=MAX_SEQUENCE_LENGTH)
validationData = pad_sequences(tokenizer.texts_to_sequences(validationTexts), maxlen=MAX_SEQUENCE_LENGTH)
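# pad_sequences pads/truncates every article to exactly MAX_SEQUENCE_LENGTH token ids
# (pre-padding and pre-truncating by default), so each dataset is a uniform 2D tensor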
# Build a mapping of words to their word embedding vector
print("Loading and indexing word vectors...")
embeddings_index = {}
with open(GLOVE_TEXTFILE) as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs
print("Found %s word vectors!" % len(embeddings_index))
# Prepare embedding matrix
print("Preparing word embedding matrix...")
num_words = min(MAX_NB_WORDS, len(word_index))+1
print("num_words (input_dim) = " + str(num_words))
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= num_words:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in the embedding index stay all-zeros
        embedding_matrix[i] = embedding_vector
# Load pre-trained word embeddings into an Embedding layer
# trainable = False --> keep the embeddings fixed
embeddingLayer = Embedding(num_words, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=TRAINABLE)
def createModel():
    global MAX_NB_WORDS, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, NUM_UNITS, embeddingLayer
    global lossFn, optFn, actFn, DROPOUT_RATE
    model = Sequential()
    if embeddingLayer is None:
        # Fall back to a randomly initialized, trainable embedding layer
        embeddingLayer = Embedding(MAX_NB_WORDS, EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH)
    model.add(embeddingLayer)
    #model.add(LSTM(NUM_UNITS, dropout=DROPOUT_RATE, recurrent_dropout=DROPOUT_RATE))
    model.add(LSTM(NUM_UNITS))
    model.add(Dropout(DROPOUT_RATE))
    model.add(Dense(1, activation=actFn))  # condense the LSTM output (size = NUM_UNITS) to a single sigmoid unit for binary classification
    model.compile(loss=lossFn, optimizer=optFn, metrics=["accuracy"])
    return model
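# Candidate hyperparameter values for the grid search; add more entries to these lists to widen the search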
batch_size = [24]
epochs = [8]
#verbose = [0] # Silence the grid search
verbose = [1] # Show bar graphs of progress from the grid search output during each epoch
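# NOTE: with the old KerasClassifier wrapper, fit-time arguments such as verbose and
# validation_data can be routed through param_grid; GridSearchCV treats each single-value
# list as a fixed "hyperparameter" and the wrapper forwards it to model.fit()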
param_grid = dict(batch_size=batch_size, epochs=epochs, verbose=verbose, validation_data=[(validationData, tuneLabels)])
model = KerasClassifier(build_fn=createModel)
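# NOTE: n_jobs=-1 runs the cross-validation folds in parallel processes; Keras models do
# not always pickle cleanly, so fall back to n_jobs=1 if the search hangs or errors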
clf = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1)
print("Performing grid search...")
grid_result = clf.fit(trainData, trainLabels)
print("Best: %f using batch_size = %.1f, epochs = %.1f" % (grid_result.best_score_, grid_result.best_params_["batch_size"], grid_result.best_params_["epochs"]))
bestEpochs = grid_result.best_params_["epochs"]
bestBatchSize = grid_result.best_params_["batch_size"]
print("Training model with 'best params': num_epochs = " + str(bestEpochs) + ", batch_size = " + str(bestBatchSize) + ", NUM_UNITS = " + str(NUM_UNITS) + ", dropout = " + str(DROPOUT_RATE))
model = createModel()
model.fit(trainData, trainLabels, batch_size=bestBatchSize, epochs=bestEpochs, validation_data=(validationData, tuneLabels))
print("Evaluating model...")
scores = model.evaluate(testData, testLabels, verbose=0)
predictions = model.predict(testData, verbose=0)
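# Threshold the sigmoid outputs at 0.5 to get hard 0/1 class predictions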
predictions = np.round(predictions)
acc = accuracy_score(testLabels, predictions)
print("sklearn accuracy_score = %f" % acc)
print("Using num_epochs = " + str(bestEpochs) + ", batch_size = " + str(bestBatchSize) + ", NUM_UNITS = " + str(NUM_UNITS) + ", dropout = " + str(DROPOUT_RATE))
for metricIter in range(len(scores)):
    metricName = model.metrics_names[metricIter]
    metricValue = scores[metricIter]
    print("Model's " + metricName + " = " + str(metricValue))
target_names = ["credible", "malicious"]
print("Confusion matrix:")
print(classification_report(testLabels, predictions, target_names=target_names))