# -*- coding: utf-8 -*-
"""NewsClassification_NLP_LSTM.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/1JYwhR7yoSfAef5Tek_SI6nrZxaQauE-9
# **First Project: Developing NLP Model Using TensorFlow**
**Name: Nicko Arya Dharma**
**Email: nicko.arya.dharma@gmail.com**
**DicodingID: nickoaryad**
## **1 <font color='yellow'>**|**</font> About the Dataset**
Context:
A news article dataset originating from BBC News, provided as a benchmark for machine learning research. The original data was processed into a single CSV file for ease of use; the news title and the related text file name are preserved along with the news content and its category. The dataset is made available for non-commercial and research purposes only.
All rights, including copyright, in the content of the original articles are owned by the BBC.
Content:
2225 documents from the BBC news website, corresponding to stories in five topical areas from 2004-2005.
Class Labels:
5 (business, entertainment, politics, sport, tech)
Acknowledgements:
The original source of the data may be accessed via the Kaggle link below, and the associated research paper is worth reading.
Associated Official Research Paper:
D. Greene and P. Cunningham. "Practical Solutions to the Problem of Diagonal Dominance in Kernel Document Clustering", Proc. ICML 2006.
Source:
https://www.kaggle.com/datasets/hgultekin/bbcnewsarchive/data
## **2 <font color='yellow'>**|**</font> Importing Libraries**
"""
!pip install wordcloud

import re, string
import zipfile

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, Dropout, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from tensorflow.keras.utils import plot_model

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
"""## **3 <font color='yellow'>**|**</font> Preparing the Dataset**
#### **3.1 <font color='yellow'>**|**</font> Extracting the Dataset**
"""
local_zip = '/bbc-news-data.csv.zip'
with zipfile.ZipFile(local_zip, 'r') as zip_ref:
    zip_ref.extractall('/tmp')
"""#### **3.2 <font color='yellow'>**|**</font> Reading the Dataset**"""
data = pd.read_csv('/tmp/bbc-news-data.csv', on_bad_lines='skip', sep='\t')
data
data.info()
data.category.value_counts()
data["category"].hist()
heading_sport = data[data["category"]=="sport"]["title"]
collapsed_heading_sport = heading_sport.str.cat(sep=' ')
heading_business = data[data["category"]=="business"]["title"]
collapsed_heading_business = heading_business.str.cat(sep=' ')
heading_politics = data[data["category"]=="politics"]["title"]
collapsed_heading_politics = heading_politics.str.cat(sep=' ')
heading_tech = data[data["category"]=="tech"]["title"]
collapsed_heading_tech = heading_tech.str.cat(sep=' ')
heading_entertainment = data[data["category"]=="entertainment"]["title"]
collapsed_heading_entertainment = heading_entertainment.str.cat(sep=' ')
print("Word Cloud for SPORT")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_sport)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for BUSINESS")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_business)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for POLITICS")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_politics)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for TECH")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_tech)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
print("Word Cloud for ENTERTAINMENT")
wordcloud = WordCloud(background_color = "black",max_words = 50).generate(collapsed_heading_entertainment)
plt.imshow(wordcloud,interpolation='bilinear')
plt.axis('off')
plt.show()
"""#### **3.3 <font color='yellow'>**|**</font> Wrangling the Dataset**"""
# Removing unnecessary column
data = data.drop(columns=['filename'])
# Combining title and content columns
data['text'] = data['title'] + " " + data['content']
# Renaming columns
data.columns = ['Category', 'Title', 'Content', 'Text']
# Removing title and content columns
data = data.drop(columns=['Title', 'Content'])
# Reindexing columns
data = data.reindex(columns=['Text', 'Category'])
data
# Inspecting for missing values (isnull is an alias of isna, so one check suffices)
data.isna().sum()
"""## **4 <font color='yellow'>**|**</font> Preprocessing Text**"""
# Removing unnecessary whitespace, punctuation, and characters
def preprocess(text):
    text = text.lower()                                                # convert to lowercase
    text = text.strip()                                                # remove leading/trailing whitespace
    text = re.sub(r'<.*?>', '', text)                                  # remove HTML tags
    text = re.sub(r'\[[0-9]*\]', ' ', text)                            # remove bracketed numbers such as [12]
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)   # replace punctuation with spaces
    text = re.sub(r'[^\w\s]', '', text)                                # drop any remaining non-alphanumeric characters
    text = re.sub(r'\d', ' ', text)                                    # remove digits
    text = re.sub(r'\s+', ' ', text)                                   # collapse runs of whitespace into single spaces
    return text
# Removing stopwords (building the set once instead of per word)
stop_words = set(stopwords.words('english'))
def stopword(text):
    return ' '.join(word for word in text.split() if word not in stop_words)
# Initializing the stemmer (kept for comparison; the final pipeline below uses lemmatization instead)
snow = SnowballStemmer('english')
def stemming(text):
    return " ".join(snow.stem(word) for word in word_tokenize(text))
# Initializing the lemmatizer
wl = WordNetLemmatizer()

# Mapping Penn Treebank POS tags to WordNet POS constants
def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
# Tokenizing the sentence and lemmatizing each token according to its POS tag
def lemmatizer(text):
    word_pos_tags = nltk.pos_tag(word_tokenize(text))
    return " ".join(wl.lemmatize(word, get_wordnet_pos(tag)) for word, tag in word_pos_tags)
# Executing the preprocessing steps in sequence: clean, remove stopwords, lemmatize
def finalpreprocess(text):
    return lemmatizer(stopword(preprocess(text)))
data['Text'] = data['Text'].apply(finalpreprocess)
data.head()
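# A quick sanity check of the pipeline on a single string (a minimal sketch;
# the sentence below is a made-up example, not taken from the dataset)
sample = "The BBC reported 3 new <b>technology</b> stories, and markets reacted quickly!"
print(finalpreprocess(sample))
# Expected style of output: lowercase, no digits/punctuation/stopwords, lemmatized
# tokens, e.g. roughly "bbc report new technology story market react quickly"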
"""## **5 <font color='yellow'>**|**</font> Encoding Dummy Variable**"""
category = pd.get_dummies(data.Category)
data = pd.concat([data, category], axis=1)
data = data.drop(columns=['Category'])
data
data.info()
"""## **6 <font color='yellow'>**|**</font> Splitting the Dataset**"""
X = data['Text'].values
y = data[['sport', 'business', 'politics', 'tech', 'entertainment']].values
X
y
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=43)
print('Train Set: ', X_train.shape, y_train.shape)
print('Test Set: ', X_test.shape, y_test.shape)
"""## **7 <font color='yellow'>**|**</font> Developing Model**
#### **7.1 <font color='yellow'>**|**</font> Tokenizing**
"""
tokenizer = Tokenizer(num_words=10000, oov_token="<oov>")
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index
seq_train = tokenizer.texts_to_sequences(X_train)
seq_test = tokenizer.texts_to_sequences(X_test)
pad_train = pad_sequences(seq_train, maxlen=200, truncating="post")
pad_test = pad_sequences(seq_test, maxlen=200, truncating="post")
print("Padded Train = ")
print(pad_train.shape)
print("Padded Test = ")
print(pad_test.shape)
pad_train
pad_test
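# Sanity-checking the tokenization (a minimal sketch): decode the first padded
# training row back into words via the tokenizer's index_word lookup; index 0 is
# the padding value and has no dictionary entry, so it is skipped
decoded = ' '.join(tokenizer.index_word.get(idx, '?') for idx in pad_train[0] if idx != 0)
print(decoded)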
"""#### **7.2 <font color='yellow'>**|**</font> Sequential Modelling using Embedding and LSTM**"""
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(input_dim=10000, output_dim=32, input_length=200, mask_zero=True),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(5, activation='softmax')
])
plot_model(model, show_shapes = True)
model.compile(optimizer="adam",
              loss='categorical_crossentropy',
              metrics=['accuracy'])
model.summary()
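# Worked arithmetic for the parameter counts that model.summary() should report,
# given the layer sizes above (an LSTM has 4 gates, each with input weights,
# recurrent weights, and a bias)
embedding_params = 10000 * 32              # 320,000
lstm_params = 4 * ((32 + 64 + 1) * 64)     # 24,832
dense1_params = (64 + 1) * 128             # 8,320
dense2_params = (128 + 1) * 5              # 645
print(embedding_params + lstm_params + dense1_params + dense2_params)  # 353,797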
"""#### **7.3 <font color='yellow'>**|**</font> Defining Callbacks to Control Epochs**"""
# Defining a callback to stop training once both accuracies exceed 90%
class myCallback(tf.keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        logs = logs or {}
        if logs.get('accuracy', 0) > 0.90 and logs.get('val_accuracy', 0) > 0.90:
            self.model.stop_training = True
            print("\nAccuracy of the training set and the validation set have reached > 90%!")
callbacks = myCallback()
# Improving validation accuracy by decreasing the learning rate when it plateaus
auto_reduction_LR = ReduceLROnPlateau(
    monitor='val_accuracy',
    patience=2,        # epochs to wait before decreasing the learning rate
    verbose=1,
    factor=0.1,        # factor for decreasing the learning rate
    min_lr=1.5e-5      # minimum learning rate
)
early_stop = EarlyStopping(
    monitor="val_loss",
    min_delta=0,
    patience=12,
    verbose=0,
    mode="auto",
    baseline=None,
    restore_best_weights=True
)
"""#### **7.4 <font color='yellow'>**|**</font> Training the Model Using Fit Function**"""
num_epochs = 70
history = model.fit(pad_train, y_train,
                    epochs=num_epochs,
                    validation_data=(pad_test, y_test),
                    verbose=2,
                    callbacks=[callbacks, auto_reduction_LR, early_stop])
"""#### **7.5 <font color='yellow'>**|**</font> Saving the Model for Deployment**"""
model.save_weights('model_weights.h5')
model.save('model.h5')
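# Minimal inference sketch: classifying a new, hypothetical headline (the label
# order must match the column order used to build y in section 6 above)
labels = ['sport', 'business', 'politics', 'tech', 'entertainment']
new_text = "The champions won the final match after a dramatic penalty shootout"
seq = tokenizer.texts_to_sequences([finalpreprocess(new_text)])
pad = pad_sequences(seq, maxlen=200, truncating="post")
pred = model.predict(pad)
print(labels[np.argmax(pred)])  # expected to lean towards 'sport' for this example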
"""## **8 <font color='yellow'>**|**</font> Plotting**
#### **8.1 <font color='yellow'>**|**</font> Loss of Training and Validation**
"""
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model Loss')
plt.ylabel('Loss')
plt.xlabel('Epochs')
plt.legend(['Training Loss', 'Validation Loss'], loc = 'upper right')
plt.show()
"""#### **8.2 <font color='yellow'>**|**</font> Accuracy of Training and Validation**"""
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model Accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epochs')
plt.legend(['Training Accuracy', 'Validation Accuracy'], loc='lower right')
plt.show()
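# Held-out evaluation beyond the training curves (a minimal sketch;
# classification_report and confusion_matrix are standard sklearn.metrics
# utilities added here, not part of the original pipeline)
from sklearn.metrics import classification_report, confusion_matrix

y_pred = np.argmax(model.predict(pad_test), axis=1)
y_true = np.argmax(y_test, axis=1)
print(classification_report(y_true, y_pred, target_names=labels))  # labels as defined in the inference sketch above
print(confusion_matrix(y_true, y_pred))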