In [1]:
import pandas as pd
import numpy as np
from scipy import linalg
import scipy as sp
from sklearn import decomposition
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
import keras
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D, Dropout
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.wrappers.scikit_learn import KerasClassifier
from nltk.corpus import stopwords
import pickle
import json
import gc
import re

Using TensorFlow backend.


In [55]:
# Load the news dataset, stored as one JSON object per line.
# The context manager closes the file handle even if parsing fails; lines are
# streamed instead of being materialised with readlines(), and blank lines are
# skipped so a stray empty line cannot crash json.loads.
with open("News_Category_Dataset_v2.json", "rb") as rw:
    # json.loads ignores trailing whitespace, so "\n" / "\r\n" endings need no stripping.
    data = [json.loads(raw_line.decode("utf-8")) for raw_line in rw if raw_line.strip()]
# Flatten the list of parsed records into a DataFrame.
data = pd.io.json.json_normalize(data)
# Keep a CSV copy of the flattened records next to the notebook.
data.to_csv("news_dataset.csv", index=False)

In [3]:
def cleaner(s):
    """Normalise a raw text string.

    Every character that is not an ASCII letter, a digit, an apostrophe,
    ``#`` or ``-`` is replaced by a single space (runs are not collapsed,
    each offending character becomes its own space), and the result is
    lower-cased.

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The normalised, lower-cased text.
    """
    spaced = re.sub('[^a-zA-Z0-9\'#-]', " ", s)
    return spaced.lower()
# Keep only rows that have both a headline and a short description.
# The explicit .copy() makes `data` an independent frame, so the column
# assignment below does not write into a boolean-filtered slice (the pattern
# that triggers pandas' SettingWithCopyWarning).
data = data[data.headline.notnull() & data.short_description.notnull()].copy()
print(len(data))
# Model input text: headline and description joined by one space, normalised by cleaner().
data['descrp'] = (data['headline'] + " " + data['short_description']).apply(cleaner)
# Drop the raw text and metadata columns; the later cells visible in this
# notebook only read `category` and `descrp`.
data = data.drop(['authors', 'link', 'headline', 'short_description', 'date'], axis=1)
# Distribution of whitespace-token counts per text, in 10 equal-width bins.
data['descrp'].apply(lambda s: len(s.split())).value_counts(bins=10)

181140


(27.4, 51.8]      91461
(2.755, 27.4]     73763
(51.8, 76.2]      15348
(76.2, 100.6]       356
(149.4, 173.8]       90
(125.0, 149.4]       52
(100.6, 125.0]       42
(173.8, 198.2]       20
(222.6, 247.0]        4
(198.2, 222.6]        4
Name: descrp, dtype: int64

In [4]:
# Collapse near-duplicate categories. Each entry is
# (list of existing category names, name of the merged category).
newcats = [
    (['WELLNESS', 'HEALTHY LIVING'], "WELLNESS"),
    (['STYLE & BEAUTY', 'HOME & LIVING', 'STYLE'], "LIVING"),
    (['PARENTS', 'PARENTING', 'WEDDINGS', 'DIVORCE', 'FIFTY'], "FAMILY"),
    (['CRIME'], "CRIME"),
    (['WORLD NEWS', 'THE WORLDPOST', 'WORLDPOST', 'POLITICS', 'RELIGION'], "POLITICS"),
    (['TECH', 'SCIENCE'], "TECHNOLOGY"),
    (['TRAVEL', 'FOOD & DRINK', 'TASTE'], "FOOD & TRAVEL"),
    (['ARTS & CULTURE', 'CULTURE & ARTS', 'ARTS'], "CULTURE"),
    (['COLLEGE', 'EDUCATION'], "EDUCATION"),
    (['ENTERTAINMENT', 'COMEDY', 'SPORTS'], "ENTERTAINMENT"),
    (['ENVIRONMENT', 'GREEN'], "ENVIRONMENT"),
    (['WOMEN', 'QUEER VOICES', 'BLACK VOICES', 'LATINO VOICES'], "REPRESENTATIVE VOICES"),
    (['BUSINESS', 'MEDIA', 'IMPACT', 'MONEY'], "BUSINESS"),
]
for old_names, merged_name in newcats:
    # Relabel every row whose category is one of the old names.
    in_group = data.category.isin(old_names)
    data.loc[in_group, "category"] = merged_name
# Rows in these two categories are removed outright rather than merged.
unwanted_rows = data.index[data.category.isin(['GOOD NEWS', 'WEIRD NEWS'])]
data.drop(unwanted_rows, inplace=True)

In [5]:
# Target sequence length is 60 tokens. Note the filter is strict: only texts with
# FEWER than 60 tokens are kept (a text of exactly 60 tokens is dropped). The
# original note estimates this keeps about 96.7% of the data.
data['descrp'] = data['descrp'].apply(lambda s: s.split())
# .copy() gives an independent frame so the assignments below do not write into
# a boolean-filtered slice (pandas' SettingWithCopyWarning pattern).
data = data[data.descrp.apply(len) < 60].copy()
# Right-pad every token list to exactly 60 entries with a single-space token.
data['descrp'] = data['descrp'].apply(lambda s: s + [" "] * (60 - len(s)))
# Label-encode the categories: the id is the category's rank by descending
# frequency (position in the value_counts() index).
tp = data.category.value_counts().index.values
# One dict lookup per row instead of an np.where scan over `tp` per row.
label_ids = {name: idx for idx, name in enumerate(tp)}
data['response'] = data.category.map(label_ids)

In [6]:
# Fit a Keras tokenizer with a 50,000-word cap (the same number is used as the
# Embedding input size in build_mdr). The backslash in `filters` is written as
# "\\" so the literal no longer contains the invalid escape sequence "\]"; the
# resulting runtime string is unchanged.
tokenizer = keras.preprocessing.text.Tokenizer(
    num_words=50000, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(data['descrp'].values)
wordindex = tokenizer.word_index
X = tokenizer.texts_to_sequences(data['descrp'].values)
# Any sequence that came back shorter than 60 ids is right-padded with the id of
# the ' ' padding token. NOTE(review): presumably this happens when tokens fall
# outside the num_words vocabulary and are dropped by texts_to_sequences — confirm.
pad_id = wordindex[' ']
X = np.array([seq + [pad_id] * (60 - len(seq)) for seq in X])
# One-hot encode the integer labels (column vector -> one column per class).
Y = data.response.values.reshape(-1, 1)
Y = keras.utils.to_categorical(Y)

In [7]:
# Hold out 30% of the rows for evaluation; random_state=1 fixes the shuffle.
trainX, testX, trainY, testY = train_test_split(
    np.asarray(X),
    np.asarray(Y),
    test_size=0.3,
    random_state=1,
)
# Show the shapes of the four resulting arrays.
tuple(part.shape for part in (trainX, testX, trainY, testY))

((120379, 60), (51591, 60), (120379, 13), (51591, 13))

In [34]:
def build_mdr():
    """Build and compile the stacked-LSTM category classifier.

    Reads the notebook-level arrays ``X`` (its second dimension is the input
    sequence length) and ``Y`` (its second dimension is the number of output
    classes).

    Returns
    -------
    A compiled ``Sequential`` model: embedding -> spatial dropout -> three
    75-unit LSTM layers -> 25-unit ReLU dense layer -> softmax output, trained
    with categorical cross-entropy and the Adam optimizer.
    """
    seq_len = X.shape[1]
    n_classes = Y.shape[1]
    # Dropout settings shared by all three recurrent layers.
    lstm_dropout = dict(dropout=0.2, recurrent_dropout=0.2)

    model = Sequential([
        Embedding(50000, 100, input_length=seq_len),
        SpatialDropout1D(0.1),
        # The first two LSTMs emit the full sequence so the next LSTM can consume it.
        LSTM(75, return_sequences=True, **lstm_dropout),
        LSTM(75, return_sequences=True, **lstm_dropout),
        LSTM(75, return_sequences=False, **lstm_dropout),
        Dense(25, activation='relu'),
        Dense(n_classes, activation="softmax"),
    ])
    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

In [35]:
# Checkpoint callback: watches the 'loss' metric, keeps only the best model,
# and is configured with period=10.
chk = ModelCheckpoint(
    './djiarnn2.h5',
    monitor='loss',
    save_best_only=True,
    period=10,
)
callbacklist = [chk]
# Wrap the model builder in the scikit-learn style classifier. The held-out
# split (testX, testY) is passed as validation data, so the per-epoch
# validation metrics are computed on the test split.
mdl = KerasClassifier(
    build_fn=build_mdr,
    epochs=10,
    batch_size=500,
    verbose=True,
    callbacks=callbacklist,
    validation_data=(testX, testY),
)
mdl.fit(trainX, trainY)
# Save the fitted underlying Keras model to disk.
mdl.model.save("news.h5")

Train on 120379 samples, validate on 51591 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [39]:
# Run the fitted classifier on the held-out inputs. The following cell compares
# `res` element-wise with the argmax of the one-hot `testY`, so `res` is
# presumably an array of predicted class indices — confirm against
# KerasClassifier.predict.
res = mdl.predict(testX)



In [40]:
# Number of held-out rows whose prediction equals the true class, where the
# true class is the argmax of each one-hot row of testY.
np.sum(res == np.argmax(testY, axis=1))

36055