In [4]:

from sklearn.preprocessing import MultiLabelBinarizer
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, Activation, Input, Embedding,LSTM, Flatten, MaxPooling1D, GlobalMaxPool1D, Dropout, Conv1D
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint, CSVLogger

from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.preprocessing.sequence import pad_sequences

import pandas as pd
import glob
import re
import numpy as np
import pickle

In [3]:
# Project-local helper; its implementation is not part of this notebook.
from cleaner import Cleaner
# NOTE(review): presumably this step cleans the raw articles and writes the
# tokenizer pickle that a later cell loads -- confirm against cleaner.py.
# NOTE(review): the recorded output of this cell is
# "ValueError: Expected object or value"; the cell needs to run cleanly on a
# fresh kernel before the rest of the notebook can be trusted.
cleaner = Cleaner()
cleaner.create_tokenizer_and_clean()

ValueError: Expected object or value

In [5]:
# Load the tagged news articles. The file is opened explicitly so that a
# missing or misplaced file fails with a FileNotFoundError naming the path,
# instead of leaving pandas to interpret the bare string on its own.
# NOTE(review): the "ValueError: Expected object or value" recorded for this
# cell was presumably caused by the file being absent or empty -- confirm.
filename = "../data/json_news_tagged_bundle/clean_data-unified-tags.json"
with open(filename, encoding='utf-8') as data_file:
    df = pd.read_json(data_file)

# Restore the tokenizer object pickled earlier (presumably a fitted keras
# Tokenizer -- confirm against the step that writes this file).
# NOTE(security): pickle.load can execute arbitrary code; only load pickle
# files that were produced by this project.
with open('../data/neural_network_config/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

print("creating tags and tag index for classes")
# Encode each article's tag collection as one multi-hot row of `y`, with one
# column per distinct tag seen in `df.common_tags`.
multilabel_binarizer = MultiLabelBinarizer()
y = multilabel_binarizer.fit_transform(df.common_tags)

# Persist the fitted binarizer so predicted column indices can be mapped back
# to tag names later.
with open('../data/neural_network_config/multilabel_binarizer.pickle', 'wb') as f:
    pickle.dump(multilabel_binarizer, f, protocol=pickle.HIGHEST_PROTOCOL)

ValueError: Expected object or value

In [None]:
print("separating data into test data and train data")
# Article texts; `y` holds the label row for each of them.
sentences = df['content'].values
# Fixed length (in token ids) of every model input sequence.
maxlen = 700

# Hold out a quarter of the articles; the fixed seed keeps the split
# reproducible between runs.
sentences_train, sentences_test, y_train, y_test = train_test_split(
    sentences,
    y,
    test_size=0.25,
    random_state=1000,
)

# Texts -> token-id sequences -> arrays of exactly `maxlen` columns, with
# padding appended after the tokens.
# NOTE(review): `truncating` is left at its default ('pre' in Keras), so texts
# longer than `maxlen` would lose their beginning -- confirm this is intended.
X_train = pad_sequences(
    tokenizer.texts_to_sequences(sentences_train),
    padding='post',
    maxlen=maxlen,
)
X_test = pad_sequences(
    tokenizer.texts_to_sequences(sentences_test),
    padding='post',
    maxlen=maxlen,
)

# Index 0 is reserved, hence one more than the number of known words.
vocab_size = 1 + len(tokenizer.word_index)
# One output unit per label column of `y`.
output_size = y.shape[1]


In [None]:
print("creating model")
# Passed as the first positional argument of Conv1D, i.e. the number of
# convolution filters per layer; the kernel size is the literal 5 below.
filter_length = 300

# Embedding -> dropout -> two 1-D convolutions -> global max pooling -> one
# output unit per label.
model = Sequential()
model.add(Embedding(vocab_size, 20, input_length=maxlen))
model.add(Dropout(0.15))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(Conv1D(filter_length, 5, activation='relu'))
model.add(GlobalMaxPool1D())
# The targets are multi-hot rows (several tags can be active for one article),
# so every output unit needs its own independent probability. Softmax would
# force the outputs to sum to 1 and make the tags compete with each other,
# which does not match the per-label binary_crossentropy loss used below.
# Models trained with the previous softmax head must be retrained.
model.add(Dense(output_size, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

print("saving model")
# Serialize the architecture (not the weights) to JSON.
model_json = model.to_json()
with open("../data/neural_network_config/model.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Per-epoch metrics go to log_loss.csv (semicolon separated, overwritten on
# every run).
csv_logger = CSVLogger('log_loss.csv', separator=';', append=False)

# Order is kept as-is: learning-rate reduction, early stopping after 4 epochs
# without improvement, best-only checkpointing, then CSV logging.
callbacks = [
    ReduceLROnPlateau(),
    EarlyStopping(patience=4),
    ModelCheckpoint(
        filepath='../data/neural_network_config/temp-model-new.h5',
        save_best_only=True,
    ),
    csv_logger,
]

# NOTE(review): the held-out test split doubles as validation data, so early
# stopping, LR scheduling and checkpoint selection all see it; the "Testing
# Accuracy" printed below is therefore likely optimistic -- consider a
# separate validation split.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    batch_size=40,
    epochs=40,
    callbacks=callbacks,
)

# Report accuracy on both splits, training first.
# NOTE(review): EarlyStopping is not asked to restore the best weights, so
# these numbers presumably describe the last-epoch model rather than the
# checkpointed one -- confirm.
for report_template, features, targets in (
    ("Training Accuracy: {:.4f}", X_train, y_train),
    ("Testing Accuracy:  {:.4f}", X_test, y_test),
):
    loss, accuracy = model.evaluate(features, targets, verbose=False)
    print(report_template.format(accuracy))


In [None]:
# Only the weights of the in-memory model are written here; the architecture
# is stored separately as model.json.
weights_path = "../data/neural_network_config/model.h5"
model.save_weights(weights_path)
print("Saved model to disk")