In [1]:
import numpy as np
import pandas as pd
from keras.layers import Dense, Flatten, Dropout, Activation
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.optimizers import RMSprop
import os
import matplotlib
import matplotlib.pyplot as pl
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from hyperopt import Trials, STATUS_OK, tpe

from keras.objectives import binary_crossentropy
from keras.metrics import binary_accuracy

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
'''
I created this notebook to first approach the task as a binary classification problem. That made it easier to find the most
promising features and the algorithm to go with.
I found this "paper" (http://cs229.stanford.edu/proj2015/328_report.pdf), which claims ~0.69
accuracy with RandomForest on the same dataset (binary classification).
I couldn't reproduce their result, not even with their best features. More surprisingly, their best features did not turn out to be the best in my experiments at all.
With a fully connected neural network on the binary classification problem, my best result was ~0.63 accuracy.
'''

def data():
    """Load the Online News Popularity data and build a binary train/test split.

    Reads a fixed subset of columns from ``data/OnlineNewsPopularity.csv``,
    standardizes the title token count, derives a binary label from the share
    count (0 when shares <= 1400, otherwise 1), keeps the 14 columns that a
    decision tree ranks as most important, and splits the rows 80/20.

    This function is handed to ``optim.minimize`` and is also called directly
    afterwards, so it has to produce the same feature columns and the same
    split on every call. Both the decision tree and the split are therefore
    seeded. Everything it needs is defined inside its own body.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``; the X parts are
        DataFrames holding the selected feature columns, the y parts are
        integer Series named ``label``.
    """
    seed = 42
    shares_threshold = 1400
    n_best_features = 14

    cols = [' title_sentiment_polarity', ' n_tokens_title', ' data_channel_is_lifestyle', ' data_channel_is_entertainment', ' data_channel_is_bus', ' data_channel_is_socmed', ' data_channel_is_tech', ' data_channel_is_world', ' weekday_is_monday', ' weekday_is_tuesday', ' weekday_is_wednesday', ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday', ' weekday_is_sunday', ' is_weekend', ' global_sentiment_polarity', ' shares']
    raw_data = pd.read_csv('data/OnlineNewsPopularity.csv', usecols=cols)

    # Standardize n_tokens_title (zero mean, unit variance over all rows).
    centered = raw_data[' n_tokens_title'] - raw_data[' n_tokens_title'].mean()
    raw_data[' n_tokens_title'] = centered / centered.std()

    # Create two classes, they are roughly equal in number.
    # Written as "not (shares <= threshold)" so anything that is not at or
    # below the threshold lands in class 1.
    y = (~raw_data[' shares'].le(shares_threshold)).astype('int64').rename('label')
    features = raw_data.drop(columns=[' shares'])

    # Fill gaps with a number, not a string: a string fill would make the
    # column non-numeric and the classifier fit below would reject it.
    features = features.fillna(0)

    def constructBestFeatures(df, labels):
        """Name the top-ranked columns of ``df``, least important first."""
        from sklearn.tree import DecisionTreeClassifier

        candidates = df.drop(columns=['url'], errors='ignore')

        # Seeded so that repeated calls select the same columns in the same order.
        classifier = DecisionTreeClassifier(random_state=seed)
        classifier.fit(candidates, labels)

        ranked = sorted(zip(candidates.columns, classifier.feature_importances_),
                        key=lambda pair: pair[1])
        return [name for name, _ in ranked[-n_best_features:]]

    # NOTE(review): the standardization above and this ranking both use every
    # row, including the rows that end up in the test split. Consider fitting
    # them on the training rows only.
    best_cols = constructBestFeatures(features, y)
    print(best_cols)
    best_features = features[best_cols]

    X_train, X_test, y_train, y_test = train_test_split(best_features, y, test_size=0.2, random_state=seed)
    return X_train, X_test, y_train, y_test

In [3]:

def create_model(X_train, X_test, y_train, y_test):
    """Build, train and score one candidate network for the hyperas search.

    The double-brace expressions in the body are hyperas search-space
    placeholders: the widths of the two hidden layers, the optimizer and the
    batch size are each picked from the listed options for every trial.
    Keep their textual form unchanged; hyperas processes the source of this
    function.

    Args:
        X_train: Training features; ``shape[1]`` sets the input dimension.
        X_test: Features used to score the trained candidate.
        y_train: Training labels.
        y_test: Labels matching ``X_test``.

    Returns:
        Dict with ``'loss'`` (negated test accuracy), ``'status'``
        (``STATUS_OK``) and ``'model'`` (the trained model).
    """
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)
    model = Sequential()
    # Fixed 64-unit input layer followed by two hidden layers of searched width.
    model.add(Dense(64, kernel_initializer='random_uniform', activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense({{choice([32,64, 128 ,256, 512, 1024])}}, activation='relu'))
    model.add(Dense({{choice([32,64, 128 ,256, 512, 1024])}}, activation='relu'))

    # Single sigmoid unit: one score in (0, 1) per row for the binary label.
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer={{choice(['rmsprop', 'adam', 'sgd'])}},
                  loss=binary_crossentropy, metrics=[binary_accuracy])
    # 10% of the training rows are held out for validation; EarlyStopping
    # watches val_loss with its patience left at the library default.
    history = model.fit(np.array(X_train), np.array(y_train), epochs=15, batch_size={{choice([32, 64, 128])}}, validation_split=0.1, callbacks=[EarlyStopping(monitor='val_loss', mode='auto')])
    # NOTE(review): the test split scores every candidate here and is also used
    # for the final evaluation of the best model after the search.
    score, acc = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)
    print('Test accuracy:', acc)
    
    # The search minimizes 'loss', so accuracy is negated.
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}
# Run the hyperparameter search: create_model is the objective, data supplies
# the split, TPE proposes candidates and max_evals caps the number of trials.
# notebook_name presumably has to match this notebook's file name - verify.
best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=5,
                                          trials=Trials(),
                                          notebook_name='fully_connected_binary')
# data() is called again to get the split into this namespace; this re-reads
# the CSV and re-runs the feature selection.
X_train, X_test, y_train, y_test = data()
print("Evalutation of best performing model:")
score, acc = best_model.evaluate(np.array(X_test), np.array(y_test))
print(score, acc)
print("Best performing model chosen hyper-parameters:")
# The recorded output suggests the values are indices into each choice list
# rather than the chosen values themselves - confirm against hyperas docs.
print(best_run)

>>> Imports:
#coding=utf-8

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    from keras.layers import Dense, Flatten, Dropout, Activation
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from keras.callbacks import EarlyStopping
except:
    pass

try:
    from keras.utils import to_categorical
except:
    pass

try:
    from keras.optimizers import RMSprop
except:
    pass

try:
    import os
except:
    pass

try:
    import matplotlib
except:
    pass

try:
    import matplotlib.pyplot as pl
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform, conditional
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from keras.objectives import binary_crossentropy
except:
    pass

try:
    from keras

Epoch 3/15
Test accuracy: 0.6245428175955676
(31715, 14) (31715,) (7929, 14) (7929,)
Train on 28543 samples, validate on 3172 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Test accuracy: 0.618867448613899
[' data_channel_is_lifestyle', ' weekday_is_friday', ' data_channel_is_bus', ' data_channel_is_tech', ' weekday_is_monday', ' weekday_is_wednesday', ' weekday_is_thursday', ' weekday_is_tuesday', ' is_weekend', ' data_channel_is_world', ' data_channel_is_entertainment', ' n_tokens_title', ' title_sentiment_polarity', ' global_sentiment_polarity']
Evalutation of best performing model:
Best performing model chosen hyper-parameters:
{'Dense': 3, 'Dense_1': 4, 'batch_size': 2, 'optimizer': 1}


In [4]:
#Save best model
def save_model(model, model_name, weights_name):
    """Persist a model as an architecture JSON file plus a separate weights file.

    Missing parent directories of both paths are created first, so saving
    into a folder that does not exist yet (such as ``models/``) does not fail.

    Args:
        model: Object exposing ``to_json()`` and ``save_weights(path)``.
        model_name: Destination path of the JSON architecture file.
        weights_name: Destination path passed to ``model.save_weights``.
    """
    for target in (model_name, weights_name):
        parent = os.path.dirname(target)
        # dirname is '' for a bare file name, and makedirs('') would raise.
        if parent:
            os.makedirs(parent, exist_ok=True)

    with open(model_name, "w") as f:
        f.write(model.to_json())
    model.save_weights(weights_name)
    print("Model saved")

In [5]:
# Write the best model's architecture (JSON) and weights (HDF5) under models/.
save_model(best_model, 'models/binary_model.json', 'models/binary_weights.h5')

Model saved


In [6]:
# Sanity check: run the best model on the first training row; the network ends
# in a single sigmoid unit, so the result is one score in (0, 1).
best_model.predict(np.array(X_train[:1]))

array([[0.46900886]], dtype=float32)