In [1]:
import numpy as np
import pandas as pd
from keras.layers import Dense, Flatten, Dropout, Activation
from keras.models import Sequential
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping
from keras.utils import to_categorical
from keras.optimizers import RMSprop
import os
import matplotlib
import matplotlib.pyplot as pl
from hyperas import optim
from hyperas.distributions import choice, uniform, conditional
from hyperopt import Trials, STATUS_OK, tpe

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.
  return f(*args, **kwds)


In [2]:
'''
Data loading and preprocessing for Hyperas
'''
def data():
    #I have handpicked columns based on intuition after a handful of tries with all features or Dimensionality Reduction produced poor results
    cols = [' title_sentiment_polarity', ' n_tokens_title', ' data_channel_is_lifestyle', ' data_channel_is_entertainment', ' data_channel_is_bus', ' data_channel_is_socmed', ' data_channel_is_tech', ' data_channel_is_world', ' weekday_is_monday', ' weekday_is_tuesday', ' weekday_is_wednesday', ' weekday_is_thursday', ' weekday_is_friday', ' weekday_is_saturday', ' weekday_is_sunday', ' is_weekend', ' global_sentiment_polarity', ' shares']
    raw_data = pd.read_csv('data/OnlineNewsPopularity.csv', usecols=cols)
    
    #Dividing records into one of 4 classes. I tried to divide them in a way that the data stays balanced
    def categorizeShares(shares):
        if shares <= 945:   #FEW
            return 0        
        if shares <= 1400:  #MODEST
            return 1        
        if shares <= 2700:  #LOT
            return 2        
        return 3            #POPULAR
    
    
    #The numerical features are logarithmic scaled (based on the dataset authors) making my job relatively easy on preprocessing.
    
    multi_class = raw_data.copy(deep=True)
    multi_class['label'] = multi_class[' shares'].apply(categorizeShares)
    multi_class['label'].value_counts()
    multi_class.drop([' shares'], axis=1, inplace=True)

    #One-hot encoding labels
    y = multi_class['label']
    y = to_categorical(y, num_classes=4)
    multi_class.drop(['label'], axis=1, inplace=True)
    
    #Fill possible corrupted data & shuffle rows
    multi_class.fillna("", inplace=True)
    multi_class = multi_class.reindex()
    
    #Picking "best" features based on feature importance
    def constructBestFeatures(df):
        from sklearn.tree import DecisionTreeClassifier
        if 'url' in multi_class.keys():
            multi_class.drop(['url'], axis=1, inplace=True)

        classifier = DecisionTreeClassifier()
        classifier.fit(multi_class, y)


        importances = []
        for name, importance in zip(multi_class.columns, classifier.feature_importances_):
            importances.append((name, importance))

        num_cols = len(multi_class.columns)
        #Sorting features by importance
        importances_sorted = sorted(importances, key=lambda x: x[1])
        importances_sorted
        cols = []
        
        '''
        Picking the top 14 features: The number 14 came by fine-tuning, 
        the top 14 features of the handpicked subset of features seems to be the strongest combination
        '''
        for imp_tupl in importances_sorted[-14:]:
            cols.append(imp_tupl[0])
        return cols

    cols_m = constructBestFeatures(multi_class)
    best_features_m = multi_class[cols_m]
    
    X_train, X_test, y_train, y_test = train_test_split(best_features_m, y, test_size=0.2, random_state=42)
    return X_train, X_test, y_train, y_test

In [3]:
'''
For multi-class classification I came up with a fully connected Neural Network.
For hyper parameter optimization I am using Hyperas, hence the multiple parameters below.
The dataset is relatively small, so I had to be careful choosing validation set in order to have enough training data.
20% seems ok.
'''
def create_model(X_train, X_test, y_train, y_test):
    
    print(X_train.shape, y_train.shape, X_test.shape, y_test.shape[1])
    model = Sequential()
    model.add(Dense({{choice([16, 32 ,64, 128, 256, 512])}}, activation='relu', input_dim=X_train.shape[1]))


    model.add(Dense({{choice([16, 32 ,64])}}, activation='relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([16, 32 ,64])}}, activation='relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    model.add(Dense({{choice([16, 32 ,64])}}, activation='relu'))
    model.add(Dropout({{uniform(0, 1)}}))
    
    #For multi-class classification we are using softmax as activation and categorical-crossentropy as loss function
    model.add(Dense(4, activation='softmax'))
    model.compile(optimizer={{choice(['rmsprop', 'adam'])}},
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    #According to my tests (due to the small dataset) 10 epoch is enough most of the time for the loss function to converge.
    model.fit(np.array(X_train), np.array(y_train), epochs=10, batch_size={{choice([16, 32, 64, 128])}}, validation_data=(np.array(X_test), np.array(y_test)),callbacks=[EarlyStopping(monitor='val_loss', mode='auto')])
    score, acc = model.evaluate(np.array(X_test), np.array(y_test), verbose=0)
    print('Test accuracy:', acc)
    return {'loss': -acc, 'status': STATUS_OK, 'model': model}


In [4]:
#Running hyper-param optimalization and choosing the best model
best_run, best_model = optim.minimize(model=create_model,
                                          data=data,
                                          algo=tpe.suggest,
                                          max_evals=10,
                                          trials=Trials(),
                                          notebook_name='fully_connected_multi')
X_train, X_test, y_train, y_test = data()
print("Evalutation of best performing model:")
print(best_model.evaluate(np.array(X_test), np.array(y_test)))
print("Best performing model chosen hyper-parameters:")
print(best_run)

>>> Imports:
#coding=utf-8

try:
    import numpy as np
except:
    pass

try:
    import pandas as pd
except:
    pass

try:
    from keras.layers import Dense, Flatten, Dropout, Activation
except:
    pass

try:
    from keras.models import Sequential
except:
    pass

try:
    from sklearn.model_selection import train_test_split
except:
    pass

try:
    from keras.callbacks import EarlyStopping
except:
    pass

try:
    from keras.utils import to_categorical
except:
    pass

try:
    from keras.optimizers import RMSprop
except:
    pass

try:
    import os
except:
    pass

try:
    import matplotlib
except:
    pass

try:
    import matplotlib.pyplot as pl
except:
    pass

try:
    from hyperas import optim
except:
    pass

try:
    from hyperas.distributions import choice, uniform, conditional
except:
    pass

try:
    from hyperopt import Trials, STATUS_OK, tpe
except:
    pass

try:
    from sklearn.tree import DecisionTreeClassifier
except:
    pass

>>> Hyperas search s

Epoch 3/10
Test accuracy: 0.31706394248207787
(31715, 14) (31715, 4) (7929, 14) 4
Train on 31715 samples, validate on 7929 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Test accuracy: 0.25879682178214064
(31715, 14) (31715, 4) (7929, 14) 4
Train on 31715 samples, validate on 7929 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test accuracy: 0.32778408378832485
(31715, 14) (31715, 4) (7929, 14) 4
Train on 31715 samples, validate on 7929 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Test accuracy: 0.31113633495033244
(31715, 14) (31715, 4) (7929, 14) 4
Train on 31715 samples, validate on 7929 samples
Epoch 1/10
Epoch 2/10
Test accuracy: 0.25879682178214064
(31715, 14) (31715, 4) (7929, 14) 4
Train on 31715 samples, validate on 7929 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Test accuracy: 0.34014377605721124
Evalutation of best performing model:
Best performing model chosen hyper-parameters:
{'Dense': 0, 'Dense_1': 1, 'Dense_2': 2, 'Dense_3

In [5]:
#Save best model
def save_model(model,model_name, weights_name):
    model_json = model.to_json()
    with open(model_name, "w") as f:
        f.write(model_json)
    model.save_weights(weights_name)
    print("Model saved")

In [6]:
save_model(best_model, 'models/multi_model.json', 'models/multi_weights.h5')

Model saved


In [7]:
X_train[0:1]

Unnamed: 0,data_channel_is_socmed,data_channel_is_tech,data_channel_is_bus,data_channel_is_world,data_channel_is_entertainment,is_weekend,weekday_is_friday,weekday_is_monday,weekday_is_tuesday,weekday_is_thursday,weekday_is_wednesday,title_sentiment_polarity,n_tokens_title,global_sentiment_polarity
19837,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.333333,15.0,0.047722


In [8]:
np.argmax(best_model.predict(np.array(X_train[0:1])))

1

In [17]:
print(X_train.loc[1].to_json("row{}.json".format(0)))

KeyError: 'the label [1] is not in the [index]'