In [1]:
import pandas as pd
import numpy as np
import scipy.sparse as ssp

from DSI_Capstone_Steemit.utils.utils import(
    load_data_and_description,
)
from sklearn.model_selection import train_test_split

import os

data_directory = '../data/'
input_directory = os.path.join(data_directory,'networkx_votes')

def load_joblib(filename):
    return joblib.load(os.path.join(input_directory,filename))

data,feature_names,data_desc = load_data_and_description(data_type='tfidf')
data_desc['log total_payout_value'] = np.log(data_desc['total_payout_value'])



Loading tfidf


In [2]:
import joblib

In [3]:
def add_network_features(df):

    hubs,authorities = load_joblib('hits')
    cluster = load_joblib('parts')
    pagerank = load_joblib('prank') 
    eig_cent = load_joblib('eig_cent') 
    core_k = load_joblib('core_k') 

    df['Cluster'] = df['author'].map(cluster)
    df.loc[:,'Cluster Condense'] = df['Cluster']
    df.loc[~df['Cluster'].isin([1,3,0,2,5,4]),'Cluster Condense'] = 'Other'
    df['Hubs'] = df['author'].map(hubs) * 10000
    df['Authorities'] = df['author'].map(authorities) * 10000
    df['Page Rank'] = df['author'].map(pagerank) * 10000
    df['Eigen Centrality'] = df['author'].map(eig_cent)* 10000
    df['Core K'] = df['author'].map(core_k)*10000
    return df
data_desc = add_network_features(data_desc)

network_cols = ['Page Rank','Cluster','Hubs','Authorities','Page Rank','Eigen Centrality']

In [27]:

# Remove middle value articles

idx1 = data_desc['log total_payout_value'] < 1.2
idx2 = data_desc['log total_payout_value'] >2.5

idx_not = (~idx1) & (~idx2)

data_desc = data_desc[~idx_not]
data = data[~idx_not.values,:]
y = data_desc['log total_payout_value'] >2.5

# For Regression
# y = data_desc['log total_payout_value']
value_counts = data_desc['category'].value_counts()
top_categories = value_counts.index[value_counts > np.percentile(data_desc['category'].value_counts(),97)]
idx = data_desc['category'].isin(top_categories)
data_desc['top category'] = idx.astype(int)

data_desc['top category listed'] = data_desc.ix[data_desc['top category'].values.astype(bool) ,'category']

data_desc['top category listed'] = data_desc['top category listed'].fillna('Other')


post_features = ['number of body tags',
                                   'number of body urls',
                                   'number of image urls',
                                   'number of body mentions',
                                   'number of image urls',
                                   'number of youtube urls',
                                   'language',
                                   'author_reputation_scaled',
                                   'number of steem counts',
                                'top category'] + network_cols


train_features = data_desc.ix[:,post_features].fillna(0)

train = pd.get_dummies(train_features,columns=['language','Cluster'])

num_image_urls = train['number of image urls'].values[:,0]
train.drop('number of image urls',axis = 1, inplace=True)

train['number of image urls'] = num_image_urls

training_names = train.columns

train_sparse = ssp.csr_matrix(train)
new_data = ssp.hstack([data,train_sparse])
train = new_data.tocsr()

# All samples
number_of_samples = train.shape[0]

X_train, X_test, y_train, y_test = train_test_split(
    train, y, test_size=0.33, random_state=42)


In [28]:
training_names

Index([u'number of body tags', u'number of body urls',
       u'number of body mentions', u'number of youtube urls',
       u'author_reputation_scaled', u'number of steem counts', u'top category',
       u'Page Rank', u'Hubs', u'Authorities',
       ...
       u'Cluster_5', u'Cluster_7', u'Cluster_10', u'Cluster_11', u'Cluster_27',
       u'Cluster_33', u'Cluster_78', u'Cluster_85', u'Cluster_218',
       u'number of image urls'],
      dtype='object', length=114)

In [29]:
X_train.shape

(56905, 21378)

In [30]:
import keras
from keras.models import Sequential
from keras.layers import Dense
import numpy


In [69]:
# samples = X_train
X = X_train.toarray()#[0:samples]
y = y_train.astype(int).values#[0:samples]
# y = keras.utils.np_utils.to_categorical(y)



X_val = X_test.toarray()#[0:samples]

y_val = y_test.astype(int).values#[0:samples]
# y_val = keras.utils.np_utils.to_categorical(y_val)




In [70]:
pd.DataFrame(y_val).drop_duplicates()

Unnamed: 0,0
0,1
2,0


In [32]:
from sklearn.feature_selection import f_classif

In [34]:
f_values = f_classif(X,y_train.values)



In [35]:
values,pvalues = f_values

In [36]:
idx = (pvalues < 0.005)

  if __name__ == '__main__':


In [37]:
all_features = np.array(feature_names + list(training_names))
all_features[idx]

array([u'academia', u'adventure', u'ai', u'alcohol', u'alien',
       u'allasyummyfood', u'amsterdam', u'anarchism', u'anarchy',
       u'animal', u'anyxphotos', u'architecture', u'art', u'artwork',
       u'astronomy', u'australia', u'author', u'autism', u'badge', u'bali',
       u'banking', u'basketball', u'beach', u'berlin', u'better',
       u'beyondbitcoin', u'biology', u'bitcoin', u'bittrex', u'blog',
       u'book', u'boutique', u'brain', u'bravenewcoin', u'breakfast',
       u'bsotn', u'burstcoin', u'business', u'ca', u'casa', u'cervantes',
       u'chemistry', u'chemistry-lesson', u'chemtrails', u'chinadaily',
       u'chocolate', u'christ', u'cn-qap', u'cn-stats', u'coding',
       u'coffee', u'cognitive-biases', u'comment', u'compassion',
       u'computer', u'consciousness', u'contest', u'cooking', u'craft',
       u'craigrant', u'creative', u'creativity', u'crisis',
       u'crowdfundedwhale', u'crypto-news', u'cryptocurrency', u'culture',
       u'curation', u'curators-wa

In [71]:
X_val = X_val[:,idx]
X = X[:,idx]

In [72]:
print X.shape
print X_val.shape

(56905, 396)
(28029, 396)


## Feature Selection

In [93]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import log_loss, roc_auc_score,accuracy_score
import sys

from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.optimizers import Adadelta, Adam, rmsprop


space = {
        'lr': hp.uniform('lr', 0.0001,0.1),
}

dropout = 0.5
def f_nn(params):   

    print ('Params testing: ', params)
    model = Sequential()
    model.add(Dense(output_dim=1000, input_dim = X.shape[1])) 
    model.add(Activation('relu'))
    model.add(Dropout(dropout))


    model.add(Dense(output_dim=500))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))
    
    model.add(Dense(output_dim=100))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))

    model.add(Dense(output_dim=10))
    model.add(Activation('relu'))
    model.add(Dropout(dropout))


    model.add(Dense(1))
    model.add(Activation('sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.adagrad(params['lr']))

    model.fit(X, y, nb_epoch=10, batch_size=32, verbose = 1)

    pred_auc =model.predict_classes(X_val, batch_size = 32, verbose = 0)
    acc = accuracy_score(y_val, pred_auc)
    
    print('Accuracy:', acc)
    sys.stdout.flush() 
    return {'loss': -acc, 'status': STATUS_OK}


trials = Trials()
best = fmin(f_nn, space, algo=tpe.suggest, max_evals=25, trials=trials)
print 'best: '
print best


('Params testing: ', {'lr': 0.07438498681878454})
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
('Accuracy:', 0.51097078026329867)
('Params testing: ', {'lr': 0.07652653327619954})
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
('Accuracy:', 0.51097078026329867)
('Params testing: ', {'lr': 0.0824617800009953})
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [81]:
model = Sequential()
model.add(Dense(output_dim=500, input_dim = X.shape[1])) 
model.add(Activation('relu'))
model.add(Dropout(dropout))


model.add(Dense(output_dim=250))
model.add(Activation('relu'))
model.add(Dropout(dropout))

model.add(Dense(output_dim=100))
model.add(Activation('relu'))
model.add(Dropout(dropout))

model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer=keras.optimizers.adagrad(0.034118198130292786))

model.fit(X, y, nb_epoch=3, batch_size=32, verbose = 1)



Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x391937ad0>

In [91]:
acc

0.65546398373113557

In [87]:
y_val

array([1, 1, 0, ..., 0, 1, 1])