# Test of the spark_sklearn fit for the NLP model

In [3]:
!pip install spark_sklearn --user

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.[0m
Collecting spark_sklearn
  Downloading https://files.pythonhosted.org/packages/b0/3f/34b8dec7d2cfcfe0ba99d637b4f2d306c1ca0b404107c07c829e085f6b38/spark-sklearn-0.3.0.tar.gz
Collecting scikit-learn<0.20,>=0.18.1 (from spark_sklearn)
[?25l  Downloading https://files.pythonhosted.org/packages/bc/67/370aa248f54769a56216707ad7b9af19745e85a603fafa47bde353f327fb/scikit_learn-0.19.2-cp27-cp27mu-manylinux1_x86_64.whl (5.0MB)
[K     |████████████████████████████████| 5.0MB 10.8MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: spark-sklearn
  Building wheel for spark-sklearn (setup.py) ... [?25ldone
[?25h  Stored in directory: /tmp/llayer/.cache/pip/wheels/64/28/e8/cb0250888675c630786f932dcc63ed96ac1aca299bcfb7235f
Successfully built spark-sklearn
Installing col

In [1]:
import numpy as np
import pandas as pd

### Indexer

In [2]:
def to_index(sites, codes):
    """Map every site and every code to its position in the given sequence.

    Args:
        sites: iterable of hashable site identifiers.
        codes: iterable of hashable error codes.

    Returns:
        Tuple ``(sites_index, codes_index)`` of dicts ``item -> position``.
        If an item occurs more than once, its last position is kept.
    """
    def positions(items):
        lookup = {}
        for position, item in enumerate(items):
            lookup[item] = position
        return lookup

    return positions(sites), positions(codes)


def prune_to_index(codes, sites, only_unknown = False, counts = False, error_threshold = 0, site_threshold = 0):
    """Build site and error-code indices, collapsing pruned entries into one shared slot.

    Entries that survive the pruning get consecutive indices ``0 .. n_good - 1``
    in the order in which they appear in the input frames (deterministic across
    runs). Every pruned entry is mapped to the single index ``n_good``.

    Args:
        codes: DataFrame with an ``'error'`` column and, when ``error_threshold > 0``,
            a ``'frequency'`` or ``'counts'`` column.
        sites: DataFrame with a ``'site'`` column, an ``'only_unknown'`` column when
            ``only_unknown`` is set, and a ``'frequency'`` or ``'counts'`` column when
            ``site_threshold > 0``.
        only_unknown: if True, prune the sites whose ``'only_unknown'`` flag is not False.
        counts: if False the thresholds are compared against the ``'frequency'``
            column, otherwise against the ``'counts'`` column.
        error_threshold: codes are kept only if their value is strictly greater
            than this threshold; ``0`` disables the cut.
        site_threshold: same as ``error_threshold`` but for sites.

    Returns:
        Tuple ``(sites_index, codes_index)`` of dicts ``item -> index``.
    """
    # Same comparison as the original so that e.g. counts=None keeps selecting 'counts'.
    value_column = 'frequency' if counts == False else 'counts'

    def build_index(items, keep):
        # keep is None -> nothing was pruned, index the items exactly as given.
        if keep is None:
            good = list(items)
        else:
            # Preserve the frame order instead of relying on set iteration order.
            seen = set()
            good = []
            for item in items:
                if item in keep and item not in seen:
                    seen.add(item)
                    good.append(item)

        index = {}
        for position, item in enumerate(good):
            index[item] = position

        if keep is not None:
            # All pruned items share one extra slot right after the good ones.
            overflow = len(good)
            for item in items:
                if item not in keep:
                    index[item] = overflow
        return index

    all_sites = list(sites['site'])
    all_codes = list(codes['error'])

    keep_sites = None
    if only_unknown == True:
        keep_sites = set(sites[sites['only_unknown'] == False]['site'])
    if site_threshold > 0:
        frequent_sites = set(sites[sites[value_column] > site_threshold]['site'])
        keep_sites = frequent_sites if keep_sites is None else keep_sites & frequent_sites

    keep_codes = None
    if error_threshold > 0:
        keep_codes = set(codes[codes[value_column] > error_threshold]['error'])

    sites_index = build_index(all_sites, keep_sites)
    codes_index = build_index(all_codes, keep_codes)

    return sites_index, codes_index

### Load data

In [3]:
def load_data(path, msg_only = False, sample = False, sample_fact = 3):
    """Load the task frame plus the code and site tables from one HDF5 file.

    Args:
        path: path of the HDF5 file passed to ``pd.read_hdf``.
        msg_only: if False, sites/codes are read from the keys ``'frame2'``/``'frame3'``.
            If True, codes/sites are read from ``'frame4'``/``'frame5'`` and their
            ``'errors_msg'``/``'sites_msg'`` columns are renamed to ``'error'``/``'site'``.
        sample: if True, down-sample the rows with ``label == 0`` and keep all
            rows with ``label == 1``.
        sample_fact: number of label-0 rows to keep per label-1 row when sampling.

    Returns:
        Tuple ``(actionshist, codes, sites)`` of DataFrames.
    """
    actionshist = pd.read_hdf(path, 'frame')

    print( actionshist['label'].value_counts() )

    if sample == True:
        minority_class = actionshist[actionshist['label'] == 1]
        majority_class = actionshist[actionshist['label'] == 0]
        # Never request more rows than exist: DataFrame.sample raises otherwise.
        n_samples = min(int(sample_fact*len(minority_class)), len(majority_class))
        majority_class_sampled = majority_class.sample(n_samples , random_state=42)
        # Single formatted string so the line reads the same under Python 2 and 3.
        print('After sampling: Minority class %d Majority class %d'
              % (len(minority_class), len(majority_class_sampled)))
        actionshist = pd.concat([minority_class, majority_class_sampled])

    if msg_only == False:
        sites = pd.read_hdf(path, 'frame2')
        codes = pd.read_hdf(path, 'frame3')
    else:
        # The message-only tables use different column names; align them with
        # the 'error' / 'site' names used by the count-based tables.
        codes = pd.read_hdf(path, 'frame4').rename(columns={'errors_msg': 'error'})
        sites = pd.read_hdf(path, 'frame5').rename(columns={'sites_msg': 'site'})

    return actionshist, codes, sites

### Setup the data

In [4]:
# Experiment parameters

# MSG_ONLY is forwarded to load_data(msg_only=...) and to the batch generator
# ('only_msg'); with False the generator also fills the error/site count matrix.
MSG_ONLY = False
PRUNING = 'Neg'

# Down-sampling of the label-0 rows (see load_data)
SAMPLE = False
SAMPLE_FACT = 5

# Batch generator parameters
AVG_W2V = False
MAX_WORDS = 400
GEN_PARAM = {}
GEN_PARAM['averaged'] = AVG_W2V
GEN_PARAM['only_msg'] = MSG_ONLY 
GEN_PARAM['sequence'] = False
GEN_PARAM['max_msg'] = 1
GEN_PARAM['cut_front'] = True
TRAIN_ON_BATCH = True

# Model
MODEL = 'nlp_msg'

# Defines the input experiments for the machine learning
EXPERIMENTS = [
    
    # 1st experiment initial parameter
    {'NAME': 'NOMINAL_t', 'DIM':50, 'VOCAB': -1, 'ALGO': 'sg',
     'NLP_PARAM': {'cudnn': False, 'batch_norm': False, 'word_encoder': 'LSTM', 
                   'attention': False, 'include_counts': True, 'avg_w2v': False},
     'CALLBACK': { 'es': True, 'patience': 3, 'kill_slowstarts': True, 'kill_threshold': 0.51 }
    }
]

# The experiment used by every cell below
e = EXPERIMENTS[ 0 ]

# Load the data
path = '/eos/user/l/llayer/AIErrorLogAnalysis/data/input/' + 'input_' + 'NOMINAL' + '.h5'
actionshist, codes, sites = load_data(path, msg_only=MSG_ONLY,
                                      sample=SAMPLE, sample_fact = SAMPLE_FACT)
# Note: this stores the path inside EXPERIMENTS[0] as well, since `e` is the same dict.
e['NLP_PARAM']['embedding_matrix_path'] = '/eos/user/l/llayer/AIErrorLogAnalysis/data/word2vec/' + 'embedding_matrix_' + 'NOMINAL' + '.npy'

# Index the sites and codes; sites flagged as only_unknown share one pruned slot.
sites_index, codes_index = prune_to_index(codes, sites, only_unknown = True)

0    31839
1     1747
Name: label, dtype: int64


In [5]:
actionshist.head()

Unnamed: 0,task_name,label,error,site,site_state,count,msg_encoded,exit_code,error_type,avg
0,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T1_US_FNAL_Disk, T3_US_FNALLPC]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"
1,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T3_US_FNALLPC, T1_US_FNAL_Disk]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"
2,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T3_US_FNALLPC, T1_US_FNAL_Disk]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]"
3,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,0,"[-1, -1, -1, -1, 85]","[T1_DE_KIT_Disk, T0_CH_CERN_MSS, T0_CH_CERN_Ex...","[bad, bad, bad, bad, good]","[1, 1, 1, 1, 1]","[nan, nan, nan, nan, [35, 12, 10, 37, 186, 34,...","[nan, nan, nan, nan, 8021.0]","[nan, nan, nan, nan, Fatal Exception]","[nan, nan, nan, nan, [-1.26796770096, 1.129392..."
4,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,0,"[50664, -1, -1]","[T2_DE_RWTH, T2_DE_RWTH, T2_CH_CERN]","[good, good, good]","[2, 1, 1]","[[53, 2, 74, 141, 129, 198, 10, 200, 4, 32, 42...","[50664.0, nan, nan]","[PerformanceKill, nan, nan]","[[-0.292789727449, 1.32788562775, 0.7245721220..."


In [7]:
def insert_list(msgs):
    """Return a copy of ``msgs`` in which every non-list entry is replaced by ``[]``.

    Entries that are already lists are passed through unchanged (same object).
    """
    return [entry if isinstance(entry, list) else [] for entry in msgs]

In [8]:
# Add a 'msg' column: a copy of 'msg_encoded' where every non-list entry is replaced by [].
actionshist['msg'] = actionshist['msg_encoded'].apply(insert_list)

In [9]:
actionshist.head()

Unnamed: 0,task_name,label,error,site,site_state,count,msg_encoded,exit_code,error_type,avg,msg
0,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T1_US_FNAL_Disk, T3_US_FNALLPC]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[[], []]"
1,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T3_US_FNALLPC, T1_US_FNAL_Disk]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[[], []]"
2,/amaltaro_Run2016D-v2-DoubleMuonLowMass-07Aug1...,0,"[-1, -1]","[T3_US_FNALLPC, T1_US_FNAL_Disk]","[bad, bad]","[1, 1]","[nan, nan]","[nan, nan]","[nan, nan]","[nan, nan]","[[], []]"
3,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,0,"[-1, -1, -1, -1, 85]","[T1_DE_KIT_Disk, T0_CH_CERN_MSS, T0_CH_CERN_Ex...","[bad, bad, bad, bad, good]","[1, 1, 1, 1, 1]","[nan, nan, nan, nan, [35, 12, 10, 37, 186, 34,...","[nan, nan, nan, nan, 8021.0]","[nan, nan, nan, nan, Fatal Exception]","[nan, nan, nan, nan, [-1.26796770096, 1.129392...","[[], [], [], [], [35, 12, 10, 37, 186, 34, 25,..."
4,/amaltaro_Run2018A-v1-DoubleMuon-17Sep2018_102...,0,"[50664, -1, -1]","[T2_DE_RWTH, T2_DE_RWTH, T2_CH_CERN]","[good, good, good]","[2, 1, 1]","[[53, 2, 74, 141, 129, 198, 10, 200, 4, 32, 42...","[50664.0, nan, nan]","[PerformanceKill, nan, nan]","[[-0.292789727449, 1.32788562775, 0.7245721220...","[[53, 2, 74, 141, 129, 198, 10, 200, 4, 32, 42..."


In [10]:
# Keep only the four columns that are handed to Spark below.
t = actionshist[['label', 'error', 'site', 'msg']]

In [11]:
import pandas as pd
# Toy one-row frame whose cells hold nested lists.
test = [[[[2,1], []], [1,0]]]
tf = pd.DataFrame(test)

In [12]:
tf.head()

Unnamed: 0,0,1
0,"[[2, 1], []]","[1, 0]"


In [13]:
# NOTE(review): `spark` is not defined anywhere in the visible notebook — presumably a
# SparkSession injected by the notebook environment; confirm before running on a fresh kernel.
df = spark.createDataFrame(t)

In [14]:
df.take(1)

[Row(label=0, error=[u'-1', u'-1'], site=[u'T1_US_FNAL_Disk', u'T3_US_FNALLPC'], msg=[[], []])]

### Model

In [6]:
import os
import sys
import numpy as np
import keras
from keras.layers import Embedding, Input, Dense, LSTM, GRU, Bidirectional, TimeDistributed, Dropout, Flatten, Reshape
from keras.layers import average, Concatenate, Lambda, CuDNNLSTM, CuDNNGRU, Conv1D, GlobalMaxPooling1D, MaxPooling1D
from keras.regularizers import l2
from keras.models import Model
from keras.optimizers import Adam
from keras import backend as K
from keras.layers import BatchNormalization
K.set_floatx('float32')


class NLP():
    """Builds a Keras binary classifier over an (error code x site) grid of messages.

    Each cell of the grid holds one tokenized message. A shared word encoder
    (LSTM or Conv1D) turns every message into a fixed-size vector; optionally a
    two-channel count tensor is concatenated per cell; the result is flattened
    and fed through dense layers into a single sigmoid output.
    With ``avg_w2v=True`` the word encoder is replaced by a dense layer applied
    to a pre-computed vector per cell.
    """


    def __init__(self, num_classes, num_error, num_sites, max_sequence_length, embedding_matrix,
                 cudnn = False, batch_norm = False, word_encoder = 'LSTM', encode_sites = True, attention = False,
                 include_counts = False, avg_w2v = False, verbose = 1):
        """Store the configuration and the default hyperparameters.

        Args:
            num_classes: stored only; the visible model always ends in one sigmoid unit.
            num_error: number of distinct error-code indices (grid rows).
            num_sites: number of distinct site indices (grid columns).
            max_sequence_length: ``input_length`` of the embedding, input length of the
                Conv1D encoder, and vector size per cell when ``avg_w2v`` is True.
            embedding_matrix: array with ``astype`` and a 2-D ``shape``; cast to float32.
            cudnn: use the CuDNN recurrent layer (``hp['rnncud']``) instead of ``hp['rnn']``.
            batch_norm: add BatchNormalization after every dense block.
            word_encoder: ``'LSTM'`` or ``'Conv1D'``.
            encode_sites: stored only; ``create_model`` reads ``hp['encode_sites']`` instead.
            attention: add an attention layer on top of the recurrent encoder.
            include_counts: add the count tensor as a second model input.
            avg_w2v: use pre-averaged vectors instead of token sequences.
            verbose: ``1`` prints the hyperparameters and the model summary.
        """
        self.embedding_matrix = embedding_matrix.astype('float32')
        self.max_sequence_length = max_sequence_length
        self.num_error = num_error
        self.num_sites = num_sites
        self.num_classes = num_classes
        self.max_senten_num = num_error * num_sites
        self.cudnn = cudnn
        self.attention = attention
        self.word_encoder = word_encoder
        self.encode_sites = encode_sites
        self.batch_norm = batch_norm
        self.include_counts = include_counts
        self.avg_w2v = avg_w2v
        self.verbose = verbose
        # Hyperparameters
        self.hp = {
            # Regularization
            'l2_regulizer': 0.0001,
            'dropout':0.2,
            # Conv1D
            'filters':256,
            'kernel_size':3,
            'conv_layers':3,
            'max_pooling':3,
            'units_conv':10,
            # RNN with optional attention
            'train_embedding': False,
            'att_units':10,
            'rec_dropout':0.0,
            'rnn': LSTM, #TRY GRU
            'rnncud': CuDNNLSTM, # TRY CuDNNGRU
            'rnn_units' : 10,
            # Site encoding
            'encode_sites': False,
            'activation_site': 'relu', #TRY linear
            'units_site': 10,
            # Final layers
            'dense_layers': 3,
            'dense_units': 20,
            'learning_rate':0.0001,
            'decay':0.0
                    }


    def set_hyperparameters(self, tweaked_instances):
        """Overwrite existing hyperparameters.

        Args:
            tweaked_instances: dict ``name -> value``; every name must already
                exist in ``self.hp``.

        Raises:
            KeyError: if any name is unknown. Nothing is modified in that case.
        """
        # Validate everything first so a bad key cannot leave a half-applied update.
        unknown = [key for key in tweaked_instances if key not in self.hp]
        if unknown:
            raise KeyError(str(unknown[0]) + ' does not exist in hyperparameters')
        self.hp.update(tweaked_instances)


    def print_hyperparameters(self):
        """Print one ``name<TAB><TAB>value`` line per hyperparameter."""
        print('Hyperparameter\tCorresponding Value')
        for key, value in self.hp.items():
            # Pre-formatted string: print(a, b, c) would print a tuple under Python 2.
            print('%s\t\t%s' % (key, value))


    def get_embedding_layer( self ):
        """Create the embedding layer initialised with ``self.embedding_matrix``.

        Zero-masking is only enabled for the non-CuDNN recurrent encoder: the CuDNN
        recurrent layers and Conv1D do not accept masked input in Keras.
        """
        dims_embed = self.embedding_matrix.shape
        mask_zero = not (self.cudnn == True or self.word_encoder == 'Conv1D')

        embedding = Embedding(dims_embed[0], dims_embed[1], weights=[self.embedding_matrix],
                              input_length = self.max_sequence_length, mask_zero = mask_zero,
                              trainable = bool(int(self.hp['train_embedding'])))

        return embedding


    def word_encoder_lstm( self ):
        """Build the recurrent message encoder (token ids -> vector).

        Without attention the model outputs the recurrent layer's final output
        (``rnn_units`` values). With attention the recurrent layer returns the
        full sequence, which goes through a per-step dense layer and an
        ``AttentionWithContext`` layer.
        """
        word_input = Input(shape = ( None, ), dtype='int32')
        word_sequences = self.get_embedding_layer()(word_input)

        units = int(self.hp['rnn_units'])
        use_attention = not (self.attention == False)

        rnn_kwargs = {'kernel_regularizer': l2(self.hp['l2_regulizer'])}
        if use_attention:
            rnn_kwargs['return_sequences'] = True

        if self.cudnn == True:
            # The CuDNN layer is created without recurrent_dropout, as in the original code.
            recurrent = self.hp['rnncud'](units, **rnn_kwargs)
        else:
            recurrent = self.hp['rnn'](units, recurrent_dropout = self.hp['rec_dropout'], **rnn_kwargs)
        encoded = recurrent(word_sequences)

        if use_attention:
            encoded = TimeDistributed(Dense(int(self.hp['att_units'])))(encoded)
            # NOTE(review): AttentionWithContext is neither defined nor imported in the
            # visible part of this notebook; it must be provided before attention=True works.
            encoded = AttentionWithContext()(encoded)

        wordEncoder = Model(word_input, encoded)

        return wordEncoder


    def word_encoder_conv( self ):
        """Build the convolutional message encoder (token ids -> vector).

        ``conv_layers`` blocks of Conv1D + MaxPooling1D, then global max pooling
        and a dense layer with ``units_conv`` outputs.
        """
        #TODO add spatial dropout

        # Token ids are integers, same dtype as the recurrent encoder's input.
        word_input = Input(shape = ( self.max_sequence_length, ), dtype='int32')
        word_sequences = self.get_embedding_layer()(word_input)

        for _ in range(int(self.hp['conv_layers'])):
            word_sequences = Conv1D(self.hp['filters'], self.hp['kernel_size'],
                                    activation='relu',kernel_regularizer=l2(self.hp['l2_regulizer']))(word_sequences)
            word_sequences = MaxPooling1D(self.hp['max_pooling'])(word_sequences)

        word_sequences = GlobalMaxPooling1D()(word_sequences)
        word_sequences = Dense(int(self.hp['units_conv']), activation='relu',
                               kernel_regularizer=l2(self.hp['l2_regulizer']))(word_sequences)

        wordEncoder = Model(word_input, word_sequences)

        return wordEncoder


    def create_model( self ):
        """Assemble and compile the full model; it is also stored in ``self.model``.

        Returns:
            The compiled Keras ``Model``. Its inputs are ``sent_input`` alone, or
            ``[sent_input, count_input]`` when ``include_counts`` is True.

        Raises:
            ValueError: if token sequences are used (``avg_w2v`` False) and
                ``word_encoder`` is neither ``'LSTM'`` nor ``'Conv1D'``.
        """
        # Fail early with a clear message instead of a NameError further down.
        if self.avg_w2v == False and self.word_encoder not in ('LSTM', 'Conv1D'):
            raise ValueError("No valid encoder: %r (expected 'LSTM' or 'Conv1D')" % (self.word_encoder,))

        if self.verbose == 1:
            self.print_hyperparameters()

        n_cells = self.num_error * self.num_sites

        if self.avg_w2v == False:

            # One token sequence per (error, site) cell; the grid arrives flattened.
            sent_input = Input(shape = (n_cells, None), dtype='int32')

            # Encode the words of the sentences
            if self.word_encoder == 'LSTM':
                encoder_units = int(self.hp['rnn_units'])
                sent_encoder = TimeDistributed(self.word_encoder_lstm())(sent_input)
            else:
                # The original referenced an undefined `sent_input_reshaped` here.
                encoder_units = int(self.hp['units_conv'])
                sent_encoder = TimeDistributed(self.word_encoder_conv())(sent_input)

            # NOTE: dropout / batch normalisation directly after the encoder was
            # disabled (commented out) in the original code and is not applied.

            # Reshape the error sites matrix
            sent_encoder_reshaped = Reshape(( self.num_error , self.num_sites, encoder_units))(sent_encoder)

        else:

            # One pre-computed vector of length max_sequence_length per cell.
            sent_input = Input(shape = (n_cells, self.max_sequence_length), dtype='float32')
            sent_encoder_reshaped = Reshape(( self.num_error , self.num_sites, self.max_sequence_length))(sent_input)
            sent_encoder_reshaped = TimeDistributed(Dense(int(self.hp['units_site']), activation = self.hp['activation_site'],
                      kernel_regularizer=l2(self.hp['l2_regulizer'])))(sent_encoder_reshaped)
            encoder_units = int(self.hp['units_site'])

        # Add the meta information
        if self.include_counts == True:

            count_input = Input(shape = (self.num_error, self.num_sites, 2, ), dtype='float32')
            # Merge the counts and words along the feature axis
            merged = Concatenate(axis=3)([sent_encoder_reshaped, count_input])
            exit_code_site_repr = Reshape(( self.num_error , self.num_sites * (encoder_units+2)))(merged)
            if self.verbose == 1:
                print( count_input )
                print( merged )
                print( exit_code_site_repr )
        else:
            exit_code_site_repr = sent_encoder_reshaped
            exit_code_site_repr = Reshape(( self.num_error , self.num_sites * (encoder_units)))(exit_code_site_repr)


        # Encode the site. Controlled by the hyperparameter, not by the
        # constructor argument `encode_sites`.
        if int(self.hp['encode_sites']) == True:

            exit_code_encoder = TimeDistributed(Dense(int(self.hp['units_site']), activation = self.hp['activation_site'],
                      kernel_regularizer=l2(self.hp['l2_regulizer'])))(exit_code_site_repr)
        else:
            exit_code_encoder = exit_code_site_repr

        # Flatten
        flattened = Flatten()(exit_code_encoder)

        # Dense
        dense = flattened
        for _ in range(int(self.hp['dense_layers'])):

            dense = Dense( units=int(self.hp['dense_units']), activation='relu',
                          kernel_regularizer=l2(self.hp['l2_regulizer']) )(dense)
            dense = Dropout(self.hp['dropout'])(dense)
            if self.batch_norm == True:
                dense = BatchNormalization()(dense)

        # Output layer
        preds = Dense(1, activation='sigmoid', kernel_regularizer=l2(self.hp['l2_regulizer']) )(dense)

        # Final model
        if self.include_counts == False:
            self.model = Model(sent_input, preds)
        else:
            self.model = Model([sent_input, count_input], preds)
        self.model.compile( loss='binary_crossentropy', optimizer = Adam(lr = self.hp['learning_rate'],
                                                                         decay = self.hp['decay']) )

        if self.verbose == 1:
            self.model.summary()

        return self.model
    

Using TensorFlow backend.


### Setup the keras wrapper

In [7]:
# Grid dimensions of the model: the number of distinct index values, so that
# all entries sharing one index count once.
dim_sites = len(set(sites_index.values()))
dim_errors = len(set(codes_index.values()))
embedding_dim = 400
embedding_matrix = np.load(e['NLP_PARAM']['embedding_matrix_path'])

In [8]:
def build_model(dense_units):
    """Create the compiled NLP Keras model for the current experiment ``e``.

    Args:
        dense_units: value for the ``'dense_units'`` hyperparameter.

    Returns:
        The model returned by ``NLP.create_model()``.

    Reads the module-level ``e``, ``dim_errors``, ``dim_sites``, ``embedding_dim``
    and ``embedding_matrix``.
    """
    settings = e['NLP_PARAM']

    nlp = NLP(2, dim_errors, dim_sites, embedding_dim,
              embedding_matrix = embedding_matrix,
              cudnn = settings['cudnn'],
              batch_norm = settings['batch_norm'],
              word_encoder = settings['word_encoder'],
              include_counts = settings['include_counts'],
              avg_w2v = settings['avg_w2v'],
              attention = settings['attention'])

    nlp.set_hyperparameters({'dense_units': dense_units})
    return nlp.create_model()
    
    

In [9]:
# Build the model once with dense_units = 10; with the default verbose = 1 this
# prints the hyperparameters and the model summary.
model = build_model(10)

W0916 14:28:18.792243 139877145265984 deprecation_wrapper.py:119] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0916 14:28:18.832078 139877145265984 deprecation_wrapper.py:119] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0916 14:28:18.843162 139877145265984 deprecation_wrapper.py:119] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0916 14:28:18.864232 139877145265984 deprecation_wrapper.py:119] From /cvmfs/sft-nightlies.cern.ch/l

Hyperparameter	Corresponding Value
('units_conv', '\t\t', 10)
('rnncud', '\t\t', <class 'keras.layers.cudnn_recurrent.CuDNNLSTM'>)
('l2_regulizer', '\t\t', 0.0001)
('encode_sites', '\t\t', False)
('learning_rate', '\t\t', 0.0001)
('rnn', '\t\t', <class 'keras.layers.recurrent.LSTM'>)
('decay', '\t\t', 0.0)
('dropout', '\t\t', 0.2)
('units_site', '\t\t', 10)
('dense_units', '\t\t', 10)
('max_pooling', '\t\t', 3)
('att_units', '\t\t', 10)
('rec_dropout', '\t\t', 0.0)
('dense_layers', '\t\t', 3)
('filters', '\t\t', 256)
('train_embedding', '\t\t', False)
('conv_layers', '\t\t', 3)
('activation_site', '\t\t', 'relu')
('kernel_size', '\t\t', 3)
('rnn_units', '\t\t', 10)


W0916 14:28:19.296227 139877145265984 deprecation.py:323] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:2974: where (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
W0916 14:28:19.706080 139877145265984 deprecation.py:506] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/backend/tensorflow_backend.py:3445: calling dropout (from tensorflow.python.ops.nn_ops) with keep_prob is deprecated and will be removed in a future version.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


Tensor("input_3:0", shape=(?, 77, 81, 2), dtype=float32)
Tensor("concatenate_1/concat:0", shape=(?, 77, 81, 12), dtype=float32)
Tensor("reshape_2/Reshape:0", shape=(?, 77, 972), dtype=float32)


W0916 14:28:19.856180 139877145265984 deprecation_wrapper.py:119] From /cvmfs/sft-nightlies.cern.ch/lcg/views/dev3/Mon/x86_64-centos7-gcc8-opt/lib/python2.7/site-packages/keras/optimizers.py:790: The name tf.train.Optimizer is deprecated. Please use tf.compat.v1.train.Optimizer instead.



__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 6237, None)   0                                            
__________________________________________________________________________________________________
time_distributed_1 (TimeDistrib (None, 6237, 10)     1536140     input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 77, 81, 10)   0           time_distributed_1[0][0]         
__________________________________________________________________________________________________
input_3 (InputLayer)            (None, 77, 81, 2)    0                                            
__________________________________________________________________________________________________
concatenat

### Batch generator

In [10]:
import pandas as pd
import itertools
import math
import numpy as np
from keras.utils import to_categorical



class InputBatchGenerator(object):
    """Turns rows of the task frame into dense (error x site) input tensors.

    Every row of ``frame`` holds parallel lists in the columns ``'error'``,
    ``'site'``, ``'count'``, ``'site_state'`` and a message column
    (``'msg_encoded'`` for token ids, ``'avg'`` for averaged vectors). The
    generator scatters those entries into zero-initialised arrays indexed by
    the ``codes`` and ``sites`` lookup dicts.
    """

    # Upper bound for the per-batch padding length in token mode.
    MAX_PAD_DIM = 200

    def __init__(self, frame, label, codes, sites, pad_dim, batch_size = 1, max_msg = 5, 
                 averaged = False, sequence = False, only_msg = False, cut_front = True):
        """
        Args:
            frame: DataFrame with the columns described on the class.
            label: name of the label column in ``frame``.
            codes: dict ``error code -> row index``.
            sites: dict ``site -> column index``.
            pad_dim: length of the last axis of the message tensor. In token mode
                (``averaged`` False) ``msg_batch`` recomputes and overwrites it per batch.
            batch_size: number of rows per batch in ``gen_batches``.
            max_msg: messages kept per cell when ``sequence`` is True (forced to 1 otherwise).
            averaged: read the ``'avg'`` column (no padding) instead of ``'msg_encoded'``.
            sequence: store up to ``max_msg`` messages per cell instead of one.
            only_msg: if True, skip the count matrix and return only the message tensor.
            cut_front: when truncating, keep the last ``pad_dim`` entries (True)
                or the first ``pad_dim`` entries (False).
        """
        self.frame = frame
        self.n_tasks = len(frame)
        self.label = label
        self.batch_size = batch_size
        self.codes = codes
        self.sites = sites
        self.pad_dim = pad_dim
        self.max_msg = max_msg if sequence != False else 1
        self.averaged = averaged
        self.sequence = sequence
        self.cut_front = cut_front
        self.only_msg = only_msg
        # Several keys may share one index, so count the distinct index values.
        self.unique_sites = len(set(self.sites.values()))
        self.unique_codes = len(set(self.codes.values()))
        # Set by get_counts_matrix / msg_batch; initialised so to_dense never
        # hits a missing attribute.
        self.only_counts = False


    def pad_along_axis(self, array, axis=0):
        """Zero-pad or truncate ``array`` to length ``self.pad_dim`` along ``axis``.

        Returns:
            A numpy array. Shorter input is padded with zeros at the end; longer
            input is cut to its last (``cut_front`` True) or first ``pad_dim`` entries.
        """
        array = np.array(array)
        pad_size = self.pad_dim - array.shape[axis]

        if pad_size < 0:
            # Truncate along the requested axis (the original always cut axis 0).
            window = [slice(None)] * array.ndim
            if self.cut_front == True:
                window[axis] = slice(-self.pad_dim, None)
            else:
                window[axis] = slice(None, self.pad_dim)
            return array[tuple(window)]

        npad = [(0, 0)] * array.ndim
        npad[axis] = (0, pad_size)

        return np.pad(array, pad_width=npad, mode='constant', constant_values=int(0))


    def fill_counts(self, index, error, site, site_state, count):
        """Add ``count`` to the count matrix cell of (``error``, ``site``).

        The last axis separates the site states: channel 0 for ``'good'``,
        channel 1 for everything else.
        """
        site_state_encoded = 0 if site_state == 'good' else 1

        self.error_site_counts[index, self.codes[error], self.sites[site], site_state_encoded] += count


    def fill_first_message(self, index, error, site, error_message):
        """Store a single message in the token tensor cell of (``error``, ``site``)."""
        # Token ids are padded/truncated; averaged vectors are stored as they are.
        if self.averaged == False:
            error_message = self.pad_along_axis(error_message)
        self.error_site_tokens[index, self.codes[error], self.sites[site]] = error_message


    def fill_messages_sequence(self, index, error, site, error_message_sequence):
        """Store up to ``max_msg`` messages in the cell of (``error``, ``site``)."""
        for counter, error_message in enumerate(error_message_sequence):

            # Stop when maximal message is reached
            if counter == self.max_msg:
                break

            # Pad the error message
            if self.averaged == False:
                error_message = self.pad_along_axis(error_message)

            # Sequence per task, error, site
            self.error_site_tokens[index, self.codes[error], self.sites[site], counter ] = error_message


    def to_dense(self, index_matrix, values):
        """Scatter one frame row into the dense arrays at position ``index_matrix``.

        Args:
            index_matrix: row position inside the current batch arrays.
            values: tuple ``(errors, sites, counts, site_states, error_messages)``
                of parallel sequences.
        """
        errors, sites, counts, site_states, error_messages = values

        # Loop over the codes and sites
        for i_key in range(len(counts)):

            error = errors[i_key]
            site = sites[i_key]

            # Fill the counts
            if self.only_msg == False:
                self.fill_counts(index_matrix, error, site, site_states[i_key], counts[i_key])

            if self.only_counts == True:
                continue

            error_message_sequence = error_messages[i_key]

            # Only continue if there exists a message (missing ones are not lists)
            if not isinstance(error_message_sequence, (list,)):
                continue

            if self.sequence == True:
                self.fill_messages_sequence( index_matrix, error, site, error_message_sequence)
            else:
                self.fill_first_message( index_matrix, error, site, error_message_sequence)


    def get_counts_matrix(self, sum_good_bad = False):
        """Build the count matrix for the whole frame.

        Returns:
            Tuple ``(counts, labels)``. ``counts`` has shape
            ``(n_tasks, unique_codes, unique_sites, 2)``, or without the last
            axis (summed) when ``sum_good_bad`` is True.
        """
        self.only_counts = True

        self.error_site_counts = np.zeros((self.n_tasks, self.unique_codes, self.unique_sites, 2), dtype=np.int32)

        rows = zip(self.frame['error'], self.frame['site'], self.frame['count'],
                   self.frame['site_state'], self.frame['msg_encoded'])
        for counter, values in enumerate(rows):
            self.to_dense(counter, values)

        if sum_good_bad == True:
            return self.error_site_counts.sum(axis=3), self.frame[self.label].values
        else:
            return self.error_site_counts, self.frame[self.label].values


    def msg_batch(self, start_pos, end_pos):
        """Build the input tensors for the frame rows ``start_pos:end_pos``.

        In token mode the padding length is recomputed for each batch as the
        length of the longest list found in the batch's ``'msg_encoded'`` cells
        (at least 1, at most ``MAX_PAD_DIM``) and written to ``self.pad_dim``.

        Returns:
            ``[tokens, counts]`` when ``only_msg`` is False, where ``tokens`` is
            reshaped to ``(rows, unique_codes * unique_sites, pad_dim)``;
            otherwise the un-reshaped token tensor alone.
        """
        self.only_counts = False

        # Batch of frame
        batch = self.frame.iloc[start_pos : end_pos]
        chunk_size = len(batch)

        if self.averaged == False:
            tokens_key = 'msg_encoded'
            token_dtype = np.int32
            longest = 1
            for messages in batch[tokens_key]:
                for msg in messages:
                    if isinstance(msg, (list,)) and len(msg) > longest:
                        longest = len(msg)
            self.pad_dim = min(longest, self.MAX_PAD_DIM)
        else:
            tokens_key = 'avg'
            # Averaged vectors are floats; an integer buffer would truncate them.
            token_dtype = np.float32

        # Error site matrix
        self.error_site_counts = np.zeros((chunk_size, self.unique_codes, self.unique_sites, 2), dtype=np.int32)

        if self.sequence == True:
            dim = (chunk_size, self.unique_codes, self.unique_sites, self.max_msg, self.pad_dim)
        else:
            dim = (chunk_size, self.unique_codes, self.unique_sites, self.pad_dim)

        # Error message matrix
        self.error_site_tokens = np.zeros(dim, dtype=token_dtype)

        rows = zip(batch['error'], batch['site'], batch['count'],
                   batch['site_state'], batch[tokens_key])
        for counter, values in enumerate(rows):
            self.to_dense(counter, values)

        if self.only_msg == False:
            return [self.error_site_tokens.reshape((chunk_size, self.unique_codes * self.unique_sites, self.pad_dim)) , self.error_site_counts]
        else:
            return self.error_site_tokens


    def gen_batches(self):
        """Yield ``(inputs, labels)`` once for every batch of the frame, in order.

        The last batch is shorter when ``n_tasks`` is not a multiple of ``batch_size``.
        """
        for cur_pos in range(0, self.n_tasks, self.batch_size):

            next_pos = min(cur_pos + self.batch_size, self.n_tasks)
            yield (self.msg_batch( cur_pos, next_pos ), self.frame[self.label].iloc[cur_pos : next_pos].values)


    def gen_inf_batches(self):
        """Yield batches forever, restarting from the first row after the last batch.

        An empty frame produces an empty generator instead of looping forever
        without ever yielding.
        """
        if self.n_tasks == 0:
            return

        while True:
            for B in self.gen_batches():
                yield B

In [11]:
from keras.wrappers.scikit_learn import KerasClassifier
class KerasClassifierCustom(KerasClassifier):
    """KerasClassifier variant that trains and predicts through InputBatchGenerator.

    Both `fit` and `predict_proba` hand `x` to `InputBatchGenerator` together
    with the notebook-level `codes_index` and `sites_index`, and feed the model
    one row per batch.
    """

    def fit(self, x, y, **kwargs):
        """Build a fresh model with `build_fn` and train it for one epoch.

        Args:
            x: Data handed to `InputBatchGenerator` as its first argument (the
                notebook passes a pandas DataFrame); the generator is told to
                use the 'label' column as target.
            y: Targets; only used to populate `self.classes_`.
            **kwargs: Accepted for interface compatibility; currently unused.

        Returns:
            The value returned by `self.model.fit_generator`.
        """
        # Only the parameters that `build_fn` accepts are forwarded to it.
        self.model = self.build_fn(**self.filter_sk_params(self.build_fn))

        batch_size = 1
        self.classes_ = np.unique(y)
        steps_per_epoch = int(len(x) / batch_size)

        # NOTE(review): the meaning of the positional 100 is defined by
        # InputBatchGenerator, which is not visible here — confirm.
        generator_train = InputBatchGenerator(x, 'label', codes_index, sites_index,
                                              100, batch_size = batch_size)

        # The generator is endless; `steps_per_epoch` bounds the single epoch.
        history = self.model.fit_generator(generator = generator_train.gen_inf_batches(),
                                           steps_per_epoch = steps_per_epoch,
                                           epochs = 1, workers=0)
        return history

    def predict_proba(self, x):
        """Return class probabilities for every row of `x`.

        Args:
            x: Data handed to `InputBatchGenerator`; it must still provide the
                'label' column the generator is configured with.

        Returns:
            numpy.ndarray: `[1 - p, p]` stacked along axis 1, where `p` is the
            concatenated model output. When the model emits a single column the
            result has shape (n_samples, 2).
        """
        generator_test = InputBatchGenerator(x, 'label', codes_index, sites_index,
                                             100, batch_size = 1)

        # Labels produced by the generator are not needed for prediction.
        y_pred_batches = [np.asarray(self.model.predict(X))
                          for X, _ in generator_test.gen_batches()]
        y_pred = np.concatenate(y_pred_batches)

        # A single-unit output arrives as an (n, 1) column; flatten it so the
        # result is (n, 2) rather than (n, 2, 1).
        if y_pred.ndim == 2 and y_pred.shape[1] == 1:
            y_pred = y_pred[:, 0]

        return np.stack([1 - y_pred, y_pred], axis=1)
    

### GridSearch

In [12]:
from sklearn.model_selection import GridSearchCV

In [21]:
# Estimator under test: the custom Keras wrapper around `build_model`.
clf = KerasClassifierCustom(build_fn=build_model)

In [22]:
# Small class-balanced sample: `n_samples` rows drawn from each label value,
# minority rows first in the concatenated frame.
n_samples = 20
minority_class_sampled = actionshist.loc[actionshist['label'] == 1].sample(n=n_samples, random_state=42)
majority_class_sampled = actionshist.loc[actionshist['label'] == 0].sample(n=n_samples, random_state=42)
t = pd.concat([minority_class_sampled, majority_class_sampled])

In [23]:
# Shuffle the rows so the two classes are interleaved; the fixed seed keeps the
# row order (and therefore the cross-validation folds) identical across runs.
t = t.sample(frac=1, random_state=42)

In [176]:
# Grid search over `dense_units` with 2-fold CV, scored by negative log loss.
# NOTE(review): `test` is not defined in any cell visible here (the estimator
# built in this section is `clf`) — confirm which object was intended.
# NOTE(review): this call matches the scikit-learn `GridSearchCV` imported just
# above; a later cell rebinds the same name to spark_sklearn's class, so the
# cell depends on execution order.
validator = GridSearchCV(test,
                         param_grid={'dense_units': [3, 6]},
                         scoring='neg_log_loss',
                         n_jobs=1, cv=2)
validator.fit(t, t['label'])

Hyperparameter	Corresponding Value
('units_conv', '\t\t', 10)
('rnncud', '\t\t', <class 'keras.layers.cudnn_recurrent.CuDNNLSTM'>)
('l2_regulizer', '\t\t', 0.0001)
('encode_sites', '\t\t', False)
('learning_rate', '\t\t', 0.0001)
('rnn', '\t\t', <class 'keras.layers.recurrent.LSTM'>)
('decay', '\t\t', 0.0)
('dropout', '\t\t', 0.2)
('units_site', '\t\t', 10)
('dense_units', '\t\t', 3)
('max_pooling', '\t\t', 3)
('att_units', '\t\t', 10)
('rec_dropout', '\t\t', 0.0)
('dense_layers', '\t\t', 3)
('filters', '\t\t', 256)
('train_embedding', '\t\t', False)
('conv_layers', '\t\t', 3)
('activation_site', '\t\t', 'relu')
('kernel_size', '\t\t', 3)
('rnn_units', '\t\t', 10)
Tensor("input_89:0", shape=(?, 77, 81, 2), dtype=float32)
Tensor("concatenate_29/concat:0", shape=(?, 77, 81, 12), dtype=float32)
Tensor("reshape_58/Reshape:0", shape=(?, 77, 972), dtype=float32)
__________________________________________________________________________________________________
Layer (type)                    

KeyboardInterrupt: 

In [24]:
from spark_sklearn import GridSearchCV

In [25]:
from sklearn.metrics import roc_auc_score

def scorer(estimator, X, y):
    """ROC AUC scorer with the `scorer(estimator, X, y)` call signature.

    Args:
        estimator: Fitted estimator exposing `predict_proba`.
        X: Data to score; also the source of the labels (`X['label']`) when
            `y` is None.
        y: True labels for `X`.

    Returns:
        The ROC AUC of the second `predict_proba` column against the labels.
    """
    # Score against the labels supplied by the caller; earlier this argument
    # was ignored and the labels were always read from X['label'].
    y_true = X['label'] if y is None else y

    y_pred = estimator.predict_proba(X)[:, 1]

    return roc_auc_score(y_true, y_pred)
    

In [26]:
# Grid search over `dense_units` with spark_sklearn's GridSearchCV and the
# custom ROC AUC scorer.
# NOTE(review): `sc` is not defined in the cells visible here — presumably the
# SparkContext provided by the notebook environment; confirm.
grid = GridSearchCV(sc, estimator=clf, param_grid={'dense_units': [3, 6]}, scoring=scorer, verbose=1)
# `t` is passed whole as X, so it still carries the 'label' column that is also
# passed as y.
gridSearch_result = grid.fit(t, t['label'])

Fitting 3 folds for each of 2 candidates, totalling 6 fits
Hyperparameter	Corresponding Value
('units_conv', '\t\t', 10)
('rnncud', '\t\t', <class 'keras.layers.cudnn_recurrent.CuDNNLSTM'>)
('l2_regulizer', '\t\t', 0.0001)
('encode_sites', '\t\t', False)
('learning_rate', '\t\t', 0.0001)
('rnn', '\t\t', <class 'keras.layers.recurrent.LSTM'>)
('decay', '\t\t', 0.0)
('dropout', '\t\t', 0.2)
('units_site', '\t\t', 10)
('dense_units', '\t\t', 3)
('max_pooling', '\t\t', 3)
('att_units', '\t\t', 10)
('rec_dropout', '\t\t', 0.0)
('dense_layers', '\t\t', 3)
('filters', '\t\t', 256)
('train_embedding', '\t\t', False)
('conv_layers', '\t\t', 3)
('activation_site', '\t\t', 'relu')
('kernel_size', '\t\t', 3)
('rnn_units', '\t\t', 10)
Tensor("input_6:0", shape=(?, 77, 81, 2), dtype=float32)
Tensor("concatenate_2/concat:0", shape=(?, 77, 81, 12), dtype=float32)
Tensor("reshape_4/Reshape:0", shape=(?, 77, 972), dtype=float32)
___________________________________________________________________________

In [197]:
# Parameters of the best estimator selected by the grid search.
# NOTE(review): the recorded output (dense_units = 6) disagrees with the results
# table further down, where dense_units = 3 has rank 1, and the execution count
# is out of sequence — the output looks stale; re-run to confirm.
gridSearch_result.best_estimator_.get_params()

{'build_fn': <function __main__.build_model>, 'dense_units': 6}

In [27]:
# List, in alphabetical order, the fields recorded in `cv_results_`.
sorted(gridSearch_result.cv_results_.keys())

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'mean_train_score',
 'param_dense_units',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split0_train_score',
 'split1_test_score',
 'split1_train_score',
 'split2_test_score',
 'split2_train_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score',
 'std_train_score']

In [28]:
# Tabulate the cross-validation results as a DataFrame.
res = pd.DataFrame(gridSearch_result.cv_results_)

In [29]:
# Preview the first rows of the results table.
res.head()

Unnamed: 0,mean_fit_time,mean_score_time,mean_test_score,mean_train_score,param_dense_units,params,rank_test_score,split0_test_score,split0_train_score,split1_test_score,split1_train_score,split2_test_score,split2_train_score,std_fit_time,std_score_time,std_test_score,std_train_score
0,43.743652,7.659284,0.473393,0.535689,3,{u'dense_units': 3},1,0.377551,0.47929,0.45,0.530556,0.6,0.597222,1.365076,0.684214,0.092749,0.048282
1,45.066958,7.909493,0.350536,0.5389,6,{u'dense_units': 6},2,0.397959,0.727811,0.325,0.427778,0.325,0.461111,1.475866,0.623274,0.034799,0.134271
