# Import standard packages

In [1]:
import itertools
import sys
import os
import time
import datetime
import pickle as pkl

import numpy as np

from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold
from keras.preprocessing import sequence

Using TensorFlow backend.


# Import custom packages

In [2]:
try:
    from text_classification import load_data, preprocess_sequences, extract_features
    from text_classification.model import Hybrid_RNN, max_f1
except ImportError as e:
    print('Please install "text_classification" package (`pip install text_classification` in root directory)')
    raise e

# Preprocess sentences

In [9]:
# Run the sequence preprocessing step on the files under 'Data'.
# NOTE(review): 133 is the same value as the `max_len` constant defined in the
# "Define max_len and max_words" cell further down; keep the two in sync.
preprocess_sequences.main(data_dir='Data', max_len=133)

# Extract additional features

## Load data

* with raw sequences
* without features, since they are not created yet

In [8]:
# Both splits are loaded with the same options: raw (unprocessed) sequences and
# no feature columns, because the features have not been extracted yet.
raw_load_options = dict(data_dir='Data', features=False, processed_sequences=False)
train = load_data.load_train(**raw_load_options)
test = load_data.load_test(**raw_load_options)

## Extract features

In [9]:
# Directory passed to extract_features.main as its third argument.
# NOTE(review): the name `output_dir` is reassigned to a 'results/...' folder in
# the "Train models" cell later in this notebook.
output_dir = 'Data'
# presumably writes the extracted feature files for train and test under
# `output_dir` — TODO confirm against the text_classification package
extract_features.main(train, test, output_dir, max_features=20000, n_components=100)

  return self.partial_fit(X, y)
  train_features = pd.DataFrame(scaler.transform(train_features), columns=train_features.columns)
  test_features = pd.DataFrame(scaler.transform(test_features), columns=test_features.columns)


# Load preprocessed data

In [3]:
# Reload both splits, this time with the extracted features and the processed
# sequences. Each loader returns a (data, feature_cols) pair; the value returned
# by load_test is the one `feature_cols` ends up bound to.
processed_load_options = dict(data_dir='Data', features=True, processed_sequences=True)
train, feature_cols = load_data.load_train(**processed_load_options)
test, feature_cols = load_data.load_test(**processed_load_options)

# Global variable and functions definition

## Define class weights for binary cross entropy loss function

In [4]:
# Weight of class 1 = (number of class-0 samples) / (number of class-1 samples)
# in Train; class 0 keeps a weight of 1.
label_counts = np.bincount(train.labels.values)
class1_weight = label_counts[0] * 1.0 / label_counts[1]
class_weight = {0: 1, 1: class1_weight}

## Define max_len and max_words

In [5]:
# NOTE(review): 133 is also the `max_len` passed to preprocess_sequences.main in
# the "Preprocess sentences" cell; keep the two values in sync.
# Both constants are handed to Hybrid_RNN, and max_len to pad_sequences, below.
max_len = 133       # Maximum length of sequences in Train and Test
max_words = 30432   # Total number of tokens in Train and Test

# Select best hyperparameters

## Define dummy test set

Here we take an artificial test set from the Train data (10%) in order to assess the performance of the models on a dataset that is used neither for training nor for early-stopping validation.

Stratified Shuffle Split preserves the label distribution of the Train data in both parts of the split.

In [6]:
# One stratified 90% / 10% split of Train; the 10% part is the dummy test set.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=0)

# With n_splits=1 the splitter yields exactly one (train, test) index pair.
train_index, test_index = next(sss.split(train, train.labels))
df_train = train.iloc[train_index]
df_test = train.iloc[test_index]

## Define hyperparameter search space

`search_space` is a list of dictionaries, one for each possible combination of the hyperparameter values defined in `hyperparams`.

In [7]:
# Candidate values for each tunable hyperparameter.
hyperparams = {
    'learning_rate': [1e-3, 1e-4],
    'embedding_size': [64, 128, 256],
    'dense_size_features': [128, 256, 512],
    'dense_size_concat': [64, 128, 256],
    'add_dense': [None, 16, 32, 64],
    'dropout_rate': [0.3, 0.4, 0.5],
    'lstm_size': [32, 64],
}

# Parallel tuples: hyperparameter names and their lists of candidate values.
keys = tuple(hyperparams)
values = tuple(hyperparams.values())

# Full grid: one dict per element of the Cartesian product of the candidates.
search_space = [dict(zip(keys, combination)) for combination in itertools.product(*values)]

## Define cross validation split

In [8]:
# 5-fold stratified cross-validation splitter used for the hyperparameter search
# (no shuffle / random_state argument is passed).
skf = StratifiedKFold(n_splits=5)
# Last expression of the cell: displays the number of folds.
skf.get_n_splits(df_train, df_train.labels)

5

## Train models

For each sampled element of `search_space`, 5-fold stratified cross-validation is performed, with early stopping on the maximum validation F1-score.

The five trained models are then stacked by averaging their scores, and performance is assessed on the artificial test set using the F1-score. The threshold used to compute the test F1-score is the average of the thresholds that maximize the F1-score on the 5 validation sets.

In [9]:
def _timestamp():
    """Return the current local time formatted as 'YYYY-MM-DD HH:MM:SS'."""
    return datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S')


results = []

# Define output_dir: one folder per run, named after the start time
output_dir = 'results/' + _timestamp()
os.makedirs(output_dir)

# Number of random samples taken in `search_space`
no_it = 1

# Cap on the number of rows kept from each training / validation fold.
# Set to None to train and validate on the complete folds.
max_rows_per_fold = 1000

# Keys under which per-fold results are stored. They are derived from the
# splitter so the stacking step below always matches the folds actually trained.
fold_names = ['Fold ' + str(k + 1) for k in range(skf.get_n_splits())]

# The dummy test set is the same for every model and every fold, so pad it and
# read its labels once instead of once per fold.
sequences_test_matrix = sequence.pad_sequences(df_test['sequence'], maxlen=max_len, value=2)
y_true_test = df_test.labels

# zip() with range(no_it) stops after `no_it` randomly ordered elements of search_space
for hyperparameters_dict, it in zip(np.random.permutation(search_space), range(no_it)):

    print('=====')
    print('=====')
    print('Training on hyper dict no ' + str(it + 1))
    print('=====')
    print(str(hyperparameters_dict))
    print('=====')

    result_dict = {}
    result_dict['hyperparameters'] = hyperparameters_dict

    # Iterate on folds for cross validation
    for i, (train_index, test_index) in enumerate(skf.split(df_train, df_train.labels)):

        fold = fold_names[i]
        print('========')
        print(fold)
        print('========')

        cross_val_dict = {}

        # Define model (a fresh network for every fold)
        hrnn = Hybrid_RNN(max_words=max_words, max_len=max_len)
        hrnn.init_hyperparams(hyperparameters_dict)
        hrnn.init_network(feature_cols)

        # Split, keeping at most `max_rows_per_fold` rows on each side
        small_train = df_train.iloc[train_index].iloc[:max_rows_per_fold]
        small_valid = df_train.iloc[test_index].iloc[:max_rows_per_fold]

        # Pad sequences
        sequences_train_matrix = sequence.pad_sequences(small_train['sequence'], maxlen=max_len, value=2)
        sequences_valid_matrix = sequence.pad_sequences(small_valid['sequence'], maxlen=max_len, value=2)

        # Train model on small_train, small_valid
        hrnn.train(sequences_train_matrix, small_train[feature_cols], small_train.labels,
                   sequences_valid_matrix, small_valid[feature_cols], small_valid.labels,
                   class_weight=class_weight,
                   epochs=10,
                   patience=2)

        # Score model on small_valid and df_test, select best threshold
        ## Predict
        y_pred_valid = hrnn.model.predict([sequences_valid_matrix, small_valid[feature_cols]]).flatten()
        y_pred_test = hrnn.model.predict([sequences_test_matrix, df_test[feature_cols]]).flatten()

        cross_val_dict['y_pred_test'] = y_pred_test

        y_true_valid = small_valid.labels

        ## Compute F1-score at threshold 0.5 for valid, test
        cross_val_dict['f1_05_valid'] = f1_score(y_true_valid, (y_pred_valid >= 0.5))
        cross_val_dict['f1_05_test'] = f1_score(y_true_test, (y_pred_test >= 0.5))

        ## Compute max F1-score on valid, and corresponding threshold
        cross_val_dict['best_f1_valid'], cross_val_dict['best_thresh_valid'] = max_f1(y_true_valid, y_pred_valid)

        ## Compute F1-score on test for the threshold that maximizes F1-score on valid
        cross_val_dict['f1_thresh_test'] = f1_score(y_true_test, (y_pred_test >= cross_val_dict['best_thresh_valid']))

        result_dict[fold] = cross_val_dict

        # Save result_dict after every fold so an interrupted run keeps its partial results
        with open(output_dir + '/result_dict_it_' + str(it) + '_Fold_' + str(i) + '_' + _timestamp() + '.pkl', 'wb') as f:
            pkl.dump(result_dict, f)

    results.append(result_dict)

for result_dict in results:
    # Stack models
    fold_results = [result_dict[fold] for fold in fold_names]

    ## Compute average predictions on test over folds
    result_dict['stacked_y_pred'] = np.mean(np.array([fold_result['y_pred_test'] for fold_result in fold_results]), axis=0)

    ## Compute average of the best validation thresholds over folds
    result_dict['stacked_thresh_valid'] = np.mean([fold_result['best_thresh_valid'] for fold_result in fold_results])

    ## Compute F1-score at threshold 0.5 on stacked predictions on test
    result_dict['f1_05_test_stacking'] = f1_score(y_true_test, (result_dict['stacked_y_pred'] >= 0.5))

    ## Compute F1-score at average best threshold on stacked predictions on test
    result_dict['f1_thresh_test_stacking'] = f1_score(y_true_test, (result_dict['stacked_y_pred'] >= result_dict['stacked_thresh_valid']))

with open(output_dir + '/all_result_list_' + _timestamp() + '.pkl', 'wb') as f:
    pkl.dump(results, f)

=====
=====
Training on hyper dict no 1
=====
{'dense_size_features': 256, 'lstm_size': 32, 'add_dense': None, 'embedding_size': 256, 'dense_size_concat': 64, 'learning_rate': 0.0001, 'dropout_rate': 0.5}
=====
Fold 1
Train on 1000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Restoring model weights from the end of the best epoch
Epoch 00005: early stopping
Fold 2
Train on 1000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Restoring model weights from the end of the best epoch
Epoch 00006: early stopping
Fold 3
Train on 1000 samples, validate on 1000 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20

KeyboardInterrupt: 

# Train final models

In [None]:
# 5-fold stratified cross-validation splitter over the full Train set, used to
# train the final models (rebinds the `skf` used for the hyperparameter search).
# NOTE(review): the "Stack models over folds" cell iterates over range(5);
# keep n_splits in sync with it.
skf = StratifiedKFold(n_splits=5)
skf.get_n_splits(train, train.labels)

## Define best hyperparameters

In [11]:
# Hyperparameters passed to Hybrid_RNN.init_hyperparams for the final models.
# NOTE(review): 'val_metric' is not one of the keys of the `hyperparams` search
# grid — presumably it is read by Hybrid_RNN.init_hyperparams; confirm.
best_hyperparameters = dict(
    add_dense=32,
    dense_size_features=128,
    dense_size_concat=256,
    lstm_size=64,
    learning_rate=0.0001,
    val_metric='val_acc',
    embedding_size=256,
    dropout_rate=0.5,
)

## Train

In [13]:
# Per-fold outputs of the final models, keyed 'Fold 1', 'Fold 2', ...
result_dict = {}

# The real Test set is the same for every fold: pad it once, outside the loop.
sequences_test_matrix = sequence.pad_sequences(test['sequence'], maxlen=max_len, value=2)

for i, (train_index, test_index) in enumerate(skf.split(train, train.labels)):

    train_, valid_ = train.iloc[train_index], train.iloc[test_index]

    fold = 'Fold ' + str(i + 1)
    print('========')
    print(fold)
    print('========')

    cross_val_dict = {}

    # Define model (a fresh network for every fold)
    hrnn = Hybrid_RNN(max_words=max_words, max_len=max_len)
    hrnn.init_hyperparams(best_hyperparameters)
    hrnn.init_network(feature_cols)

    # Pad sequences
    sequences_train_matrix = sequence.pad_sequences(train_['sequence'], maxlen=max_len, value=2)
    sequences_valid_matrix = sequence.pad_sequences(valid_['sequence'], maxlen=max_len, value=2)

    # Train model on train, valid
    hrnn.train(sequences_train_matrix, train_[feature_cols], train_.labels,
               sequences_valid_matrix, valid_[feature_cols], valid_.labels,
               class_weight=class_weight,
               epochs=20,
               patience=2)

    # Score model on valid and test.
    # flatten() gives 1-D score vectors, as in the hyperparameter search cell.
    y_pred_valid = hrnn.model.predict([sequences_valid_matrix, valid_[feature_cols]]).flatten()
    y_pred_test = hrnn.model.predict([sequences_test_matrix, test[feature_cols]]).flatten()

    # Save scores
    cross_val_dict['y_pred_test'] = y_pred_test

    y_true_valid = valid_.labels

    # Validation metrics are F1-based, like the hyperparameter search: f1_score
    # and max_f1 are the metric helpers imported at the top of this notebook.
    ## F1-score on valid at threshold 0.5
    cross_val_dict['f1_05_valid'] = f1_score(y_true_valid, (y_pred_valid >= 0.5))

    ## Max F1-score on valid, and the threshold that reaches it
    cross_val_dict['best_f1_valid'], cross_val_dict['best_thresh_valid'] = max_f1(y_true_valid, y_pred_valid)

    result_dict[fold] = cross_val_dict

Fold 1
Train on 36743 samples, validate on 9187 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold 2
Train on 36743 samples, validate on 9187 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold 3
Train on 36744 samples, validate on 9186 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold 4
Train on 36745 samples, validate on 9185 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Fold 5
Train on 36745 samples, validate on 9185 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Stack models over folds

In [14]:
# Stack models
result_dict['stacked_y_pred'] = np.mean(np.array([result_dict[fold]['y_pred_test'] for fold in ['Fold ' + str(i + 1) for i in range(5)]]), axis=0)
result_dict['stacked_thresh_valid'] = np.mean([result_dict[fold]['best_thresh_valid'] for fold in ['Fold ' + str(i + 1) for i in range(5)]])

## Predict on Test data

In [15]:
# Score on real test
predictions = result_dict['stacked_y_pred'] >= result_dict['stacked_thresh_valid']

# Map binary labels for real prediction format
map_label = {0: 'C', 1: 'M'}
mapped_predictions = [map_label[int(pred)] for pred in predictions]

## Save predictions

In [16]:
# Dump results
with open('Data/Test/predicted_labels.pkl', 'wb') as f:
        pkl.dump(mapped_predictions, f)