In [166]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.layers.merge import concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils

from nltk.corpus import reuters
from random import shuffle
from tqdm import tqdm_notebook

In [189]:
import pickle

In [211]:
embeddings_index = {}
with open('dataset/glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs
    f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [214]:
def make_data(params):
    """Load reuters datasets, categories, and preprocesses the texts"""
    dataset_filename = "dataset-TESTSPLIT_{}_NB_WORDS_{}_MAX_SEQ_LENGTH_{}.pkl".format(
        params['TEST_SPLIT'],
        params['MAX_NB_WORDS'],
        params['MAX_SEQUENCE_LENGTH']
    )

    #print("Searching for dataset file {}".format(dataset_filename))
    #try:
    #    file_content = pickle.load(open(dataset_filename, "rb"))
    #except (OSError, IOError):
    #    print("File not found")
    #else:
    #    print("File found")
    #    return file_content

    nltk.download('stopwords')
    nltk.download('reuters')
    nltk.download('punkt')

    categories_to_idx = {c: i for i, c in enumerate(reuters.categories())}

    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer('english')
    dataset = []
    for category in reuters.categories():
        for file_id in reuters.fileids(category):
            txt = reuters.raw(file_id)
            txt_lower = txt.lower()
            list_of_words = [stemmer.stem(w) for w in word_tokenize(txt_lower) if w not in stop_words]
            dataset.append((" ".join(list_of_words), categories_to_idx[category]))

    # Shuffle the dataset
    shuffle(dataset)

    # Make train and test sets
    test_limit = int(len(dataset)*params['TEST_SPLIT'])
    test_set = dataset[:test_limit]
    train_set = dataset[test_limit:]

    # Fit the tokenizer on train texts
    tokenizer = Tokenizer(num_words=params['MAX_NB_WORDS'])
    tokenizer.fit_on_texts((txt for txt, category in dataset))

    # Convert them to indices and truncate them if they are too large
    train_sequences = pad_sequences(
        sequences=tokenizer.texts_to_sequences((txt for txt, category in train_set)),
        maxlen=params['MAX_SEQUENCE_LENGTH'])
    test_sequences = pad_sequences(
        sequences=tokenizer.texts_to_sequences((txt for txt, category in test_set)),
        maxlen=params['MAX_SEQUENCE_LENGTH'])
    train_categories = [category for txt, category in train_set]
    test_categories = [category for txt, category in test_set]
    pickle.dump((train_sequences, train_categories, test_sequences, test_categories), open(dataset_filename, "wb"))

    return train_sequences, train_categories, test_sequences, test_categories, tokenizer.word_index

In [215]:
train_sequences, train_categories, test_sequences, test_categories, tokenizer = make_data(params)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/perceval/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package reuters to
[nltk_data]     /Users/perceval/nltk_data...
[nltk_data]   Package reuters is already up-to-date!
[nltk_data] Downloading package punkt to /Users/perceval/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [220]:
word_index = tokenizer.word_index
EMBEDDING_DIM = 100

In [221]:
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector

In [185]:
def make_model(params):
    """Builds the model"""
    embedding_layer = Embedding(
        input_dim=params['MAX_NB_WORDS'],
        output_dim=params['EMBEDDING_DIM'],
        # weights=[embedding_matrix],
        input_length=params['MAX_SEQUENCE_LENGTH'])
    lstm_layer = LSTM(params['NUM_LSTM'], dropout=params['RATE_DROP_LSTM'], recurrent_dropout=params['RATE_DROP_LSTM'])

    sequence_1_input = Input(shape=(params['MAX_SEQUENCE_LENGTH'],), dtype='int32')
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    x1 = lstm_layer(embedded_sequences_1)

    # sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    # embedded_sequences_2 = embedding_layer(sequence_2_input)
    # y1 = lstm_layer(embedded_sequences_2)
    #
    # merged = concatenate([x1, y1])
    merged = Dropout(params['RATE_DROP_DENSE'])(x1)  # (merged)
    merged = BatchNormalization()(merged)

    merged = Dense(params['NUM_DENSE'], activation=params['ACTIVATION_FN'])(merged)
    merged = Dropout(params['RATE_DROP_DENSE'])(merged)
    merged = BatchNormalization()(merged)

    preds = Dense(params['NUM_CLASSES'], activation='softmax')(merged)

    model = Model(inputs=[sequence_1_input], outputs=preds)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='nadam',
        metrics=['acc']
    )
    model.summary()
    return model
model = make_model(params)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_6 (Embedding)      (None, 300, 25)           250000    
_________________________________________________________________
lstm_6 (LSTM)                (None, 200)               180800    
_________________________________________________________________
dropout_11 (Dropout)         (None, 200)               0         
_________________________________________________________________
batch_normalization_11 (Batc (None, 200)               800       
_________________________________________________________________
dense_11 (Dense)             (None, 120)               24120     
_________________________________________________________________
dropout_12 (Dropout)         (None, 120)               0         
__________

In [176]:
params_dict = {
    "NUM_CLASSES": len(reuters.categories()),
    "MAX_SEQUENCE_LENGTH": 300,
    "MAX_NB_WORDS": 10000,
    "EMBEDDING_DIM": 25,
    "VALIDATION_SPLIT": 0.1,
    "TEST_SPLIT": 0.2,
    "NUM_LSTM": 200,
    "NUM_DENSE": 120,
    "RATE_DROP_LSTM": 0.2,
    "RATE_DROP_DENSE": 0.2,
    "ACTIVATION_FN": 'relu',
    "PATIENCE": 3,
    "EPOCHS": 100,
    "BATCH_SIZE": 64,
}

In [183]:
print("{{\n{}\n}}".format("\n".join("  {}: {},".format(k, repr(v)) for k, v in params.items())))

{
  NUM_LSTM: 200,
  TEST_SPLIT: 0.2,
  MAX_NB_WORDS: 10000,
  VALIDATION_SPLIT: 0.1,
  NUM_CLASSES: 90,
  MAX_SEQUENCE_LENGTH: 300,
  ACTIVATION_FN: 'relu',
  EMBEDDING_DIM: 25,
  RATE_DROP_DENSE: 0.2,
  RATE_DROP_LSTM: 0.2,
  NUM_DENSE: 120,
}


In [159]:
STAMP = 'lstm_%d_%d_%.2f_%.2f'%(num_lstm, num_dense, rate_drop_lstm, rate_drop_dense)
print(STAMP)

lstm_200_120_0.20_0.20


In [164]:
train_categories.shape

(10662,)

In [None]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
bst_model_path = STAMP + '.h5'
model_checkpoint = ModelCheckpoint(bst_model_path, save_best_only=True, save_weights_only=True)

hist = model.fit(
    x=train_sequences_padded,
    y=np_utils.to_categorical(train_categories),
    validation_split=validation_split,
    epochs=100,
    batch_size=64,
    shuffle=True,
    #class_weight=class_weight,
    callbacks=[early_stopping, model_checkpoint])

In [169]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt

In [186]:
model.load_weights('model-20171211-230432.h5')

In [202]:
np.array(test_categories).shape

(2665,)



[2.7894735943756674, 0.30881801129058423]

In [209]:
from sklearn.utils import class_weight
class_weight = class_weight.compute_class_weight('balanced', np.unique(train_categories), train_categories)
#model.fit(X_train, y_train, class_weight=class_weight)

In [210]:
class_weight

array([  6.23895618e-02,   2.57560386e+00,   2.96194444e+00,
         1.44485095e+00,   2.19403292e+00,   1.18477778e+02,
         2.07855750e+00,   2.96194444e+01,   2.96194444e+01,
         1.17304730e+00,   2.15414141e+00,   5.92388889e+01,
         6.13874496e-01,   2.52080378e+00,   3.94925926e+01,
         1.39385621e+00,   3.94925926e+01,   2.54244158e-01,
         3.94925926e+01,   8.84162521e-01,   1.18477778e+01,
         3.76478480e-02,   5.64179894e+00,   2.63283951e+00,
         1.07707071e+00,   1.11771488e+00,   2.60390720e-01,
         1.69253968e+01,   1.18477778e+02,   6.58209877e+00,
         7.40486111e+00,   9.11367521e+00,   1.07707071e+01,
         2.96194444e+01,   3.07734488e-01,   2.63283951e+00,
         2.69267677e+00,   2.36955556e+01,   1.97462963e+00,
         2.36955556e+01,   5.64179894e+00,   9.87314815e+00,
         5.92388889e+01,   1.44485095e+00,   9.87314815e+00,
         3.29104938e+00,   2.00809793e-01,   8.40267928e-01,
         2.36955556e+01,

In [208]:
{c: len(reuters.fileids(c)) for c in reuters.categories()}

{'acq': 2369,
 'alum': 58,
 'barley': 51,
 'bop': 105,
 'carcass': 68,
 'castor-oil': 2,
 'cocoa': 73,
 'coconut': 6,
 'coconut-oil': 7,
 'coffee': 139,
 'copper': 65,
 'copra-cake': 3,
 'corn': 237,
 'cotton': 59,
 'cotton-oil': 3,
 'cpi': 97,
 'cpu': 4,
 'crude': 578,
 'dfl': 3,
 'dlr': 175,
 'dmk': 14,
 'earn': 3964,
 'fuel': 23,
 'gas': 54,
 'gnp': 136,
 'gold': 124,
 'grain': 582,
 'groundnut': 9,
 'groundnut-oil': 2,
 'heat': 19,
 'hog': 22,
 'housing': 20,
 'income': 16,
 'instal-debt': 6,
 'interest': 478,
 'ipi': 53,
 'iron-steel': 54,
 'jet': 5,
 'jobs': 67,
 'l-cattle': 8,
 'lead': 29,
 'lei': 15,
 'lin-oil': 2,
 'livestock': 99,
 'lumber': 16,
 'meal-feed': 49,
 'money-fx': 717,
 'money-supply': 174,
 'naphtha': 6,
 'nat-gas': 105,
 'nickel': 9,
 'nkr': 3,
 'nzdlr': 4,
 'oat': 14,
 'oilseed': 171,
 'orange': 27,
 'palladium': 3,
 'palm-oil': 40,
 'palmkernel': 3,
 'pet-chem': 32,
 'platinum': 12,
 'potato': 6,
 'propane': 6,
 'rand': 3,
 'rape-oil': 8,
 'rapeseed': 27,
 're