In [177]:
########################################
## import packages
########################################
import os
import re
import csv
import codecs
import numpy as np
import pandas as pd

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from string import punctuation

from gensim.models import KeyedVectors
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Embedding, Dropout, Activation
from keras.models import Model
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.utils import np_utils

from nltk.corpus import reuters
import nltk
from random import shuffle
from tqdm import tqdm_notebook

import pickle
from sklearn.preprocessing import MultiLabelBinarizer
import os
import pickle
import time
import warnings
from random import shuffle

import numpy as np
import sklearn.exceptions
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.layers import Dense, Input, LSTM, Embedding, Dropout
from keras.models import Model
from keras.optimizers import Nadam
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from nltk import word_tokenize, download as download_dataset
from nltk.corpus import stopwords, reuters
from nltk.stem import SnowballStemmer
from sklearn.metrics import log_loss, accuracy_score, precision_recall_fscore_support
from sklearn.preprocessing import MultiLabelBinarizer

from metrics import *

import warnings
import sklearn.exceptions

In [355]:
from sklearn.metrics import log_loss, accuracy_score, precision_recall_fscore_support

In [235]:
# Reverse lookup table: token index -> word, with index 0 mapped to '<unk>'.
# NOTE(review): `tokenizer` is not defined at notebook level in the cells
# shown here (make_data only returns its word_index) -- confirm where it
# comes from before relying on Restart & Run All.
idx2w = dict((index, word) for word, index in tokenizer.word_index.items())
idx2w[0] = '<unk>'

In [211]:
# Parse the GloVe text file into a {word: float32 vector} dictionary.
# Each line holds a token followed by its whitespace-separated coefficients.
embeddings_index = {}
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default encoding; the `with` block closes the file on exit.
with open('dataset/glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [221]:
# One row per tokenizer index (plus row 0); rows of words that have no
# pretrained vector in `embeddings_index` stay all-zeros.
# NOTE(review): `EMBEDDING_DIM` is not defined at notebook level in the cells
# shown here (only params['EMBEDDING_DIM'] is) -- confirm.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, row in word_index.items():
    pretrained = embeddings_index.get(word)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

In [178]:
# Hyper-parameters and settings consumed by make_data / make_model.
# The 'metrics_*' entries are stored scores, presumably from an earlier
# evaluation run of this configuration -- TODO confirm their origin.
params = {
  'ACTIVATION_FN': 'relu',
  'BATCH_SIZE': 64,
  'EMBEDDING_DIM': 100,
  'EPOCHS': 100,
  'LR': 0.001,
  'MAX_NB_WORDS': 10000,
  'MAX_SEQUENCE_LENGTH': 300,
  'NUM_CLASSES': 90,
  'NUM_DENSE': 200,
  'NUM_LSTM': 300,
  'PATIENCE': 2,
  'RATE_DROP_DENSE': 0.2,
  'RATE_DROP_LSTM': 0.2,
  # make_data reads this key to build the cache file name; 0.2 matches the
  # cached dataset file name printed by the make_data cell.
  'TEST_SPLIT': 0.2,
  'VALIDATION_SPLIT': 0.2,
  'metrics_acc': 0.77012255713812516,
  'metrics_f1': 0.77711240909775625,
  'metrics_loss': 7.148816829092576,
  'metrics_prec': 0.81774655224802373,
  'metrics_rec': 0.76228632478632474,
}

In [169]:
def make_data(params):
    """Load the Reuters corpus, preprocess the texts and cache the result.

    The preprocessed data is looked up in a pickle cache under ``dataset/``;
    when no usable cache exists the corpus is downloaded through NLTK,
    lower-cased, tokenized, stripped of English stop words, stemmed, turned
    into padded index sequences and finally written to the cache.

    Args:
        params: dict providing 'MAX_NB_WORDS' and 'MAX_SEQUENCE_LENGTH'.
            'TEST_SPLIT' is optional and only used in the cache file name
            (default 0.2).

    Returns:
        A 7-tuple ``(train_sequences, train_categories, test_sequences,
        test_categories, train_fileids, test_fileids, word_index)``.
    """
    # TEST_SPLIT no longer drives the split (the Reuters file ids do); it is
    # only part of the cache file name. Defaulting to 0.2 keeps the name of
    # the cache file used so far when the key is absent from `params`.
    dataset_filename = "dataset/dataset-TESTSPLIT_{}_NB_WORDS_{}_MAX_SEQ_LENGTH_{}.pkl".format(
        params.get('TEST_SPLIT', 0.2),
        params['MAX_NB_WORDS'],
        params['MAX_SEQUENCE_LENGTH']
    )

    print("Searching for dataset file {}".format(dataset_filename))
    try:
        # NOTE(security): unpickling can execute arbitrary code; only load
        # cache files that were produced locally by this function.
        with open(dataset_filename, "rb") as cache_file:
            file_content = pickle.load(cache_file)
    except (OSError, IOError):
        print("File not found")
    else:
        # Callers unpack seven values. A cache written without the word index
        # (six values) is treated as stale and rebuilt below.
        if isinstance(file_content, tuple) and len(file_content) == 7:
            print("File found")
            return file_content
        print("File found but in an outdated format, rebuilding it")

    nltk.download('stopwords')
    nltk.download('reuters')
    nltk.download('punkt')

    categories_to_idx = {c: i for i, c in enumerate(reuters.categories())}

    stop_words = set(stopwords.words("english"))
    stemmer = SnowballStemmer('english')
    train_set = []
    test_set = []
    for file_id in reuters.fileids():
        txt_lower = reuters.raw(file_id).lower()
        list_of_words = [stemmer.stem(w) for w in word_tokenize(txt_lower) if w not in stop_words]
        val = (" ".join(list_of_words),
               [categories_to_idx[c] for c in reuters.categories(file_id)],
               file_id)
        # File ids look like "training/<n>" or "test/<n>".
        if "train" in file_id:
            train_set.append(val)
        else:
            test_set.append(val)

    # Fixing the classes up front gives both label matrices the same columns,
    # whatever categories happen to occur in each split.
    mlb = MultiLabelBinarizer(classes=list(range(len(categories_to_idx))))

    # Shuffle each split on its own so train and test documents never mix.
    shuffle(train_set)
    shuffle(test_set)

    # Fit the tokenizer on train texts only, so the vocabulary is not
    # influenced by the test documents.
    tokenizer = Tokenizer(num_words=params['MAX_NB_WORDS'])
    tokenizer.fit_on_texts((txt for txt, _, _ in train_set))

    # Convert them to indices and truncate them if they are too large
    train_sequences = pad_sequences(
        sequences=tokenizer.texts_to_sequences((txt for txt, _, _ in train_set)),
        maxlen=params['MAX_SEQUENCE_LENGTH'])
    test_sequences = pad_sequences(
        sequences=tokenizer.texts_to_sequences((txt for txt, _, _ in test_set)),
        maxlen=params['MAX_SEQUENCE_LENGTH'])
    train_categories = mlb.fit_transform([categories for _, categories, _ in train_set])
    test_categories = mlb.transform([categories for _, categories, _ in test_set])
    train_fileids = [fileid for _, _, fileid in train_set]
    test_fileids = [fileid for _, _, fileid in test_set]

    result = (train_sequences, train_categories, test_sequences, test_categories,
              train_fileids, test_fileids, tokenizer.word_index)

    # Cache exactly what is returned, so a cache hit and a fresh build hand
    # the caller the same 7-tuple. Create the directory first so the costly
    # preprocessing above is not lost to a missing folder.
    os.makedirs(os.path.dirname(dataset_filename), exist_ok=True)
    with open(dataset_filename, "wb") as cache_file:
        pickle.dump(result, cache_file)

    return result

In [171]:
# Unpack the seven values returned by make_data: padded sequences, label
# matrices and file ids for each split, plus the tokenizer's word index.
train_sequences, train_categories, test_sequences, test_categories, train_fileids, test_fileids, word_index = make_data(params)

Searching for dataset file dataset/dataset-TESTSPLIT_0.2_NB_WORDS_10000_MAX_SEQ_LENGTH_300.pkl
File found


In [172]:
# Stack the train split followed by the test split (along the first axis) so
# the whole corpus can be processed in one pass; the three arrays stay aligned.
all_sequences = np.concatenate([train_sequences, test_sequences])
all_categories = np.concatenate([train_categories, test_categories])
all_fileids = np.concatenate([train_fileids, test_fileids])

## Build the recurrent model

In [173]:
def make_model(params):
    """Assemble the LSTM classifier and a companion feature-extraction model.

    The network is: int32 token indices -> Embedding -> LSTM -> Dense ->
    Dropout -> softmax Dense. The embedding is created without pretrained
    weights. The compiled model's summary is printed as a side effect.

    Args:
        params: dict providing 'MAX_NB_WORDS', 'EMBEDDING_DIM',
            'MAX_SEQUENCE_LENGTH', 'NUM_LSTM', 'RATE_DROP_LSTM', 'NUM_DENSE',
            'ACTIVATION_FN', 'RATE_DROP_DENSE' and 'NUM_CLASSES'.

    Returns:
        (model, model_bis): the compiled classifier, and an uncompiled model
        built on the same input and layers whose output is the tensor that
        feeds the final classification layer.
    """
    seq_len = params['MAX_SEQUENCE_LENGTH']
    lstm_drop = params['RATE_DROP_LSTM']

    embedding_layer = Embedding(
        input_dim=params['MAX_NB_WORDS'],
        output_dim=params['EMBEDDING_DIM'],
        input_length=seq_len)
    lstm_layer = LSTM(params['NUM_LSTM'], dropout=lstm_drop, recurrent_dropout=lstm_drop)

    sequence_1_input = Input(shape=(seq_len,), dtype='int32')
    features = lstm_layer(embedding_layer(sequence_1_input))

    features = Dense(params['NUM_DENSE'], activation=params['ACTIVATION_FN'])(features)
    features = Dropout(params['RATE_DROP_DENSE'])(features)

    # NOTE(review): softmax + categorical cross-entropy treat the classes as
    # mutually exclusive, while make_data builds multi-hot labels with
    # MultiLabelBinarizer -- confirm this is intended.
    preds = Dense(params['NUM_CLASSES'], activation='softmax')(features)

    model = Model(inputs=[sequence_1_input], outputs=preds)
    # NOTE(review): the optimizer is given by name, so params['LR'] is not
    # applied here -- confirm.
    model.compile(loss='categorical_crossentropy', optimizer='nadam', metrics=['acc'])
    model.summary()

    # Same graph, truncated before the classification layer.
    model_bis = Model(inputs=[sequence_1_input], outputs=features)
    return model, model_bis
# Instantiate the classifier and its feature-extraction twin; this also
# prints the model summary.
model, model_bis = make_model(params)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 300, 100)          1000000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 300)               481200    
_________________________________________________________________
dense_5 (Dense)              (None, 200)               60200     
_________________________________________________________________
dense_6 (Dense)              (None, 90)                18090     
Total params: 1,559,490
Trainable params: 1,559,490
Non-trainable params: 0
_________________________________________________________________


In [174]:
# Restore previously saved weights into the classifier. model_bis is built on
# the same layer objects (see make_model), so it uses these weights as well.
model.load_weights("model-20171214-023454.h5")

In [175]:
# Run every document (train + test) through model_bis to obtain the
# activations that feed the classification layer.
all_representations = model_bis.predict(all_sequences)

In [176]:
# Persist the representations together with their file ids. The `with` block
# guarantees the file is flushed and closed once the dump completes.
with open("model-20171214-representations.pkl", "wb") as representations_file:
    pickle.dump((all_representations, all_fileids), representations_file)

array(['training/7275', 'test/15190', 'training/6208', ..., 'test/15695',
       'training/7359', 'training/1562'],
      dtype='<U14')