In [None]:
import import_ipynb
from clean_sentence import expand_contractions, remove_special, lemate_tokenize, stem_tokenize, split_sentences

# Break every test sentence into sub-sentences, then give each sub-sentence its
# own row. The exploded column has to be the one created on the previous line
# ("split_sentence"); exploding the non-existent "split_sentences" raises KeyError.
test["split_sentence"] = test["sentence"].apply(split_sentences)
test = test.explode("split_sentence")

# NOTE(review): the cleaning below still reads the full "sentence" column, so every
# exploded row carries the same cleaned text as its siblings. If the intent is to
# clean each sub-sentence, the source column should be "split_sentence" -- confirm.
test["clean_sentence"] = test["sentence"].apply(expand_contractions)
test["clean_sentence"] = test["clean_sentence"].apply(remove_special)
test["words"] = test["clean_sentence"].apply(stem_tokenize)

In [None]:
# Build the cleaned text in one chained pass, then tokenize it into the "words" column.
train["clean_sentence"] = (
    train["sentence"]
    .apply(expand_contractions)
    .apply(remove_special)
)
train["words"] = train["clean_sentence"].apply(stem_tokenize)

Using the train_CNN script, train a CNN on the training data, then use it to label the testing data and keep only the sentences where the trained CNN agrees with the existing label.
This ensures that the sentences split on conjunctions each have only one topic and are correctly labeled.

In [None]:
import pandas as pd
import pickle

def save_model(model, location):
    """Pickle ``model`` to the file at ``location``.

    Args:
        model: Any picklable object.
        location: Path of the file to write (overwritten if it exists).
    """
    # The context manager guarantees the handle is flushed and closed even if
    # pickling fails part-way; the previous version never closed the file.
    with open(location, 'wb') as file:
        pickle.dump(model, file)

def load_model(location):
    """Load and return the object pickled in the file at ``location``.

    Args:
        location: Path of a pickle file.

    Returns:
        The unpickled object.
    """
    # SECURITY: pickle.load can execute arbitrary code; only use this on files
    # produced by a trusted source.
    # The context manager closes the handle, which the previous version leaked.
    with open(location, 'rb') as file:
        return pickle.load(file)

# Word2vec model produced beforehand by the train_word2vec script.
word2vec = load_model(location="word2vec.pickle")

In [None]:
import numpy as np

# One extra leading row (index 0, left as zeros) is reserved for unknown vocab;
# the word at word2vec index idx is therefore stored in row idx + 1.
vocab_size = len(word2vec.wv.vocab)
embedding_matrix = np.zeros((vocab_size + 1, word2vec.vector_size))
for idx in range(vocab_size):
    token = word2vec.wv.index2word[idx]
    vector = word2vec.wv[token]
    if vector is None:
        continue
    embedding_matrix[idx + 1] = vector

In [None]:
from keras.layers import Dense, Input, CuDNNLSTM, Embedding, Dropout, Reshape, SpatialDropout1D, Bidirectional, Conv1D, GlobalMaxPooling1D, GlobalAveragePooling1D, concatenate, Lambda, TimeDistributed, Multiply, LSTM, RepeatVector, Permute, Activation, MaxPooling1D, AveragePooling1D, concatenate
from keras.models import Model
from keras.layers.normalization import BatchNormalization
import keras.backend as K
import tensorflow as tf

def _conv_block(x, filters, pool=True):
  """Apply Conv1D(kernel 7, relu) -> SpatialDropout1D(0.5) -> BatchNormalization, then optionally AveragePooling1D(2)."""
  x = Conv1D(filters, 7, padding='same', activation='relu')(x)
  x = SpatialDropout1D(0.5)(x)
  x = BatchNormalization()(x)
  if pool:
    x = AveragePooling1D(2)(x)
  return x


def get_model(num_classes, sequence_length=200):
  """Build and compile the masked-topic CNN classifier.

  The model takes two inputs of length ``sequence_length``: a sequence of
  token indices (rows of ``embedding_matrix``) and a mask sequence. The mask
  is appended to the embedded tokens as one extra channel, the result goes
  through three convolutional blocks (128, 256, 512 filters), global average
  pooling, a 256-unit dense layer and a softmax over ``num_classes``.

  Args:
    num_classes: Number of softmax output units.
    sequence_length: Length of both input sequences. Defaults to 200, the
      value that was previously hard-coded; it must equal the ``maxlen`` used
      when padding the inputs.

  Returns:
    A Keras ``Model`` compiled with categorical cross-entropy and Adam. The
    model summary is printed as a side effect.
  """
  # Embedding initialised from the word2vec vectors and fine-tuned during training.
  wv_layer = Embedding(embedding_matrix.shape[0],
                       embedding_matrix.shape[1],
                       weights=[embedding_matrix],
                       input_length=sequence_length,
                       trainable=True)

  # Inputs
  comment_input = Input(shape=(sequence_length,))
  mask_input = Input(shape=(sequence_length,))
  # Give the mask a trailing channel axis so it can be concatenated with the embeddings.
  mask_input_r = Reshape((sequence_length, 1))(mask_input)

  embedded_sequences = wv_layer(comment_input)
  embedded_sequences = SpatialDropout1D(0.5)(embedded_sequences)

  embedded_sequences = concatenate([embedded_sequences, mask_input_r], axis=-1)

  # Two pooled conv blocks, then a final block that is reduced by global pooling instead.
  activations = _conv_block(embedded_sequences, 128)
  activations = _conv_block(activations, 256)
  activations = _conv_block(activations, 512, pool=False)
  x = GlobalAveragePooling1D()(activations)

  x = Dense(256, activation='relu')(x)
  x = Dropout(0.5)(x)
  x = BatchNormalization()(x)

  outputs = Dense(num_classes, activation='softmax')(x)
  losses = 'categorical_crossentropy'

  # build the model
  model = Model(inputs=[comment_input, mask_input], outputs=outputs)

  model.compile(loss=losses, optimizer='adam')
  model.summary()
  return model

In [None]:
import random
from keras.preprocessing.sequence import pad_sequences

# Length every token-index and mask sequence is padded or truncated to by pad_sequences.
# get_model uses an input length of 200, so the two values must stay equal.
MAX_SEQUENCE_LENGTH = 200
from clean_sentence import TOPICS

# This generator constructs new sentences by taking old ones and concatenating them together with conjunctions.
# It creates masks for important words (aka. topics).
def generate_data(data, batch_size, max_old_size, num_classes):
  """Endlessly yield ``[[x_data, mask_data], y_data]`` batches of synthetic examples.

  Each synthetic sentence is made of 1-3 rows sampled from ``data``, joined
  (5 times out of 6) by a random conjunction. For every token of the sentence
  that is in ``TOPICS`` one example is produced: the whole sentence, a mask
  with a single 1 at that token, and the label of the row the token came from.
  The first such example goes into the current batch; the others are kept in
  a bounded buffer and may be reused in later batches.

  Args:
    data: DataFrame with a "words" column (list of tokens per row) and a
      "labels" column (integer class per row).
    batch_size: Number of examples per yielded batch.
    max_old_size: Maximum number of buffered examples; when exceeded, one
      buffered example is evicted at random.
    num_classes: Width of the one-hot encoded labels.

  Yields:
    ``[[x_data, mask_data], y_data]`` where ``x_data`` and ``mask_data`` are
    padded to ``MAX_SEQUENCE_LENGTH`` and ``y_data`` is one-hot encoded.
  """
  conjunctions = ["and", "or", "but", "howev", "although", "moreov", 'also', 'further', 'furthermor', 'so']
  data = data.reset_index(drop=True)
  # Examples are dicts with keys 'x', 'mask', 'y' held in plain lists. Growing a
  # DataFrame row by row with DataFrame.append copies the frame on every call
  # (quadratic) and the method no longer exists in pandas >= 2.0.
  old_concats = []
  while True:
    new_concats = []

    # Start the batch with a random number of buffered examples, if enough are available.
    num_old = random.randint(0, batch_size)
    if num_old < len(old_concats):
      random.shuffle(old_concats)
      new_concats, old_concats = old_concats[:num_old], old_concats[num_old:]

    while len(new_concats) < batch_size:
      num_sent = random.randint(1, 3)
      chosen = data.sample(num_sent)
      X = []
      Y = []
      for j in range(num_sent):
        words = chosen["words"].iloc[j]
        X = X + words
        # Per-token labels, so a topic token can be mapped back to its source row's label.
        Y = Y + [chosen["labels"].iloc[j]] * len(words)
        if (j < num_sent-1) and random.randint(0, 5) > 0:
          X = X + [random.choice(conjunctions)]
          Y = Y + [-1]  # the joining conjunction belongs to no source row

      added = False
      for i in range(len(X)):
        if X[i] in TOPICS:
          mask = [0]*len(X)
          mask[i] = 1
          example = {'x': X, 'mask': mask, 'y': Y[i]}
          if added:
            # Further topics of the same sentence are buffered for later batches.
            old_concats.append(example)
            if len(old_concats) > max_old_size:
              old_concats.pop(random.randrange(len(old_concats)))
          else:
            added = True
            new_concats.append(example)

    # Index 0 stands for out-of-vocabulary tokens (and is also the padding value); known tokens are shifted by one.
    x_data = [[word2vec.wv.vocab[t].index +1 if t in word2vec.wv.vocab else 0 for t in example['x']] for example in new_concats]
    x_data = pad_sequences(x_data, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
    y_data = np.eye(num_classes)[[example['y'] for example in new_concats]]
    mask_data = pad_sequences([example['mask'] for example in new_concats], maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
    yield [[x_data, mask_data], y_data]

In [None]:
from sklearn.metrics import accuracy_score, r2_score, f1_score, confusion_matrix
from sklearn.utils import shuffle, class_weight
from keras.callbacks import EarlyStopping

print('**Training Model')

batch_size = 128
# The softmax width of the model and the one-hot width produced by the generators
# must be the same number; building the model with 5 classes while the generators
# encode 3 makes the target shape disagree with the model output.
# NOTE(review): 5 is kept because it also accommodates labels in 0..2 -- confirm
# the real number of classes in the "labels" column.
NUM_CLASSES = 5
model = get_model(NUM_CLASSES)
model.fit_generator(
    generate_data(train, batch_size, 1000, NUM_CLASSES),
    epochs=100,
    steps_per_epoch = 200,
    validation_data = generate_data(test, batch_size, 1000, NUM_CLASSES),
    validation_steps = 2
    )
model.save_weights("keras_topic_sentiment.h5")

In [None]:
sent = "Fantastic car, although has one problem with a rattling sound coming from the steering column."
# Use the stemming tokenizer that produced the "words" column used for training;
# the name `tokenize` is never imported in this notebook.
sent = stem_tokenize(remove_special(expand_contractions(sent)))
# Map out-of-vocabulary tokens to index 0 exactly as generate_data does. Dropping
# them would shorten x_data relative to the mask and shift the marked position
# away from the topic word.
x_data = [[word2vec.wv.vocab[w].index +1 if w in word2vec.wv.vocab else 0 for w in sent]]
x_data = pad_sequences(x_data, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

# Mark the single topic token the prediction should focus on.
mask = [0]*len(sent)
mask[sent.index('steer')] = 1
mask_data = pad_sequences([mask], maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")
model.predict([x_data, mask_data]).tolist()