In [1]:
import pandas as pd

# Read the spreadsheet found at the path "data" into a DataFrame named `data`.
data = pd.read_excel(io="data")

In [None]:
import import_ipynb
from clean_sentence import expand_contractions, remove_special, lemate_tokenize, stem_tokenize

# Sentence normalisation: expand_contractions runs first, remove_special runs on
# its output, and stem_tokenize turns the cleaned text into the "words" column.
_normalised = data["sentence"].apply(expand_contractions).apply(remove_special)
data["clean_sentence"] = _normalised
data["words"] = _normalised.apply(stem_tokenize)

In [None]:
import pandas as pd
import pickle

def save_model(model, location):
    """Pickle ``model`` into the file at ``location``.

    Args:
        model: Any picklable object.
        location: Path of the file to write. It is opened in binary write
            mode, so existing content is replaced.
    """
    # The context manager flushes and closes the handle even if pickling raises.
    with open(location, 'wb') as file:
        pickle.dump(model, file)

def load_model(location):
    """Unpickle and return the object stored in the file at ``location``.

    Args:
        location: Path of a pickle file; it is opened in binary read mode.

    Returns:
        The unpickled object.

    Note:
        Unpickling can execute arbitrary code, so only load files that come
        from a trusted source.
    """
    # The context manager closes the handle even if unpickling raises.
    with open(location, 'rb') as file:
        return pickle.load(file)

# The embedding model is trained by the train_word2vec script; here it is only
# read back from its pickle file.
word2vec = load_model(location="word2vec.pickle")

In [None]:
import numpy as np

# Row 0 is reserved for unknown vocabulary and stays all-zero; the vector of the
# word at word2vec index i is stored in row i + 1.
embedding_matrix = np.zeros((len(word2vec.wv.vocab) + 1, word2vec.vector_size))
for i, word in enumerate(word2vec.wv.index2word):
    # Every word in index2word belongs to the vocabulary, so the lookup yields a
    # vector and the former `is not None` guard could never be false.
    embedding_matrix[i + 1] = word2vec.wv[word]

In [None]:
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 200


def _token_ids(tokens, vocab):
    """Map tokens to ``vocab index + 1``; tokens missing from ``vocab`` map to 0."""
    return [vocab[token].index + 1 if token in vocab else 0 for token in tokens]


# Encode every tokenised comment, then pad at the front / truncate at the back
# so that each row has exactly MAX_SEQUENCE_LENGTH entries.
X_vocab = pad_sequences(
    [_token_ids(comment, word2vec.wv.vocab) for comment in data["words"].tolist()],
    maxlen=MAX_SEQUENCE_LENGTH,
    padding="pre",
    truncating="post",
)

In [None]:
from keras.layers import Dense, Input, Embedding, Dropout, SpatialDropout1D, Conv1D, GlobalAveragePooling1D, Lambda, AveragePooling1D, concatenate
from keras.models import Model
from keras.optimizers import Adam
from keras.layers.normalization import BatchNormalization
import keras.backend as K
import tensorflow as tf

def weighted_binary_crossentropy(y_true, y_pred):
    """Binary cross-entropy in which one masked group of elements is down-weighted.

    Elements are split into two groups by comparing ``y_true`` and ``y_pred``
    with a fixed 0.5 threshold. Cross-entropy is computed separately for each
    group (with the other group's entries zeroed out in both tensors), the
    masked group's term is scaled by 0.2, and the two terms are added.

    NOTE(review): the variable names suggest the mask should select false
    positives, but ``K.greater_equal(thresh, y_pred)`` evaluates
    ``thresh >= y_pred`` and ``K.less_equal(thresh, y_true)`` evaluates
    ``thresh <= y_true``; the mask is set wherever those two results are
    equal, which appears to cover false negatives as well as false positives.
    Confirm the intended behaviour before relying on this loss.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.

    Returns:
        Tensor holding the weighted sum of the two cross-entropy terms.
    """
    false_positive_weight = 0.2
    thresh = 0.5
    # True where thresh >= y_pred (despite the name, this is "prediction at or below threshold").
    y_pred_true = K.greater_equal(thresh, y_pred)
    # True where thresh <= y_true (despite the name, this is "label at or above threshold").
    y_not_true = K.less_equal(thresh, y_true)
    
    # Mask is 1.0 where the two boolean tensors above agree, 0.0 elsewhere.
    false_positive_tensor = K.equal(y_pred_true, y_not_true)
    false_positive_tensor = K.cast(false_positive_tensor, 'float32')
    
    # Zero out everything outside the masked group ...
    complement = 1 - false_positive_tensor
    falsePosGroupTrue = y_true * false_positive_tensor
    falsePosGroupPred = y_pred * false_positive_tensor
    
    # ... and everything inside it for the complementary group.
    nonFalseGroupTrue = y_true * complement
    nonFalseGroupPred = y_pred * complement
    
    falsePosLoss = K.binary_crossentropy(falsePosGroupTrue, falsePosGroupPred)
    nonFalseLoss = K.binary_crossentropy(nonFalseGroupTrue, nonFalseGroupPred)
    
    return (false_positive_weight * falsePosLoss) + nonFalseLoss

def f1_loss(y_true, y_pred):
    """Soft-F1 loss added to binary cross-entropy.

    True positives, false positives and false negatives are accumulated as
    sums of products over axis 0, so they are "soft" counts when ``y_pred``
    holds probabilities. Precision, recall and F1 are derived from them with
    ``K.epsilon()`` in each denominator, and NaN F1 entries are replaced by 0.

    Args:
        y_true: Tensor of target values.
        y_pred: Tensor of predicted values.

    Returns:
        ``(1 - mean F1)``, a scalar, added to the element-wise binary
        cross-entropy of ``y_true`` and ``y_pred``.
    """
    # True negatives are not needed for precision, recall or F1, so they are
    # no longer computed.
    tp = K.sum(K.cast(y_true * y_pred, 'float'), axis=0)
    fp = K.sum(K.cast((1 - y_true) * y_pred, 'float'), axis=0)
    fn = K.sum(K.cast(y_true * (1 - y_pred), 'float'), axis=0)

    precision = tp / (tp + fp + K.epsilon())
    recall = tp / (tp + fn + K.epsilon())

    f1 = 2 * precision * recall / (precision + recall + K.epsilon())
    f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
    return (1 - K.mean(f1)) + K.binary_crossentropy(y_true, y_pred)

def _conv_block(tensor, filters, pool=True):
    """Apply Conv1D(kernel 7, ReLU) -> SpatialDropout1D -> BatchNormalization [-> AveragePooling1D(2)]."""
    tensor = Conv1D(filters, 7, padding='same', activation='relu')(tensor)
    tensor = SpatialDropout1D(0.5)(tensor)
    tensor = BatchNormalization()(tensor)
    if pool:
        tensor = AveragePooling1D(2)(tensor)
    return tensor


def get_model(num_classes, sequence_length=None):
    """Build, compile and summarise the convolutional multi-output classifier.

    The network embeds the token indices with the module-level
    ``embedding_matrix`` (kept trainable), runs three convolutional blocks
    with 128, 256 and 512 filters, pools globally, passes through one dense
    layer and ends in ``num_classes`` independent single-unit sigmoid outputs,
    each trained with binary cross-entropy.

    Args:
        num_classes: Number of sigmoid output heads to create.
        sequence_length: Length of each input sequence. Defaults to the
            module-level ``MAX_SEQUENCE_LENGTH`` so the model always matches
            the padded input instead of repeating the literal 200.

    Returns:
        The compiled Keras ``Model``. ``model.summary()`` is printed as a
        side effect.
    """
    if sequence_length is None:
        sequence_length = MAX_SEQUENCE_LENGTH

    wv_layer = Embedding(embedding_matrix.shape[0],
                         embedding_matrix.shape[1],
                         weights=[embedding_matrix],
                         input_length=sequence_length,
                         trainable=True)

    # Inputs
    comment_input = Input(shape=(sequence_length,))
    embedded_sequences = wv_layer(comment_input)
    embedded_sequences = SpatialDropout1D(0.5)(embedded_sequences)

    # Convolutional stack; the last block is followed by global pooling
    # instead of AveragePooling1D.
    activations = _conv_block(embedded_sequences, 128)
    activations = _conv_block(activations, 256)
    activations = _conv_block(activations, 512, pool=False)
    x = GlobalAveragePooling1D()(activations)

    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = BatchNormalization()(x)

    # One independent sigmoid head (and one loss entry) per class.
    outputs = [Dense(1, activation='sigmoid')(x) for _ in range(num_classes)]
    losses = ['binary_crossentropy'] * num_classes

    # build the model
    model = Model(inputs=[comment_input], outputs=outputs)
    model.compile(loss=losses, optimizer='adam')
    model.summary()
    return model

In [None]:
from sklearn.model_selection import train_test_split

# Boolean row masks taken from the "split" column.
train = (data["split"] == "train")
test = (data["split"] == "test")
# NOTE(review): `validate` is not used by the cells visible here.
validate = (data["split"] == "validate")

categories = ["Positive", "Negative", "Neutral"]

# One-hot encode the sentiment labels. reindex fixes the column order to
# `categories` and adds an all-zero column for any category that does not occur
# in a split, where plain column selection would raise KeyError.
y_train = pd.get_dummies(data.loc[train, "sentiment"]).reindex(columns=categories, fill_value=0)
# X_vocab is a NumPy array, so it is indexed with a plain boolean array
# (positional) rather than with the pandas Series itself.
x_train = X_vocab[train.to_numpy()]

y_test = pd.get_dummies(data.loc[test, "sentiment"]).reindex(columns=categories, fill_value=0)
x_test = X_vocab[test.to_numpy()]

In [None]:
#Generate and Train Model

from sklearn.metrics import accuracy_score, r2_score, f1_score, confusion_matrix
from sklearn.utils import shuffle, class_weight
from keras.callbacks import EarlyStopping

print('**Training Model')
x_train, y_train = shuffle(x_train, y_train)

# Balanced weight per category, computed from the label of each training row.
# The arguments are passed by keyword because recent scikit-learn releases no
# longer accept `classes` and `y` positionally; the keyword form also works on
# older releases.
class_weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=y_train.columns.to_numpy(),
    y=y_train.idxmax(axis=1),
)
# Keras expects the weights keyed by integer class index.
class_weights = dict(enumerate(class_weights))

model = get_model(len(categories))
# Each output head receives its own target vector: one column of y per head.
# NOTE(review): the test split is passed as validation data here, and a single
# class_weight dict is handed to a multi-output model; confirm both are intended.
model.fit(
    [x_train],
    y_train.to_numpy().T.tolist(),
    epochs=40,
    batch_size=512,
    validation_data=([x_test], y_test.to_numpy().T.tolist()),
    class_weight=class_weights,
)

In [None]:
#Evaluate Model

from sklearn.metrics import accuracy_score, f1_score, confusion_matrix

# Stack the per-head predictions, transpose and take the first slice to get
# one row per sample with one score per category.
y_pred = np.array(model.predict([x_test])).T[0]

# Take the largest value per row and one-hot encode it. reindex guarantees one
# column per category even when some category is never predicted; without it
# the DataFrame constructor fails on a column-count mismatch.
predicted_class = np.argmax(y_pred, axis=1)
one_hot = pd.get_dummies(predicted_class).reindex(columns=range(len(categories)), fill_value=0)
y_pred = pd.DataFrame(one_hot.values, columns=categories)

# Per-category binary metrics.
for category in categories:
    expected = y_test[category].tolist()
    predicted = y_pred[category].tolist()
    print("{}**".format(category))
    print(confusion_matrix(expected, predicted))
    print(accuracy_score(expected, predicted))
    print(f1_score(expected, predicted))

# Multi-class metrics on the winning label of each row.
expected_labels = y_test[categories].idxmax(axis=1)
predicted_labels = y_pred[categories].idxmax(axis=1)

print("\nTotal")
print(confusion_matrix(expected_labels, predicted_labels))
print(accuracy_score(expected_labels, predicted_labels))
print(f1_score(expected_labels, predicted_labels, average='macro'))

In [None]:
#Use Model on new dataset

newdata = pd.read_excel("data")

# Apply the same cleaning steps as for the training data, this time to the
# freshly loaded frame (the training frame `data` is left untouched).
newdata["clean_sentence"] = newdata["sentence"].apply(expand_contractions)
newdata["clean_sentence"] = newdata["clean_sentence"].apply(remove_special)
newdata["words"] = newdata["clean_sentence"].apply(stem_tokenize)

# Encode tokens as vocabulary index + 1 (0 for tokens missing from the
# vocabulary) and pad/truncate to the length defined by MAX_SEQUENCE_LENGTH.
X_new = [
    [word2vec.wv.vocab[t].index + 1 if t in word2vec.wv.vocab else 0 for t in comment]
    for comment in newdata["words"].tolist()
]
X_new = pad_sequences(X_new, maxlen=MAX_SEQUENCE_LENGTH, padding="pre", truncating="post")

# One row per sentence, one score column per category, appended to the frame.
predictions = pd.DataFrame(np.array(model.predict(X_new)).T[0], columns=categories)
newdata = pd.concat([newdata.reset_index(), predictions], axis=1)