In [3]:
"""
Load the data and split into training and testing.
"""

from sklearn.model_selection import train_test_split
import os
import numpy as np

# os.path.abspath('') resolves to the current working directory; the data
# files are expected two directory levels above it, under 'data'.
dirname = os.path.abspath('')

# Load the feature array first, then the label array.
X, y = (
    np.load(os.path.join(dirname, '..', '..', 'data', name))
    for name in ('X.npy', 'y.npy')
)

# Hold out 33% of the samples; the fixed random_state keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=576,
)

# Only the split arrays are used from here on, so drop the full arrays.
del X, y


In [42]:
"""
Train model
"""

from keras.models import Sequential
from keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Activation
from keras import backend as K
from keras.regularizers import l2

# Hyperparameters for the convolutional classifier.
batch_size = 64
maxlen = 300  # number of token positions per input sample
embedding_dims = 100  # length of the token vectors
filters = 128  # number of filters in the Convnet
kernel_size = 5  # a window size of 5 tokens
epochs = 8

# Conv1D -> global max pooling -> dropout -> dense -> single sigmoid unit.
model = Sequential([
    Conv1D(
        filters,
        kernel_size,
        strides=1,
        padding='valid',
        activation='relu',
        input_shape=(maxlen, embedding_dims),
        kernel_regularizer=l2(0.01),
        bias_regularizer=l2(0.01),
    ),
    GlobalMaxPooling1D(),
    Dropout(0.1),
    Dense(256, activation='leaky_relu'),
    Dense(1, activation='sigmoid'),
])

# Alternative compile configuration from an earlier experiment, kept for reference:
#model.compile(loss = 'mean_squared_error',optimizer = 'adam', metrics=[soft_acc])
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# The held-out split is passed as validation data, so it is evaluated after
# every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(X_test, y_test),
)

# The split arrays are deleted once training finishes; the data-loading cell
# must be re-run before this cell can be executed again.
del X_train, X_test, y_train, y_test

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Shared lemmatizer instance used by convert().
wnl = WordNetLemmatizer()

# NLTK's English stop words with punctuation removed, using the same pattern
# that convert() applies to its input, so that e.g. contractions still match
# once the apostrophe has been stripped.
stop_words = {re.sub(r'[^\w\s]+', '', word) for word in stopwords.words('english')}

from gensim.models import KeyedVectors
import os

# Locate the embedding file relative to the current working directory
# (os.path.abspath('') resolves to it): two levels up, inside 'data'.
dirname = os.path.abspath('')
filename = os.path.join(
    dirname, '..', '..', 'data', 'word2vec.100d.txt',
)

# binary=False: the file is read as the plain-text word2vec format.
word2vec_model = KeyedVectors.load_word2vec_format(
    filename,
    binary=False,
)

def vectorize(sentence):
    """
    Takes a list of words and returns a fixed-size matrix of word embeddings.

    At most the first ``maxlen`` words are used. Each one is looked up in
    ``word2vec_model``; a word whose lookup raises ``KeyError`` is represented
    by a zero vector. Rows beyond the end of the sentence are zero padding.

    Parameters
    ----------
    sentence : iterable of str
        The tokens to embed, in order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(maxlen, embedding_dims)``.
    """

    # Allocate zeros rather than uninitialized memory: padding rows and
    # unknown-word rows are then already correct, and no row can be left
    # holding arbitrary values (padding from index count + 1 skipped one row).
    out = np.zeros((maxlen, embedding_dims))

    # zip() with range(maxlen) stops after maxlen words, so longer sentences
    # are truncated without iterating over the remainder.
    for i, word in zip(range(maxlen), sentence):
        try:
            out[i] = word2vec_model[word]
        except KeyError:
            # Out-of-vocabulary word: keep the zero row.
            pass

    return out


def convert(sentence):
    """
    Turn a raw sentence into a single-sample input array for the model.

    Steps: remove punctuation and lowercase, split on whitespace, drop stop
    words, lemmatize each word (first with the lemmatizer's default part of
    speech, then with ``pos='v'``), embed with ``vectorize`` and reshape to a
    batch of one.

    Parameters
    ----------
    sentence : str
        The raw text to convert.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, maxlen, embedding_dims)``.
    """
    cleaned = re.sub(r'[^\w\s]+', '', sentence).lower()

    # re.split yields '' for leading/trailing whitespace and for empty input.
    # Those empty strings are not stop words, so they would otherwise occupy a
    # position as a zero vector and shift the real tokens; drop them here.
    tokens = [token for token in re.split(r'\s+', cleaned) if token]

    tokens = [token for token in tokens if token not in stop_words]

    tokens = [wnl.lemmatize(wnl.lemmatize(token), pos='v') for token in tokens]

    vectors = vectorize(tokens)

    # Add the leading batch dimension.
    return np.reshape(vectors, (1, maxlen, embedding_dims))



In [51]:
# Spot-check the trained model on one hand-written sentence: convert() turns
# the raw string into a (1, maxlen, embedding_dims) array, which is passed to
# model.predict as a batch of one sample.
# NOTE(review): the final layer is a single sigmoid unit, so the printed value
# lies between 0 and 1; which class a high score stands for depends on how
# y.npy was labelled — confirm against the data preparation step.
a = model.predict(convert("I fucking hate you, go to hell!"))

print(a)

[[0.7911954]]
