In [98]:
import glob
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.initializers import Constant
import os
from nltk.corpus import stopwords
import pandas as pd
from keras import models
from keras import layers
from keras.layers import *
from keras import regularizers
import csv

In [2]:
def read_files_to_df(path):
    """Read every file matching a glob pattern into a one-column DataFrame.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to load (e.g. ``'some/dir/*.txt'``).

    Returns
    -------
    pandas.DataFrame
        One row per matched file, holding the file's full contents in the
        ``text`` column. Rows follow the sorted order of the matched paths;
        the frame is empty when nothing matches.
    """
    contents = []
    # glob.glob() returns matches in arbitrary, OS-dependent order. Sorting
    # makes the row order -- and therefore any seeded split built on top of
    # it -- reproducible across machines and runs.
    for file in sorted(glob.glob(path)):
        # Explicit encoding so the result does not depend on the platform
        # default (assumes the corpus is UTF-8 -- TODO confirm).
        with open(file, encoding='utf-8') as f:
            contents.append(f.read())
    return pd.DataFrame(contents, columns=['text'])

def remove_stopwords(input_text):
    """Drop English stopwords and one-character tokens from a text.

    The text is split on whitespace; a token is kept when it is longer than
    one character and is either not an NLTK English stopword or is one of
    the whitelisted negations. Matching is case-sensitive, exactly as the
    tokens appear in ``input_text``.

    Parameters
    ----------
    input_text : str
        Raw text to filter.

    Returns
    -------
    str
        The surviving tokens joined by single spaces.
    """
    # Build the set of removable words once and cache it on the function:
    # reloading the stopword list and scanning it linearly for every token of
    # every document is needless repeated work.
    removable = getattr(remove_stopwords, "_removable", None)
    if removable is None:
        # Some words which might indicate a certain sentiment are kept via a whitelist
        whitelist = {"n't", "not", "no"}
        removable = frozenset(stopwords.words('english')) - whitelist
        remove_stopwords._removable = removable
    clean_words = [word for word in input_text.split()
                   if word not in removable and len(word) > 1]
    return " ".join(clean_words)

In [3]:
SEED = 42

### prepare data
print('preparing data...')
n_data_train = read_files_to_df('group34/data/train/neg/*.txt')
p_data_train = read_files_to_df('group34/data/train/pos/*.txt')
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported equivalent. Order is preserved: negatives first, then positives.
train_data = pd.concat([n_data_train, p_data_train], ignore_index=True)
train_data['text'] = train_data['text'].apply(remove_stopwords)

# Labels follow the concatenation order above: 0 for the neg/ files, 1 for the pos/ files.
n_labels = [0] * len(n_data_train)
p_labels = [1] * len(p_data_train)
labels = n_labels + p_labels

# Hold out 20% of the labelled data for validation; SEED fixes the shuffle.
X_train, X_val, y_train, y_val = train_test_split(train_data.text, labels, test_size=0.20, random_state=SEED)
print('done')

preparing data...
done


In [9]:
# Tokenizer configuration: vocabulary capped through num_words=NB_WORDS,
# punctuation stripped by `filters`, text lower-cased, tokens split on spaces.
NB_WORDS = 6000
tokenizer_options = dict(
    num_words=NB_WORDS,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=" ",
)
tk = Tokenizer(**tokenizer_options)
tk.fit_on_texts(X_train)  # vocabulary is fitted on the training split only

# Encode both splits with the fitted tokenizer.
X_train_seq, X_val_seq = (tk.texts_to_sequences(texts) for texts in (X_train, X_val))

In [10]:
# Number of space-separated tokens in each (stopword-filtered) training text.
seq_lengths = X_train.str.split(' ').str.len()

# Pad both splits to the length of the longest training text.
MAX_LEN = seq_lengths.max()
X_train_seq_padded, X_val_seq_padded = (
    pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (X_train_seq, X_val_seq)
)

In [21]:
# Baseline network: trainable 128-d embedding -> bidirectional LSTM ->
# max-pool over time steps -> small dense head with a sigmoid output.
embed_size = 128
model = models.Sequential([
    Embedding(NB_WORDS, embed_size),
    Bidirectional(LSTM(32, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid"),
])
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 128)         768000    
_________________________________________________________________
bidirectional_5 (Bidirection (None, None, 64)          41216     
_________________________________________________________________
global_max_pooling1d_5 (Glob (None, 64)                0         
_________________________________________________________________
dense_11 (Dense)             (None, 20)                1300      
_________________________________________________________________
dropout_7 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 21        
Total params: 810,537
Trainable params: 810,537
Non-trainable params: 0
_________________________________________________________________
None

In [22]:
# Train the baseline: Adam on binary cross-entropy, accuracy reported on the
# validation split after every epoch.
batch_size = 100
epochs = 8
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
fft = model.fit(
    X_train_seq_padded,
    y_train,
    validation_data=(X_val_seq_padded, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/8
 - 504s - loss: 0.4333 - acc: 0.8030 - val_loss: 0.2871 - val_acc: 0.8816
Epoch 2/8
 - 491s - loss: 0.2293 - acc: 0.9123 - val_loss: 0.2868 - val_acc: 0.8798
Epoch 3/8
 - 488s - loss: 0.1660 - acc: 0.9396 - val_loss: 0.3226 - val_acc: 0.8794
Epoch 4/8
 - 482s - loss: 0.1199 - acc: 0.9597 - val_loss: 0.3585 - val_acc: 0.8734
Epoch 5/8
 - 483s - loss: 0.0788 - acc: 0.9765 - val_loss: 0.4330 - val_acc: 0.8680
Epoch 6/8
 - 484s - loss: 0.0637 - acc: 0.9803 - val_loss: 0.4422 - val_acc: 0.8648
Epoch 7/8
 - 483s - loss: 0.0404 - acc: 0.9886 - val_loss: 0.5230 - val_acc: 0.8660
Epoch 8/8
 - 484s - loss: 0.0373 - acc: 0.9890 - val_loss: 0.5894 - val_acc: 0.8636


In [19]:
# Same layer stack as the 128-d baseline, but with a 300-d trainable embedding.
# NOTE: this rebinds `model`; the network trained here is the one behind `fft2`.
embed_size2 = 300
model = models.Sequential([
    Embedding(NB_WORDS, embed_size2),
    Bidirectional(LSTM(32, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid"),
])
print(model.summary())

# Identical training setup to the baseline run.
batch_size = 100
epochs = 8
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
fft2 = model.fit(
    X_train_seq_padded,
    y_train,
    validation_data=(X_val_seq_padded, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 300)         1800000   
_________________________________________________________________
bidirectional_3 (Bidirection (None, None, 64)          85248     
_________________________________________________________________
global_max_pooling1d_3 (Glob (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_3 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 21        
Total params: 1,886,569
Trainable params: 1,886,569
Non-trainable params: 0
_________________________________________________________________


In [14]:
#download data from https://nlp.stanford.edu/projects/glove/
GLOVE_DIM = 300
glove_file = 'glove.42B.' + str(GLOVE_DIM) + 'd.txt'
emb_dict = {}
glove = open(glove_file)
for line in glove:
    values = line.split()
    word = values[0]
    vector = np.asarray(values[1:], dtype='float32')
    emb_dict[word] = vector
glove.close()

In [16]:
# One row per tokenizer index below NB_WORDS; rows for words that have no
# GloVe vector are left as zeros.
emb_matrix = np.zeros((NB_WORDS, GLOVE_DIM))

for w, i in tk.word_index.items():
    # Skip indices outside the matrix instead of breaking out of the loop:
    # stopping at the first one is only correct if word_index happens to
    # iterate in ascending index order, which this code should not rely on.
    if i >= NB_WORDS:
        continue
    vect = emb_dict.get(w)
    if vect is not None:
        emb_matrix[i] = vect


In [23]:
# Same layer stack as the trainable-embedding models, but the embedding is
# initialised from emb_matrix and frozen (trainable=False).
glove_model = models.Sequential([
    Embedding(NB_WORDS, GLOVE_DIM, embeddings_initializer=Constant(emb_matrix), trainable=False),
    Bidirectional(LSTM(32, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid"),
])
print(glove_model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 300)         1800000   
_________________________________________________________________
bidirectional_6 (Bidirection (None, None, 64)          85248     
_________________________________________________________________
global_max_pooling1d_6 (Glob (None, 64)                0         
_________________________________________________________________
dense_13 (Dense)             (None, 20)                1300      
_________________________________________________________________
dropout_8 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 21        
Total params: 1,886,569
Trainable params: 86,569
Non-trainable params: 1,800,000
_____________________________________________________________

In [25]:
# Train the frozen-GloVe model for 5 epochs with the same optimizer, loss and
# metric as the other runs.
batch_size = 100
epochs = 5
glove_model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
fit = glove_model.fit(
    X_train_seq_padded,
    y_train,
    validation_data=(X_val_seq_padded, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/5
 - 726s - loss: 0.1157 - acc: 0.9620 - val_loss: 0.3237 - val_acc: 0.8806
Epoch 2/5
 - 546s - loss: 0.0864 - acc: 0.9755 - val_loss: 0.3432 - val_acc: 0.8746
Epoch 3/5
 - 637s - loss: 0.0643 - acc: 0.9839 - val_loss: 0.3884 - val_acc: 0.8690
Epoch 4/5
 - 545s - loss: 0.0440 - acc: 0.9908 - val_loss: 0.3932 - val_acc: 0.8780
Epoch 5/5
 - 571s - loss: 0.0307 - acc: 0.9944 - val_loss: 0.4270 - val_acc: 0.8748


In [20]:
# Deeper variant of the frozen-GloVe model: three Dense(20)+Dropout(0.05)
# stages before the sigmoid output instead of one.
# NOTE: this rebinds `glove_model`.
glove_model = models.Sequential([
    Embedding(NB_WORDS, GLOVE_DIM, embeddings_initializer=Constant(emb_matrix), trainable=False),
    Bidirectional(LSTM(32, return_sequences=True)),
    GlobalMaxPool1D(),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(20, activation="relu"),
    Dropout(0.05),
    Dense(1, activation="sigmoid"),
])
print(glove_model.summary())

# Longer schedule (12 epochs) than the single-head GloVe run.
batch_size = 100
epochs = 12
glove_model.compile(loss='binary_crossentropy', metrics=['accuracy'], optimizer='adam')
fit2 = glove_model.fit(
    X_train_seq_padded,
    y_train,
    validation_data=(X_val_seq_padded, y_val),
    epochs=epochs,
    batch_size=batch_size,
    verbose=2,
)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 300)         1800000   
_________________________________________________________________
bidirectional_4 (Bidirection (None, None, 64)          85248     
_________________________________________________________________
global_max_pooling1d_4 (Glob (None, 64)                0         
_________________________________________________________________
dense_7 (Dense)              (None, 20)                1300      
_________________________________________________________________
dropout_4 (Dropout)          (None, 20)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 20)                420       
_________________________________________________________________
dropout_5 (Dropout)          (None, 20)                0         
__________

In [None]:
def read_files_to_df(path):
    """Read every file matching a glob pattern into a dict keyed by file stem.

    NOTE(review): despite its name this returns a dict, not a DataFrame, and
    it reuses the name of the DataFrame-returning loader defined near the top
    of the notebook. Running this cell rebinds that name, after which the
    data-preparation cell would no longer receive DataFrames. The same logic
    exists further down as ``read_files_to_dict``; this cell is a candidate
    for removal.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to load.

    Returns
    -------
    dict
        Maps each file's base name without extension to the file's contents,
        inserted in sorted path order.
    """
    contents_dict = {}
    # Sorted so the insertion order does not depend on the arbitrary order in
    # which glob.glob() reports matches.
    for file in sorted(glob.glob(path)):
        # `file_id` rather than `id`, which would shadow the builtin.
        file_id = os.path.splitext(os.path.basename(file))[0]
        # Explicit encoding (assumes UTF-8 files -- TODO confirm).
        with open(file, encoding='utf-8') as f:
            contents_dict[file_id] = f.read()
    return contents_dict

In [29]:
# NOTE(review): looks like leftover exploration -- `fit` is the value returned
# by glove_model.fit(...), and this cell recorded no output. Candidate for removal.
fit.on_batch_begin(3)

In [37]:
# Persist the model behind `fft2` (the 300-d trainable-embedding run):
# architecture as JSON, weights as HDF5, both named after `name`.
name = "embed300"
mdl = fft2.model

# architecture -> <name>.json
model_json = mdl.to_json()
with open(name + ".json", "w") as json_file:
    json_file.write(model_json)

# weights -> <name>.h5
mdl.save_weights(name + ".h5")
print("Saved model to disk")

Saved model to disk


In [32]:
# Inspect the model attached to `fit` (the value returned by glove_model.fit).
fit.model

<keras.engine.sequential.Sequential at 0x17e005710>

In [33]:
# Inspect glove_model; the recorded repr shows the same address as `fit.model`,
# i.e. both names referred to one model object when these cells were run.
glove_model

<keras.engine.sequential.Sequential at 0x17e005710>

In [95]:
def read_files_to_dict(path):
    """Read every file matching a glob pattern into a dict keyed by file stem.

    Parameters
    ----------
    path : str
        Glob pattern selecting the files to load (e.g. ``'some/dir/*.txt'``).

    Returns
    -------
    dict
        Maps each file's base name without extension to the file's contents,
        inserted in sorted path order. Empty when nothing matches.
    """
    contents_dict = {}
    # Sorted so the insertion order (which later pairs ids with predictions)
    # does not depend on the arbitrary order in which glob.glob() reports matches.
    for file in sorted(glob.glob(path)):
        # `file_id` rather than `id`, which would shadow the builtin.
        file_id = os.path.splitext(os.path.basename(file))[0]
        # Explicit encoding (assumes UTF-8 files -- TODO confirm).
        with open(file, encoding='utf-8') as f:
            contents_dict[file_id] = f.read()
    return contents_dict

def prepare_test_data(test_data):
    """Turn raw test texts into padded index sequences for the model.

    Each text goes through the same steps as the training data: stopword
    removal, encoding with the fitted tokenizer ``tk``, and padding to
    ``MAX_LEN``.

    Parameters
    ----------
    test_data : dict
        Maps an id to its raw text; only the values are used, in the dict's
        iteration order.

    Returns
    -------
    The result of ``pad_sequences`` for the encoded texts, one row per entry
    of ``test_data`` in iteration order.
    """
    test_data_processed = [remove_stopwords(v) for v in test_data.values()]
    test_seq = tk.texts_to_sequences(test_data_processed)
    # Pad the sequences built just above. This previously padded a global
    # `X_test_seq` that no cell in the notebook defines, so the function either
    # raised NameError or silently used stale kernel state instead of its input.
    return pad_sequences(test_seq, maxlen=MAX_LEN)

def create_submission_file(res_dir, keys, y_results):
    """Write ``submission.csv`` with one ``Id,Category`` row per prediction.

    Parameters
    ----------
    res_dir : str
        Existing directory in which ``submission.csv`` is created
        (overwritten if it already exists).
    keys : iterable
        Ids, in the same order as ``y_results``.
    y_results : iterable
        Predicted categories, paired positionally with ``keys``. If the two
        iterables differ in length, the extra items of the longer one are
        ignored.
    """
    # newline='' is what the csv module documents for files handed to
    # csv.writer; without it, platforms that translate '\n' emit an extra
    # blank line after every row.
    with open(os.path.join(res_dir, 'submission.csv'), 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(['Id', 'Category'])
        csv_writer.writerows(zip(keys, y_results))

In [41]:
# Load the unlabelled test files (id -> text) and encode them like the training data.
test_data = read_files_to_dict('group34/data/test/*.txt')
X_test_seq_padded = prepare_test_data(test_data)

# Score with the model attached to `fit` (the return value of glove_model.fit).
scoring_model = fit.model
y_test_results = scoring_model.predict(X_test_seq_padded)

In [91]:
# Threshold each model output at 0.5: below -> 0, otherwise -> 1.
test_results = [0 if res < 0.5 else 1 for res in y_test_results]

In [99]:
# Write ./submission.csv, pairing each test file id (the keys of test_data)
# positionally with its thresholded prediction.
create_submission_file(".", test_data.keys(), test_results)

In [93]:
# Spot-check one raw (un-thresholded) model output, at index 3.
y_test_results[3]

array([0.00073427], dtype=float32)

In [None]:
# NOTE(review): this cell was never executed (In [None]) and `classifier` is not
# defined anywhere in the visible notebook; running it would also overwrite
# y_test_results. Candidate for removal.
y_test_results = classifier.predict(test_data.values())