In [1]:
import re
from tqdm import tqdm
from sklearn.utils import shuffle
import numpy as np
from tqdm import tqdm
import bz2
from keras.layers import *
from keras.models import Model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
def splitReviewsLabels(lines):
    reviews = []
    labels = []
    for review in tqdm(lines):
        rev = reviewToX(review)
        label = reviewToY(review)
        reviews.append(rev[:512])
        labels.append(label)
    return reviews, labels

In [3]:
def reviewToY(review):
    return [1,0] if review.split(' ')[0] == '__label__1' else [0,1] 

def reviewToX(review):
    review = review.split(' ', 1)[1][:-1].lower()
    review = re.sub('\d','0',review)
    if 'www.' in review or 'http:' in review or 'https:' in review or '.com' in review:
        review = re.sub(r"([^ ]+(?<=\.[a-z]{3}))", "<url>", review)
    return review

In [5]:
train_file = bz2.BZ2File('./train.ft.txt.bz2')
test_file = bz2.BZ2File('./test.ft.txt.bz2')

train_lines = train_file.readlines()
test_lines = test_file.readlines()

train_lines = [x.decode('utf-8') for x in train_lines]
test_lines = [x.decode('utf-8') for x in test_lines]

In [6]:
# Load from the file
reviews_train, y_train = splitReviewsLabels(train_lines)
reviews_test, y_test = splitReviewsLabels(test_lines)

reviews_train, y_train = shuffle(reviews_train, y_train)
reviews_test, y_test = shuffle(reviews_test, y_test)

y_train = np.array(y_train)
y_test = np.array(y_test)

100%|██████████████████████████████████████████████████████████████████| 3600000/3600000 [00:53<00:00, 67903.99it/s]
100%|████████████████████████████████████████████████████████████████████| 400000/400000 [00:05<00:00, 67042.77it/s]


In [7]:
max_features = 8192
maxlen = 128
embed_size = 64

In [8]:
tokenizer = Tokenizer(num_words=max_features)
tokenizer.fit_on_texts(reviews_train)

token_train = tokenizer.texts_to_sequences(reviews_train)
token_test = tokenizer.texts_to_sequences(reviews_test)

x_train = pad_sequences(token_train, maxlen=maxlen, padding='post')
x_test = pad_sequences(token_test, maxlen=maxlen, padding='post')

In [9]:
input = Input(shape=(maxlen,))
net = Embedding(max_features, embed_size)(input)
net = Dropout(0.2)(net)
net = BatchNormalization()(net)

net = Conv1D(32, 7, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net = BatchNormalization()(net)
net = Conv1D(32, 3, padding='same', activation='relu')(net)
net1 = BatchNormalization()(net)

net = Conv1D(2, 1)(net)
net = GlobalAveragePooling1D()(net)
output = Activation('softmax')(net)
model = Model(inputs = input, outputs = output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
model.summary()

W0715 10:58:26.270829  3636 deprecation_wrapper.py:119] From c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0715 10:58:41.826187  3636 deprecation_wrapper.py:119] From c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0715 10:58:47.427420  3636 deprecation_wrapper.py:119] From c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0715 10:58:51.917491  3636 deprecation_wrapper.py:119] From c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py:133: The name tf.placeholder_with_default is deprecated. Pleas

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 128)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 128, 64)           524288    
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 64)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 128, 64)           256       
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 32)           14368     
_________________________________________________________________
batch_normalization_2 (Batch (None, 128, 32)           128       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 128, 32)           3104      
__________

In [10]:
model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)

W0715 10:58:58.919482  3636 deprecation.py:323] From c:\users\0\appdata\local\programs\python\python37\lib\site-packages\tensorflow\python\ops\math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Train on 3240000 samples, validate on 360000 samples
Epoch 1/5
 110592/3240000 [>.............................] - ETA: 17:43:28 - loss: 0.7099 - acc: 0.491 - ETA: 9:40:10 - loss: 0.7014 - acc: 0.499 - ETA: 6:53:53 - loss: 0.6951 - acc: 0.52 - ETA: 5:29:18 - loss: 0.6928 - acc: 0.53 - ETA: 4:38:02 - loss: 0.6915 - acc: 0.53 - ETA: 4:03:29 - loss: 0.6893 - acc: 0.54 - ETA: 3:39:08 - loss: 0.6868 - acc: 0.54 - ETA: 3:21:14 - loss: 0.6839 - acc: 0.55 - ETA: 3:06:51 - loss: 0.6807 - acc: 0.56 - ETA: 2:55:20 - loss: 0.6772 - acc: 0.58 - ETA: 2:45:52 - loss: 0.6725 - acc: 0.59 - ETA: 2:38:00 - loss: 0.6679 - acc: 0.60 - ETA: 2:31:10 - loss: 0.6629 - acc: 0.61 - ETA: 2:25:24 - loss: 0.6567 - acc: 0.62 - ETA: 2:20:23 - loss: 0.6508 - acc: 0.63 - ETA: 2:15:57 - loss: 0.6434 - acc: 0.63 - ETA: 2:12:04 - loss: 0.6366 - acc: 0.64 - ETA: 2:08:37 - loss: 0.6295 - acc: 0.65 - ETA: 2:05:25 - loss: 0.6216 - acc: 0.66 - ETA: 2:02:36 - loss: 0.6132 - acc: 0.66 - ETA: 2:00:02 - loss: 0.6042 - acc: 0.67 - E

E0715 11:02:22.171533  3636 ultratb.py:155] Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "c:\users\0\appdata\local\programs\python\python37\lib\site-packages\IPython\core\interactiveshell.py", line 3325, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-10-5e422a6ae3e6>", line 1, in <module>
    model.fit(x_train, y_train, batch_size=2048, epochs=5, validation_split=0.1)
  File "c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\engine\training.py", line 1039, in fit
    validation_steps=validation_steps)
  File "c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\engine\training_arrays.py", line 199, in fit_loop
    outs = f(ins_batch)
  File "c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py", line 2715, in __call__
    return self._call(inputs)
  File "c:\users\0\appdata\local\programs\python\python37\lib\site-packages\keras\backend\tensorflow_backend.py", line 2675, in _call
    fetched = sel

KeyboardInterrupt: 

In [None]:
model.evaluate(x_test, y_test)