**классификатор текстов LSTM на Keras+TensorFlow**

Евгений Борисов <borisov.e@solarl.ru>

In [None]:
# https://habr.com/ru/company/dca/blog/274027/
# http://neuro.compute.dtu.dk/wiki/Sentiment_analysis#Corpora
# http://help.sentiment140.com/for-students/
# http://study.mokoron.com

## Библиотеки

In [None]:
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 200  
import re
import gzip

In [None]:
def pp(d):
    """Format a number without decimals, using a space as the thousands separator."""
    grouped = format(d, ",.0f")
    return grouped.replace(",", " ")


def ppr(d):
    """Print the number of records in ``d`` (anything supporting ``len``)."""
    count = len(d)
    print('записей:', pp(count))

## Данные

In [None]:
# Column names assigned to the tweet frames loaded below (positional order).
# NOTE(review): 'tmane' looks like a typo of 'tname'; kept as-is because it is the
# actual column name used at runtime.
ff = [
    'id', 'tdate', 'tmane', 'ttext',
    'ttype', 'trep', 'tfav', 'tstcount',
    'tfol', 'tfrien', 'listcount', 'unk',
]

In [None]:
# Negative tweets; header=None makes pandas treat the first line as data.
neg = pd.read_csv(
    '../data/text/twit/negative.csv.gz',
    header=None,
    sep=';',
)
ppr(neg)
neg.columns = ff  # attach the column names

In [None]:
# Positive tweets. header=None is needed here too: without it pandas uses the first
# line of the file as a header row, so the first tweet is silently dropped before the
# column names are overwritten below.
# (The negative file is read with header=None and gets the same column names, so this
# file is presumably header-less as well -- confirm against the data.)
pos = pd.read_csv(
    '../data/text/twit/positive.csv.gz',
    sep=';',
    header=None,
)
ppr(pos)
pos.columns = ff

In [None]:
# Stack both classes and keep only the id, text and label columns.
data = pd.concat([pos, neg], sort=False)
data = data[['id', 'ttext', 'ttype']]
ppr(data)

In [None]:
data.sample(10)

## очистка данных

In [None]:
# Split each tweet on whitespace; str.split() without arguments already drops
# empty pieces and surrounding whitespace.
data['ttext_clean'] = data['ttext'].apply(lambda t: t.split())

In [None]:
# Token-level replacements, applied in this order to every token of every tweet.
# Each entry is (compiled pattern, replacement). The replacements are padded with
# spaces so that they separate from neighbouring characters when the text is
# re-tokenised later.
TOKEN_RULES = [
    (re.compile(r'^http.*'), ' url '),            # links
    (re.compile(r'[:;]-*[)D]'), ' happysmile '),  # :) ;) :-) :D ...
    (re.compile(r'\)\)\)*'), ' happysmile '),     # two or more ')'
    (re.compile(r'[:;]\*'), ' kisssmile '),       # :* ;*
    (re.compile(r':\('), ' sadsmile '),           # :(
    (re.compile(r'\(\(\(*'), ' sadsmile '),       # two or more '('
]


def replace_tokens(tokens, rules=TOKEN_RULES):
    """Apply every (pattern, replacement) rule, in order, to each token.

    Each token is stripped before every rule is applied, which matches running
    one full pass over the data per rule.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single tweet.
    rules : iterable of (compiled regex, str)
        Replacement rules, applied in the given order.

    Returns
    -------
    list of str
        New list with the same number of tokens.
    """
    replaced = []
    for w in tokens:
        for pattern, repl in rules:
            w = pattern.sub(repl, w.strip())
        replaced.append(w)
    return replaced


data['ttext_clean'] = data['ttext_clean'].apply(replace_tokens)

In [None]:
# Glue the tokens of every tweet back into one space-separated string.
data['ttext_clean'] = data['ttext_clean'].map(' '.join)

In [None]:
def normalize_text(s):
    """Lower-case ``s``, blank out non-word characters and underscores, mark numbers."""
    s = s.lower()
    s = re.sub(r'\W', ' ', s)   # anything that is not a letter, digit or underscore
    s = re.sub(r'_', ' ', s)    # underscores count as word characters, handle them separately
    return re.sub(r'\b\d+\b', ' digit ', s)  # stand-alone numbers


data['ttext_clean'] = data['ttext_clean'].apply(normalize_text)


In [None]:
# Split the cleaned text into tokens again; str.split() without arguments
# never yields empty or whitespace-padded pieces.
data['ttext_clean'] = data['ttext_clean'].apply(lambda t: t.split())

In [None]:
# Remove tokens matched by the pattern below, i.e. tokens that contain a digit
# (alphanumeric codes). The tokens are dropped, not replaced.
has_digit = re.compile(r'\b.*\d+.*\b')
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: [w for w in t if has_digit.match(w) is None]
)

In [None]:
# data[['ttext_clean']]
# data[['ttext']]

---

In [None]:
# Load the stop words: whitespace-separated entries in a gzipped UTF-8 text file.
with gzip.open('../data/text/stop-nltk.txt.gz', 'rt', encoding='utf-8') as f:
    stopwords = set(f.read().split())
ppr(stopwords)

In [None]:
# Drop stop words from every token list.
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: list(filter(lambda w: w not in stopwords, t))
)

In [None]:
%xdel stopwords

In [None]:
%%time 

from Stemmer import Stemmer
# pacman -S python-pystemmer
# pip install pystemmer

# стемминг, выделение основы слова
data['ttext_clean'] = data['ttext_clean'].apply( lambda t:Stemmer('russian').stemWords(t) )

In [None]:
# Drop short tokens: keep only tokens of three or more characters.
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: [w for w in t if len(w) >= 3]
)

---

In [None]:
# data[ data['ttext_clean'].str.len()<1 ][['ttext_clean']]

In [None]:
# Report the row count before and after removing tweets whose token list is empty.
ppr(data)
non_empty = data['ttext_clean'].str.len() > 0
data = data[non_empty].reset_index(drop=True)
ppr(data)

In [None]:
data.sample(3)

## строим датасет

In [None]:
# Vocabulary: three service tokens first, then every distinct non-empty token, sorted.
special_tokens = ['<PAD>', '<START>', '<UNK>']
distinct_words = {w for t in data['ttext_clean'] for w in t if w}
vocab = special_tokens + sorted(distinct_words)
ppr(vocab)

In [None]:
# Turn the vocabulary list into a token -> integer id mapping (id = position in the list).
vocab = dict(zip(vocab, range(len(vocab))))

---

In [None]:
# Append the <START> marker to every token list. The sequences are reversed in a
# later cell, so the marker ends up in front of the tokens.
# `series + ['<START>']` is not used: pandas treats the right-hand list as an array to
# combine element-wise with the rows, not as a suffix to add to each row's list.
data['ttext_clean'] = data['ttext_clean'].apply(lambda t: t + ['<START>'])

In [None]:
# Length of the longest token list; every sequence is padded to this length below.
token_counts = data['ttext_clean'].str.len()
n_max = token_counts.max()
n_max

In [None]:
# Template of n_max padding tokens; slices of it are prepended to shorter sequences.
# Keep the list on the left of `*`: n_max is presumably a NumPy integer (result of
# Series.max()), and with the operands swapped NumPy would handle the multiplication.
pad = ['<PAD>']*n_max

In [None]:
data['ttext_clean']

In [None]:
# Left-pad every sequence to n_max tokens and reverse the token order.
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: pad[len(t):] + t[::-1]
)

In [None]:
data['ttext_clean']

In [None]:
# Replace every token with its integer id from the vocabulary mapping.
data['ttext_code'] = data['ttext_clean'].apply(
    lambda t: list(map(vocab.__getitem__, t))
)

In [None]:
data['ttext_code'].values

In [None]:
len(data)//32

In [None]:
ppr(data)
# Keep a random subset whose size is a whole number of batches of 32 (the batch size
# used for training below, where the model is built with a fixed batch dimension).
# The size is computed from the data instead of a hard-coded row count, so the cell
# keeps working when the number of rows changes.
n_keep = 32 * (len(data) // 32)
data = data.sample(n_keep).reset_index(drop=True)
ppr(data)


---

In [None]:
# Stack the per-tweet id lists into one 2-D array (rows = tweets, columns = positions).
# The values are vocabulary indices, so they are stored as integers rather than floats.
X = np.stack(data['ttext_code'].values).astype(np.int32)
X.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the class label column.
# .toarray() returns a plain ndarray; .todense() would return np.matrix, which NumPy
# discourages and which behaves differently from ndarray in indexing and arithmetic.
y = data['ttype'].values.reshape(-1, 1)
y = OneHotEncoder(categories='auto').fit_transform(y).toarray().astype(np.float32)
y.shape


In [None]:
# Persist the prepared arrays so the model section can start from them.
np.save(file='X.npy', arr=X)
np.save(file='y.npy', arr=y)

In [2]:
import numpy as np

# Reload the prepared arrays; this cell is the entry point when starting from the saved files.
X = np.load('X.npy')
y = np.load('y.npy')
# Token ids start at 0, so the number of ids an embedding table has to cover is
# the largest id plus one (using the bare maximum would be one row short).
vocab_size = int(X.max()) + 1

X.shape , y.shape, vocab_size

((226816, 31), (226816, 2))

## строим нейросеть 

In [9]:
# import numpy as np

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense

In [10]:
# n=226826
# for i in range(1,n//2):
#     if n%i==0: print(i)
# # 23
# # 46
# # 4931
# # 9862

In [12]:
# Shapes are taken from the loaded arrays.
time_steps = X.shape[1]      # sequence length (columns of X)
batch_size = 32
num_classes = y.shape[1]     # width of the one-hot label matrix

# Derive the vocabulary size from X (ids are 0-based, hence +1) instead of len(vocab):
# the `vocab` mapping only exists if the preprocessing cells ran in the same kernel,
# and is not available when starting from the saved X.npy / y.npy.
vocab_size = int(X.max()) + 1

In [13]:
embedding_size = 64

# Embedding -> LSTM -> softmax classifier, declared as a list of layers.
# NOTE(review): batch_input_shape fixes the batch dimension to batch_size; with
# stateful=False this is probably not required -- confirm before relaxing it.
model = Sequential([
    Embedding(
        input_dim=vocab_size,        # number of distinct token ids
        output_dim=embedding_size,   # size of the embedded vectors
        input_length=time_steps,
        batch_input_shape=(batch_size, time_steps),
    ),
    LSTM(32, return_sequences=False, stateful=False),  # only the last output is kept
    Dense(num_classes, activation='softmax'),
])

Instructions for updating:
Colocations handled automatically by placer.


In [14]:
# Training setup: RMSprop optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [15]:
# Train for 10 epochs with the batch size defined above.
model.fit(
    X,
    y,
    epochs=10,
    batch_size=batch_size,
)


Instructions for updating:
Use tf.cast instead.
Epoch 1/10


ResourceExhaustedError: OOM when allocating tensor with shape[32,2] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node dense_1/random_uniform/RandomUniform (defined at /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4049) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.


Caused by op 'dense_1/random_uniform/RandomUniform', defined at:
  File "/usr/lib/python3.7/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/lib/python3.7/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/usr/lib/python3.7/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/usr/lib/python3.7/site-packages/ipykernel/kernelapp.py", line 497, in start
    self.io_loop.start()
  File "/usr/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 132, in start
    self.asyncio_loop.run_forever()
  File "/usr/lib/python3.7/asyncio/base_events.py", line 539, in run_forever
    self._run_once()
  File "/usr/lib/python3.7/asyncio/base_events.py", line 1775, in _run_once
    handle._run()
  File "/usr/lib/python3.7/asyncio/events.py", line 88, in _run
    self._context.run(self._callback, *self._args)
  File "/usr/lib/python3.7/site-packages/tornado/platform/asyncio.py", line 122, in _handle_events
    handler_func(fileobj, events)
  File "/usr/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 456, in _handle_events
    self._handle_recv()
  File "/usr/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 486, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/lib/python3.7/site-packages/zmq/eventloop/zmqstream.py", line 438, in _run_callback
    callback(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/tornado/stack_context.py", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/lib/python3.7/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/lib/python3.7/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/lib/python3.7/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2843, in run_cell
    raw_cell, store_history, silent, shell_futures)
  File "/usr/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 2869, in _run_cell
    return runner(coro)
  File "/usr/lib/python3.7/site-packages/IPython/core/async_helpers.py", line 67, in _pseudo_sync_runner
    coro.send(None)
  File "/usr/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3044, in run_cell_async
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3215, in run_ast_nodes
    if (yield from self.run_code(code, result)):
  File "/usr/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3291, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-13-9603c460f128>", line 18, in <module>
    model.add(Dense(num_classes, activation='softmax'))
  File "/usr/lib/python3.7/site-packages/keras/engine/sequential.py", line 185, in add
    output_tensor = layer(self.outputs[0])
  File "/usr/lib/python3.7/site-packages/keras/engine/base_layer.py", line 431, in __call__
    self.build(unpack_singleton(input_shapes))
  File "/usr/lib/python3.7/site-packages/keras/layers/core.py", line 861, in build
    constraint=self.kernel_constraint)
  File "/usr/lib/python3.7/site-packages/keras/legacy/interfaces.py", line 91, in wrapper
    return func(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/keras/engine/base_layer.py", line 249, in add_weight
    weight = K.variable(initializer(shape),
  File "/usr/lib/python3.7/site-packages/keras/initializers.py", line 218, in __call__
    dtype=dtype, seed=self.seed)
  File "/usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py", line 4049, in random_uniform
    dtype=dtype, seed=seed)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/ops/random_ops.py", line 247, in random_uniform
    rnd = gen_random_ops.random_uniform(shape, dtype, seed=seed1, seed2=seed2)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/ops/gen_random_ops.py", line 777, in random_uniform
    name=name)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/framework/op_def_library.py", line 788, in _apply_op_helper
    op_def=op_def)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/util/deprecation.py", line 507, in new_func
    return func(*args, **kwargs)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 3300, in create_op
    op_def=op_def)
  File "/usr/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1801, in __init__
    self._traceback = tf_stack.extract_stack()

ResourceExhaustedError (see above for traceback): OOM when allocating tensor with shape[32,2] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node dense_1/random_uniform/RandomUniform (defined at /usr/lib/python3.7/site-packages/keras/backend/tensorflow_backend.py:4049) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info.



---

In [None]:
# from keras.preprocessing import sequence
# from keras.utils import np_utils
# from keras.models import Sequential
# from keras.layers.core import Dense, Dropout, Activation
# from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import LSTM

In [None]:
# max_features = 100000
# maxlen = X.shape[0]
# # batch_size = 32

# model = Sequential()
# model.add(Embedding(max_features, 128, input_length=maxlen))
# # model.add(LSTM(64, return_sequences=True))
# model.add(LSTM(64))
# # model.add(Dropout(0.5))
# model.add(Dense(2))
# model.add(Activation('sigmoid'))

In [None]:
# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               class_mode="binary")

In [None]:
# model.fit(
#     X, y, 
#     batch_size=batch_size, 
#     nb_epoch=1 # , show_accuracy=True
# )

In [None]:
# result = model.predict_proba(X)

---

In [None]:
import numpy as np

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [None]:
# Dimensions of the synthetic example.
data_dim = 16      # size of the vector at every time step
timesteps = 8      # sequence length
num_classes = 2

num_ex = 1000      # number of examples

# Random inputs with axes [ example, sequence element, vector ].
# The labels are generated (and one-hot encoded) in the next cell; the extra
# y_train draw that used to live here was overwritten there and has been removed.
x_train = np.random.random((num_ex, timesteps, data_dim))

x_train.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Random labels drawn from {1, 2}, then one-hot encoded.
# .toarray() yields a plain ndarray; .todense() would yield np.matrix, which NumPy
# discourages for new code.
y_train = np.random.randint(1, 3, num_ex)
y_train = OneHotEncoder(categories='auto').fit_transform(y_train.reshape(-1, 1)).toarray()
y_train.shape

In [None]:
# # expected input data shape: (batch_size, timesteps, data_dim)
# model = Sequential()

# # returns a sequence of vectors of dimension 32
# model.add(LSTM(32,return_sequences=True,input_shape=(timesteps, data_dim)))  

# # returns a sequence of vectors of dimension 32
# model.add(LSTM(32,return_sequences=True))  

# model.add(LSTM(32))  # return a single vector of dimension 32

# model.add(Dense(num_classes, activation='softmax'))

In [None]:
# Expected input data shape: (batch_size, timesteps, data_dim).
model = Sequential([
    # Single LSTM layer with 32 units; return_sequences is left at its default,
    # so it emits one 32-dimensional vector per example, not a sequence.
    LSTM(32, input_shape=(timesteps, data_dim)),
    # Softmax over the classes.
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Training setup: RMSprop optimizer, categorical cross-entropy loss, accuracy metric.
model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Fit on the synthetic data; no validation set is passed.
model.fit(
    x_train,
    y_train,
    epochs=115,
    batch_size=64,
    # validation_data=(x_val, y_val)
)