**классификатор текстов LSTM на Keras+TensorFlow**

Евгений Борисов <borisov.e@solarl.ru>

In [None]:
# https://habr.com/ru/company/dca/blog/274027/
# http://neuro.compute.dtu.dk/wiki/Sentiment_analysis#Corpora
# http://help.sentiment140.com/for-students/
# http://study.mokoron.com

In [None]:
import numpy as np
import pandas as pd
# Let pandas display up to 200 characters per cell so long texts are not truncated.
pd.options.display.max_colwidth = 200
import re
import gzip

In [None]:
def pp(d):
    """Format a number with no decimals and spaces as thousands separators."""
    grouped = format(d, ",.0f")
    return grouped.replace(",", " ")


def ppr(d):
    """Print the number of items in *d*, formatted with :func:`pp`."""
    print('записей:', pp(len(d)))

---

In [None]:
# Column names assigned to the loaded tweet tables, in file column order.
# NOTE(review): 'tmane' looks like a typo for 'tname'; kept as-is because it
# is a runtime column label.
ff = [
    'id', 'tdate', 'tmane', 'ttext', 'ttype', 'trep',
    'tfav', 'tstcount', 'tfol', 'tfrien', 'listcount', 'unk',
]

In [None]:
# Load the negative tweets. header=None makes pandas treat the first line
# as data, so the column names are assigned explicitly afterwards.
neg = pd.read_csv(
    '../data/text/twit/negative.csv.gz',
    header=None,
    sep=';',
)
ppr(neg)
neg.columns = ff

In [None]:
# Load the positive tweets.
# header=None is required: without it pandas consumes the first record as a
# header row, so one tweet is silently lost before the column names are
# overwritten below.
# NOTE(review): assumes positive.csv.gz has the same header-less layout as
# negative.csv.gz, which is read with header=None — confirm against the file.
pos = pd.read_csv(
    '../data/text/twit/positive.csv.gz',
    sep=';',
    header=None,
)
ppr(pos)
pos.columns = ff

In [None]:
# Stack positive and negative tweets and keep only the columns used below.
data = pd.concat(
    objs=[pos, neg],
    sort=False,
)[['id', 'ttext', 'ttype']]
ppr(data)

In [None]:
# Display ten randomly chosen rows as a sanity check.
data.sample(n=10)

---

In [None]:
# Split each raw tweet on whitespace into a list of non-empty tokens
# (str.split() without arguments already strips and drops empty pieces).
data['ttext_clean'] = data['ttext'].apply(lambda text: text.split())

In [None]:
# Replace tokens that start with "http" by the placeholder " url ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r'^http.*', ' url ', word.strip()) for word in words]
)

In [None]:
# Replace smiley emoticons (':' or ';', optional dashes, then ')' or 'D')
# by the placeholder " happysmile ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r'[:;]-*[)D]', ' happysmile ', word.strip()) for word in words]
)

In [None]:
# Replace runs of two or more ')' by the placeholder " happysmile ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r'\)\)\)*', ' happysmile ', word.strip()) for word in words]
)

In [None]:
# Replace ":*" / ";*" emoticons by the placeholder " kisssmile ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r'[:;]\*', ' kisssmile ', word.strip()) for word in words]
)

In [None]:
# Replace the ":(" emoticon by the placeholder " sadsmile ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r':\(', ' sadsmile ', word.strip()) for word in words]
)

In [None]:
# Replace runs of two or more '(' by the placeholder " sadsmile ".
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [re.sub(r'\(\(\(*', ' sadsmile ', word.strip()) for word in words]
)

In [None]:
# Glue the tokens of every tweet back into one space-separated string.
data['ttext_clean'] = list(map(' '.join, data['ttext_clean']))

In [None]:
def _normalize_text(text):
    """Blank out non-word characters and underscores, then mask standalone numbers."""
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'_', ' ', text)
    return re.sub(r'\b\d+\b', ' digit ', text)


# Lower-case first, then apply the character-level normalization per tweet.
data['ttext_clean'] = data['ttext_clean'].str.lower().apply(_normalize_text)


In [None]:
# Split the normalized strings back into lists of non-empty tokens.
data['ttext_clean'] = data['ttext_clean'].apply(lambda text: text.split())

In [None]:
# Drop (not replace) every token matched by the pattern below, i.e. tokens
# that still contain at least one digit — leftover alphanumeric codes.
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [word for word in words if re.match(r'\b.*\d+.*\b', word) is None]
)

In [None]:
# data[['ttext_clean']]
# data[['ttext']]

---

In [None]:
# Read the whitespace-separated stop-word list from the gzipped text file.
# str.split() without arguments already yields stripped, non-empty words.
with gzip.open('../data/text/stop-nltk.txt.gz', mode='rt', encoding='utf-8') as f:
    stopwords = set(f.read().split())
ppr(stopwords)

In [None]:
# Remove stop words from every token list.
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda words: [word for word in words if word not in stopwords]
)

In [None]:
# IPython magic (not plain Python): delete the stop-word set, which is not
# used again in the visible cells, and clear IPython's own references to it.
%xdel stopwords

In [None]:
# Show the tweets left with fewer than two tokens after cleaning.
data[data['ttext_clean'].str.len().lt(2)]

In [None]:
# # удаление отдельных символов
# lot['text_clean'] = lot['text_clean'].apply(lambda t:[w for w in t if len(w)>1])

# # удаляем лоты без описания 
# ppr(lot)
# lot = lot[ lot['text_clean'].str.len()>1 ].reset_index(drop=True)
# ppr(lot)

# lot[['text_clean']].sample(3)
# # lot.sample(2)

---

In [None]:
import tensorflow as tf
from tensorflow.python.client import device_lib

import keras 

import matplotlib.pyplot as plt
# import os

In [None]:
# Report the TensorFlow and Keras versions, one per line.
print(tf.__version__, keras.__version__, sep='\n')

In [None]:
from keras.preprocessing import sequence
from keras.utils import np_utils
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

In [None]:
# Hyper-parameters.
max_features = 100000  # first argument of the Embedding layer
maxlen = 100           # input_length of the Embedding layer
batch_size = 32        # mini-batch size passed to model.fit

# Embedding -> two stacked LSTM layers -> dropout -> single sigmoid output.
# The first LSTM returns the full sequence so the second LSTM can consume it.
model = Sequential([
    Embedding(max_features, 128, input_length=maxlen),
    LSTM(64, return_sequences=True),
    LSTM(64),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

In [None]:
# Configure training: binary cross-entropy loss with the Adam optimizer.
# `class_mode` is a pre-1.0 Keras argument; Keras 2 forwards unknown compile
# kwargs to the backend function, where the TensorFlow backend rejects them.
# Accuracy reporting is requested through `metrics` instead.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [None]:
# Train for a single epoch.
# `epochs` replaces the deprecated `nb_epoch`. `show_accuracy` is dropped
# because Keras 2 `fit` raises on unrecognized keyword arguments; accuracy is
# reported only when the model is compiled with metrics=['accuracy'].
# NOTE(review): X_train / y_train are not defined in the visible cells — the
# tokenization / padding step must be run before this cell.
model.fit(
    X_train, y_train,
    batch_size=batch_size,
    epochs=1,
)

In [None]:
# Predict class probabilities with the model.
# NOTE(review): X is not defined in the visible cells — confirm where the
# input matrix is built.
result = model.predict_proba(X)

---

In [None]:
# Print the list of local devices reported by TensorFlow's device_lib.
print(device_lib.list_local_devices())

In [None]:
# Print the GPU device name reported by TensorFlow
# (presumably an empty string when no GPU is available — see the TF docs).
dev = tf.test.gpu_device_name()
print('Default GPU Device:',dev)

In [None]:
# sess = tf.Session(config=tf.ConfigProto(log_device_placement=True))
# # sess.list_devices()

In [None]:
# Print whether the installed TensorFlow build was compiled with CUDA support.
print(tf.test.is_built_with_cuda())