**классификатор текстов LSTM на Keras+TensorFlow**

Евгений Борисов <borisov.e@solarl.ru>

In [1]:
# https://habr.com/ru/company/dca/blog/274027/
# http://neuro.compute.dtu.dk/wiki/Sentiment_analysis#Corpora
# http://help.sentiment140.com/for-students/
# http://study.mokoron.com

## Библиотеки

In [2]:
import numpy as np
import pandas as pd
pd.options.display.max_colwidth = 200  
import re
import gzip

In [3]:
def pp(d): return "{:,.0f}".format(d).replace(",", " ")
def ppr(d): print('записей:', pp(len(d)) )  

## Данные

In [4]:
ff = ['id', 'tdate', 'tmane', 'ttext', 'ttype', 'trep', 'tfav', 'tstcount', 'tfol', 'tfrien', 'listcount','unk']

In [5]:
neg = pd.read_csv('../data/text/twit/negative.csv.gz',sep=';',header=None)
ppr(neg)
neg.columns = ff

записей: 111 923


In [6]:
pos = pd.read_csv('../data/text/twit/positive.csv.gz',sep=';')
ppr(pos)
pos.columns = ff

записей: 114 910


In [7]:
data = pd.concat([pos,neg],sort=False)[['id','ttext', 'ttype']]
ppr(data)

записей: 226 833


In [8]:
data.sample(10)

Unnamed: 0,id,ttext,ttype
7075,409092007999860736,"#С_ДНЁМ_РОЖДЕНИЯ! @TheFuckDePolice \nСчастья тебе, удачи, везения, любви, и побольше незабываемых впечатлений в жизни!) #МираДобраКотяток!))",1
94203,422403742521372672,@cherry96moon тебя мне не хватало на хоре 2 года. Не с кем поржать было:(,-1
17610,409372604185972736,"RT @AnnaUglova: @Skinny_Ger вот вот, всё возможно)) так что если всё съедите, как в тагане киндеры",1
19903,409409077672828928,Присоединяйтесь :) \n#анекдот \n#ВзаимныйФолловинг \n#ЧитаюВзаимно \n#rufollowback \n#followback,1
26705,409580250570719233,"RT @LanaRey_: @Heilig_99 я замечаю, но молчу :D \nи вообще,я соскучился :с",1
98645,411055804306161664,"RT @Shkolina_1997: Скучновато было, но мне нравки;) http://t.co/eruVPmfDEl",1
95265,411011432172769280,"Обзавёлся жёлтым хромом... Просто так, на поглазеть...)",1
109685,424590900770394112,"""@DaryMalyavskaya: Что? Диета? А давай купим тебе пол кило мороженого?"" Знакомо. Папа столько всего накупил:(",-1
34779,409769560255250432,"RT @vitalich_ololo: Кстати, мне сегодня гадали по руке) судя по ней, Господь нервничал, когда зачали Виталича)",1
37511,409847655280287744,"А сейчас, джентльмены, перейдем к народному пению. :-D #ff",1


## очистка данных

In [9]:
data['ttext_clean'] = data['ttext'].apply(lambda t:[ w.strip() for w in t.split() if w.strip() ] )

In [10]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r'^http.*',' url ', w.strip() ) for w in t  ]
  )

In [11]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r'[:;]-*[)D]',' happysmile ', w.strip() )for w in t ]
  )

In [12]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r'\)\)\)*',' happysmile ', w.strip() ) for w in t ]
  )

In [13]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r'[:;]\*',' kisssmile ', w.strip() ) for w in t ]
  )

In [14]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r':\(',' sadsmile ', w.strip() ) for w in t ]
  )

In [15]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t:[ re.sub(r'\(\(\(*',' sadsmile ', w.strip() ) for w in t ]
  )

In [16]:
data['ttext_clean'] = [ ' '.join(s) for s in data['ttext_clean'] ]

In [17]:
data['ttext_clean'] = data['ttext_clean'].str.lower()
data['ttext_clean'] = data['ttext_clean'].apply(lambda s: re.sub( r'\W', ' ', s))
data['ttext_clean'] = data['ttext_clean'].apply(lambda s: re.sub( r'_', ' ', s))
data['ttext_clean'] = data['ttext_clean'].apply(lambda s: re.sub( r'\b\d+\b', ' digit ', s)) 


In [18]:
data['ttext_clean'] = data['ttext_clean'].apply(lambda t:[ w.strip() for w in t.split() if w.strip() ] )

In [19]:
# замена буквенно-цифровых кодов
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: [w for w in t if not re.match( r'\b.*\d+.*\b', w) ]
)

In [20]:
# data[['ttext_clean']]
# data[['ttext']]

---

In [21]:
with gzip.open('../data/text/stop-nltk.txt.gz','rt',encoding='utf-8') as f: 
    stopwords = set([ w.strip() for w in  f.read().split() if w.strip() ] )
ppr(stopwords)

записей: 151


In [22]:
# удаление лишних слов
data['ttext_clean'] = data['ttext_clean'].apply(lambda t:[w for w in t if w not in stopwords])

In [23]:
%xdel stopwords

In [24]:
# %%time 

# from Stemmer import Stemmer
# # pacman -S python-pystemmer
# # pip install pystemmer

# # стемминг, выделение основы слова
# data['ttext_clean'] = data['ttext_clean'].apply( lambda t:Stemmer('russian').stemWords(t) )

In [25]:
# удаление коротких слов
data['ttext_clean'] = data['ttext_clean'].apply(lambda t:[w for w in t if len(w)>2])

---

In [26]:
# data[ data['ttext_clean'].str.len()<1 ][['ttext_clean']]

In [27]:
ppr(data)
data = data[ data['ttext_clean'].str.len()>0 ].reset_index(drop=True) 
ppr(data)

записей: 226 833
записей: 226 826


In [28]:
data.sample(3)

Unnamed: 0,id,ttext,ttype,ttext_clean
121763,409988515750232065,"Как всегда, моего самого сильного программиста нет под рукой, когда он так нужен... Ууу :(",-1,"[моего, самого, сильного, программиста, рукой, нужен, ууу, sadsmile]"
45615,409987749975183360,RT @ponaditazej: Какая попаболь чувствуется в этом тексте :) А как красиво сформирован несвязный поток мыслей…,1,"[ponaditazej, попаболь, чувствуется, тексте, happysmile, красиво, сформирован, несвязный, поток, мыслей]"
70515,410469301649285121,@AIrzhanov атрофларига кометаларимни сочиб ташлайман)))),1,"[airzhanov, атрофларига, кометаларимни, сочиб, ташлайман, happysmile]"


## строим датасет

In [None]:
vocab = ['<PAD>','<START>','<UNK>'] + sorted(set([ w for t in data['ttext_clean'] for w in t if w ]))
ppr(vocab)

In [None]:
# %%time

# from gensim.models.word2vec import Word2Vec

# w2v = Word2Vec( common_texts, min_count=1, size=256, window=4, workers=4)

# # with open('result/Word2Vec.pkl', 'wb') as f: pickle.dump(w2v, f)

In [None]:
vocab = { w:n for n,w in enumerate(vocab) }

---

In [None]:
data['ttext_clean'] = data['ttext_clean'] + ['<START>']

In [None]:
n_max = data['ttext_clean'].str.len().max()
n_max

In [None]:
pad = ['<PAD>']*n_max

In [None]:
data['ttext_clean']

In [None]:
data['ttext_clean'] = data['ttext_clean'].apply(
    lambda t: pad[len(t):] + list(reversed(t)) 
  )

In [None]:
data['ttext_clean']

In [None]:
data['ttext_code'] = data['ttext_clean'].apply(lambda t: [ vocab[w] for w in t ] )

In [None]:
data['ttext_code'].values

In [None]:
len(data)//32

In [None]:
ppr(data)
data = data.sample(32*7088).reset_index(drop=True)
ppr(data)


---

In [None]:
X = np.stack( data['ttext_code'].values).astype(np.float32 ) # , axis=-1)
X.shape

In [None]:
from sklearn.preprocessing import OneHotEncoder

y = data['ttype'].values
y = OneHotEncoder(categories='auto').fit_transform(y.reshape(-1,1) ).todense().astype(np.float32)
y.shape


In [None]:
np.save('X.npy',X)
np.save('y.npy',y)

In [None]:
import numpy as np

X = np.load('X.npy')
y = np.load('y.npy')
vocab_size = int(X.max())

X.shape , y.shape, vocab_size

## строим нейросеть 

In [None]:
# import numpy as np

from keras.models import Sequential
from keras.layers import Embedding
from keras.layers import LSTM
from keras.layers import Dense

In [None]:
# n=226826
# for i in range(1,n//2):
#     if n%i==0: print(i)
# # 23
# # 46
# # 4931
# # 9862

In [None]:
time_steps=X.shape[1]
batch_size=32
num_classes=y.shape[1]

vocab_size = len(vocab)

In [None]:
embedding_size=64

model = Sequential()

model.add(Embedding(
       input_dim=vocab_size, # e.g, 10 if you have 10 words in your vocabulary
       output_dim=embedding_size, # size of the embedded vectors
       input_length=time_steps,
       batch_input_shape=(batch_size,time_steps)
    ))

model.add(LSTM(
       32, 
       return_sequences=False, 
       stateful=False)
    )

model.add(Dense(num_classes, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
model.fit(X,y, batch_size=batch_size, epochs=10, )


---

In [None]:
# from keras.preprocessing import sequence
# from keras.utils import np_utils
# from keras.models import Sequential
# from keras.layers.core import Dense, Dropout, Activation
# from keras.layers.embeddings import Embedding
# from keras.layers.recurrent import LSTM

In [None]:
# max_features = 100000
# maxlen = X.shape[0]
# # batch_size = 32

# model = Sequential()
# model.add(Embedding(max_features, 128, input_length=maxlen))
# # model.add(LSTM(64, return_sequences=True))
# model.add(LSTM(64))
# # model.add(Dropout(0.5))
# model.add(Dense(2))
# model.add(Activation('sigmoid'))

In [None]:
# model.compile(loss='binary_crossentropy',
#               optimizer='adam',
#               class_mode="binary")

In [None]:
# model.fit(
#     X, y, 
#     batch_size=batch_size, 
#     nb_epoch=1 # , show_accuracy=True
# )

In [None]:
# result = model.predict_proba(X)

---

In [None]:
import numpy as np

from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense

In [None]:
data_dim = 16
timesteps = 8
num_classes = 2

num_ex = 1000

x_train = np.random.random((num_ex, timesteps, data_dim))
y_train = np.random.randint(1,3,num_ex)

x_train.shape

# [ пример, элемент посл., вектор ]

In [None]:
from sklearn.preprocessing import OneHotEncoder

y_train = np.random.randint(1,3,num_ex)
y_train = OneHotEncoder(categories='auto').fit_transform(y_train.reshape(-1,1) ).todense()
y_train.shape

In [None]:
# # expected input data shape: (batch_size, timesteps, data_dim)
# model = Sequential()

# # returns a sequence of vectors of dimension 32
# model.add(LSTM(32,return_sequences=True,input_shape=(timesteps, data_dim)))  

# # returns a sequence of vectors of dimension 32
# model.add(LSTM(32,return_sequences=True))  

# model.add(LSTM(32))  # return a single vector of dimension 32

# model.add(Dense(num_classes, activation='softmax'))

In [None]:
# expected input data shape: (batch_size, timesteps, data_dim)
model = Sequential()

# returns a sequence of vectors of dimension 32
model.add(LSTM(32,input_shape=(timesteps, data_dim)))  

model.add(Dense(num_classes, activation='softmax'))

In [None]:
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [None]:
model.fit(x_train, y_train,
          batch_size=64, epochs=115,
          # validation_data=(x_val, y_val)
         )