In [1]:
import tensorflow as tf

from keras.callbacks import LambdaCallback
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding
from keras.layers import LSTM
from keras.optimizers import RMSprop, Adam
from keras.utils.data_utils import get_file
import pandas as pd
from sklearn.cross_validation import train_test_split
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.utils.np_utils import to_categorical
import random
from keras.optimizers import TFOptimizer
import sys
import io
import pickle
import re

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
train = pd.read_csv('data/train.csv')

In [3]:
X = train.description.fillna(' ')
y = train.deal_probability

In [4]:
del train

In [5]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.1, random_state=13)

In [67]:
chars = set()
for line in open('descriptions.txt'):
    chars.update(re.sub('[\n]', ' ', line.lower()))

In [70]:
pickle.dump(chars, open('chars.pkl', 'wb'))

In [6]:
chars = pickle.load(open('chars.pkl', 'rb'))

In [7]:
chars = ['<P>'] + sorted(list(chars))
print('total chars:', len(chars))
char_indices = dict((c, i) for i, c in enumerate(chars))
indices_char = dict((i, c) for i, c in enumerate(chars))

total chars: 2336


In [8]:
maxlen = 3212

In [9]:
def get_batches_reg(X_train, y_train, size=32):
    
    x = np.zeros((size, maxlen))
    y = np.zeros((size, 1))
    seqs = [[]]

    for i,sent in enumerate(X_train.iteritems()):
        idx,sent = sent
        sent = re.sub('\n', ' ', sent).lower()

        if not len(seqs)%size:
            pad_seqs = pad_sequences(seqs, maxlen)
#             pad_seqs = to_categorical(pad_seqs, len(chars))
            x = pad_seqs
            seqs = [[]]
            yield x,y
            x = np.zeros((size, maxlen))
            y = np.zeros((size, 1))

        seq = []    

        for t, char in enumerate(sent):
            seq.append(char_indices[char])
    #         pad_seq = pad_sequences([seq], maxlen)

        seqs.append(seq)
    #         cat_pad_seq = to_categorical(pad_seq, len(chars))

    #         x[i%size] = pad_seq

        y[i%size] = y_train[idx]

In [178]:
y.shape

(128, 1)

In [10]:
a = get_batches_reg(X_train, y_train, 128)

In [11]:
next(a)[0].shape

(128, 3212, 2336)

In [71]:

s_c = 0
f = open('descriptions_maxlen.txt', 'w')
for text in open('descriptions.txt'):
    text = re.sub('[\n]', ' ', text.lower())
    
    for i in range(0, len(text) - maxlen, step):
        sent = text[i: i + maxlen]
        next_char = text[i + maxlen]
        if not sent:
            continue
        f.write(sent + next_char + '\n')
        s_c += 1
f.close()


In [14]:
import keras

In [53]:
from keras import backend as K

def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_true - y_pred), axis=-1)) 

In [15]:
del model

In [54]:
# build the model: a single LSTM
model = Sequential()
model.add(Embedding(len(chars), 8, input_length=maxlen))
model.add(Dropout(0.5))
model.add(keras.layers.CuDNNLSTM(54, return_sequences=False,
              ))
# model.add(LSTM(54, return_sequences=False))

model.add(Dense(1))


optimizer = TFOptimizer(tf.train.GradientDescentOptimizer(0.01))
model.compile(loss='mse', metrics=[root_mean_squared_error], optimizer=optimizer)
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 3212, 8)           18688     
_________________________________________________________________
dropout_10 (Dropout)         (None, 3212, 8)           0         
_________________________________________________________________
cu_dnnlstm_11 (CuDNNLSTM)    (None, 54)                13824     
_________________________________________________________________
dense_12 (Dense)             (None, 1)                 55        
Total params: 32,567
Trainable params: 32,567
Non-trainable params: 0
_________________________________________________________________
None


In [47]:
import gc
gc.collect()

305

In [7]:
def get_batches(size=64):
    file = open('descriptions_maxlen.txt')
    x = np.zeros((size, maxlen, len(chars)), dtype=np.bool)
    y = np.zeros((size, len(chars)), dtype=np.bool)
    
    for i,sent in enumerate(file):
        sent = sent.rstrip('\n')
        
        if not (i+1)%size:
            yield x,y
            x = np.zeros((size, maxlen, len(chars)), dtype=np.bool)
            y = np.zeros((size, len(chars)), dtype=np.bool)
        
            
        
        for t, char in enumerate(sent[:-1]):
            x[i%size, 
              t, 
              char_indices[char]] = 1
        
        y[i%size,
          char_indices[sent[-1]]] = 1


In [55]:
model.fit_generator(get_batches_reg(X_train, y_train, 64), 
                    steps_per_epoch=10000, nb_epoch=20, verbose=1)

  
  


Epoch 1/20
 1720/10000 [====>.........................] - ETA: 15:30 - loss: 0.0675 - root_mean_squared_error: 0.1893

KeyboardInterrupt: 

In [49]:
model.evaluate_generator(get_batches_reg(X_valid, y_valid), steps=100)

[0.06368298651650547, 0.18198715493083]

In [50]:
a = get_batches_reg(X_valid, y_valid)
x_v, y_v = next(a)
pred_v = model.predict(x_v)

In [51]:
from sklearn.metrics import mean_squared_error

In [52]:
np.sqrt(mean_squared_error(y_v, pred_v))

0.25795307453296906

In [39]:
list(zip(list(y_v), list(pred_v)))

[(array([0.86521]), array([0.14930925], dtype=float32)),
 (array([0.16155]), array([0.14810081], dtype=float32)),
 (array([0.]), array([0.14825732], dtype=float32)),
 (array([0.14563]), array([0.15518433], dtype=float32)),
 (array([0.52801]), array([0.149205], dtype=float32)),
 (array([0.14983]), array([0.14859098], dtype=float32)),
 (array([0.]), array([0.14725094], dtype=float32)),
 (array([0.]), array([0.14825732], dtype=float32)),
 (array([0.]), array([0.14473967], dtype=float32)),
 (array([0.]), array([0.1506771], dtype=float32)),
 (array([0.]), array([0.14495273], dtype=float32)),
 (array([0.]), array([0.14955118], dtype=float32)),
 (array([0.76786]), array([0.1428457], dtype=float32)),
 (array([0.12343]), array([0.15154795], dtype=float32)),
 (array([0.]), array([0.14619778], dtype=float32)),
 (array([0.]), array([0.15172417], dtype=float32)),
 (array([0.]), array([0.14825732], dtype=float32)),
 (array([0.34363]), array([0.15087205], dtype=float32)),
 (array([0.]), array([0.1488

In [32]:
model.lr.set_value(0.005)

AttributeError: 'Sequential' object has no attribute 'lr'

In [None]:
def find_prices(text):
\tprices = [re.sub('[^\d]+', '', x) for x in re.findall('([\d \.\,]{3,10}) ?[₽рР]у?б?', text)]
\tprices = [x for x in prices if len(x) > 2]
\tprices = [int(x) for x in prices if int(x) > 0]

\tif not prices:
\t\treturn 0
\telse:
\t\treturn np.log10(np.mean(prices))

In [27]:




def sample(preds, temperature=1.0):
    # helper function to sample an index from a probability array
    preds = np.asarray(preds).astype('float64')
    preds = np.log(preds) / temperature
    exp_preds = np.exp(preds)
    preds = exp_preds / np.sum(exp_preds)
    probas = np.random.multinomial(1, preds, 1)
    return np.argmax(probas)


def on_epoch_end(epoch, logs):
    # Function invoked at end of each epoch. Prints generated text.
    print()
    print('----- Generating text after Epoch: %d' % epoch)

    start_index = random.randint(0, len(text) - maxlen - 1)
    for diversity in [0.2, 0.5, 1.0, 1.2]:
        print('----- diversity:', diversity)

        generated = ''
        sentence = text[start_index: start_index + maxlen]
        generated += sentence
        print('----- Generating with seed: "' + sentence + '"')
        sys.stdout.write(generated)

        for i in range(100):
            x_pred = np.zeros((1, maxlen, len(chars)))
            for t, char in enumerate(sentence):
                x_pred[0, t, char_indices[char]] = 1.

            preds = model.predict(x_pred, verbose=0)[0]
            next_index = sample(preds, diversity)
            next_char = indices_char[next_index]

            generated += next_char
            sentence = sentence[1:] + next_char

            sys.stdout.write(next_char)
            sys.stdout.flush()
        print()




In [28]:
print_callback = LambdaCallback(on_epoch_end=on_epoch_end)



In [22]:
text = ''
for i,line in enumerate(open('descriptions_maxlen.txt')):
    text += line.rstrip('\n')
    text += ' '
    if i > 500:
        break

In [23]:
text[start_index: start_index + maxlen]

NameError: name 'start_index' is not defined

In [26]:
on_epoch_end(0,)



----- Generating text after Epoch: 0
----- diversity: 0.2
----- Generating with seed: "енно новое платье. цвет темно синий. кон"
енно новое платье. цвет темно синий. контрона в комплект                                                                                    
----- diversity: 0.5
----- Generating with seed: "енно новое платье. цвет темно синий. кон"
енно новое платье. цвет темно синий. контакти:                                         подчитается области и детские торг.           комплек
----- diversity: 1.0
----- Generating with seed: "енно новое платье. цвет темно синий. кон"
енно новое платье. цвет темно синий. контактвопд, нядюй  мковами и отличный эконеа за 100     для плотно.  изделимаяель, амекторы, молиз , а
----- diversity: 1.2
----- Generating with seed: "енно новое платье. цвет темно синий. кон"
енно новое платье. цвет темно синий. конината или  ув. изведим детестле кочеглаховна. иданед телефон ( тишинв 3.мартные гтлшные, разделегига
