In [None]:
# adapted from https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html
# https://www.youtube.com/watch?v=MqugtGD605k
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, TimeDistributed, RepeatVector, Dense, BatchNormalization
import numpy as np
import wandb
from wandb.keras import WandbCallback
import random

# Start a Weights & Biases run and grab its config object; the model
# hyperparameters (hidden_size, batch_size) are stored on it further down.
wandb.init()
config = wandb.config

class CharacterTable(object):
    """Two-way mapping between a fixed alphabet and one-hot rows.

    + ``encode`` turns a string into a one-hot matrix (one row per character)
    + ``decode`` turns a one-hot / probability matrix, or a sequence of
      integer indices, back into a string
    """

    def __init__(self, chars):
        """Build the lookup tables.

        # Arguments
            chars: Characters that can appear in the input. Duplicates are
                dropped and the remainder is sorted to fix the index order.
        """
        alphabet = sorted(set(chars))
        self.chars = alphabet
        self.char_indices = {ch: pos for pos, ch in enumerate(alphabet)}
        self.indices_char = dict(enumerate(alphabet))

    def encode(self, C, num_rows):
        """Return the one-hot encoding of string C.

        # Arguments
            C: String to encode; every character must be in the alphabet.
            num_rows: Number of rows in the returned matrix. Rows past the
                end of C stay all-zero, so every sample has the same shape.
        """
        one_hot = np.zeros((num_rows, len(self.chars)))
        lookup = self.char_indices
        for row, ch in enumerate(C):
            one_hot[row, lookup[ch]] = 1
        return one_hot

    def decode(self, x, calc_argmax=True):
        """Map rows of x back to characters and join them into a string.

        # Arguments
            x: Matrix of per-row scores (when calc_argmax is True) or an
                iterable of integer character indices (when it is False).
            calc_argmax: Whether to reduce x with argmax over its last axis
                before looking the indices up.
        """
        indices = x.argmax(axis=-1) if calc_argmax else x
        return ''.join(self.indices_char[idx] for idx in indices)

# Parameters for the model and dataset.
config.hidden_size = 512
config.batch_size = 64

# Raw price strings are one-hot encoded into `maxlen` rows (shorter strings
# leave the trailing rows all-zero); target strings are space-padded to
# `output_len` characters.
maxlen = 30
output_len = 10
# Every character the encoder accepts: digits, price punctuation, ASCII
# letters, and the space used to pad targets. The previous literal ended in
# "...stuwxyv " and omitted 'z', so any input containing 'z' raised KeyError
# in CharacterTable.encode. CharacterTable sorts the set, so the order of
# characters in this string does not matter.
chars = '0123456789+$-,/.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz '
ctable = CharacterTable(chars)


import pandas as pd
# Load the price table; empty cells become 0 so the int() casts below succeed.
df = pd.read_csv("sale_prices.csv").fillna(0)

# Price text as it appears in the "Price on site" column; used further down
# as the model's input strings.
raw_prices = df["Price on site"].tolist()
parced_prices = []

# Lower / upper price bounds cast to int, one entry per row.
int_min_price = list(map(int, df["Min price"]))
int_max_price = list(map(int, df["Max price"]))

# for idx in range(len(int_min_price)):
#     toappend = str(int_min_price[idx])
#     toappend+=' '*(output_len - len(str(int_min_price[idx])))
#     parced_prices.append(toappend)
    
#augmentation (acc: 0.9878)
# for i in range(4):
#     for idx in range(len(int_min_price)):
#         raw_price = str(raw_prices[idx])
#         parsed_price = str(int_min_price[idx])
#         num_of_replacements = random.randint(1,5)
#         for i in range(num_of_replacements):
#             to_replace = str(random.randint(1,9))
#             replace_with = str(random.randint(1,9))
#             parsed_price.replace(to_replace,replace_with)
#             raw_price.replace(to_replace,replace_with)

#         parsed_price+=' '*(output_len - len(str(int_min_price[idx])))
#         parced_prices.append(parsed_price)
#         raw_prices.append(raw_price)

#augmentation v2

# The first 90% of rows are used for training; the remainder is validation.
split_at = len(int_min_price) - len(int_min_price) // 10


def _augment_price(raw_price, parsed_price, width):
    """Return a randomised copy of a (raw text, parsed number) price pair.

    For each position of ``parsed_price`` the character currently at that
    position is swapped for a random digit, and the same substitution is
    applied to every occurrence of that character in both strings, so the
    raw text and its target stay consistent with each other.

    # Arguments
        raw_price: Price text as shown on the site.
        parsed_price: Decimal string of the numeric price.
        width: Length the returned target is right-padded to with spaces.

    # Returns
        Tuple ``(augmented_raw, augmented_target)``.
    """
    # Index-based loop on purpose: parsed_price is rewritten on every step
    # and each step must read the character that is there *now*.
    for pos in range(len(parsed_price)):
        old_char = parsed_price[pos]
        # Draw the leading digit from 1-9 so it does not start as zero.
        # NOTE(review): a later global replace can still turn the leading
        # digit into '0' (e.g. 7xx -> 777 -> 000); confirm this is acceptable.
        lowest = 1 if pos == 0 else 0
        new_char = str(random.randint(lowest, 9))
        parsed_price = parsed_price.replace(old_char, new_char)
        raw_price = raw_price.replace(old_char, new_char)
    return raw_price, parsed_price.ljust(width)


# NOTE(review): rows whose raw text has no digits (e.g. "-" with price 0) get a
# random one-digit target the input cannot predict; consider skipping them.

#train data
raw_prices_train = []
parced_prices_train = []

# Four independently randomised copies of every training row.
for _ in range(4):
    for raw, price in zip(raw_prices[:split_at], int_min_price[:split_at]):
        raw_aug, target_aug = _augment_price(str(raw), str(price), output_len)
        parced_prices_train.append(target_aug)
        raw_prices_train.append(raw_aug)

#val data
raw_prices_val = []
parced_prices_val = []
# Iterate to the end of the data: the previous `len(raw_prices)-1` bound
# silently dropped the last row from the validation set.
for raw, price in zip(raw_prices[split_at:], int_min_price[split_at:]):
    raw_aug, target_aug = _augment_price(str(raw), str(price), output_len)
    parced_prices_val.append(target_aug)
    raw_prices_val.append(raw_aug)
        
        
print('Total train prices:', len(raw_prices_train))
print('Total validation prices:', len(raw_prices_val))

print('Vectorization...')


def _one_hot_batch(strings, num_rows):
    """One-hot encode every string into a (len(strings), num_rows, len(chars)) array.

    Uses the builtin ``bool`` dtype: the ``np.bool`` alias was deprecated in
    NumPy 1.20 and removed in 1.24, where it raises AttributeError.
    """
    batch = np.zeros((len(strings), num_rows, len(chars)), dtype=bool)
    for row, text in enumerate(strings):
        batch[row] = ctable.encode(str(text), num_rows)
    return batch


# Inputs are encoded into maxlen rows, targets into output_len rows.
x_train = _one_hot_batch(raw_prices_train, maxlen)
y_train = _one_hot_batch(parced_prices_train, output_len)

x_val = _one_hot_batch(raw_prices_val, maxlen)
y_val = _one_hot_batch(parced_prices_val, output_len)

# Encoder-decoder layout: the first LSTM reduces the whole input sequence to a
# single vector, RepeatVector copies that vector once per output position, and
# the second LSTM plus a per-timestep softmax emits one character distribution
# for each of the output_len positions.
model = Sequential([
    LSTM(config.hidden_size, input_shape=(maxlen, len(chars))),
    RepeatVector(output_len),
    BatchNormalization(),
    LSTM(config.hidden_size, return_sequences=True),
    TimeDistributed(Dense(len(chars), activation='softmax')),
])
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

# Train one epoch per iteration (199 iterations in total) and, after each one,
# print predictions for a few validation samples.
for iteration in range(1, 200):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_train, y_train,
              batch_size=config.batch_size,
              epochs=1,
              validation_data=(x_val, y_val),
              callbacks=[WandbCallback()])
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for _ in range(10):
        ind = np.random.randint(0, len(x_val))
        # Slice (rather than index) to keep the leading batch dimension.
        rowx, rowy = x_val[ind:ind + 1], y_val[ind:ind + 1]
        # Sequential.predict_classes was removed in TensorFlow 2.6; taking the
        # argmax over the softmax axis of predict() yields the same indices.
        preds = np.argmax(model.predict(rowx, verbose=0), axis=-1)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print('☑', end=' ')
        else:
            print('☒', end=' ')
        print('G',guess, end='\n')


Total train prices: 2300
Total validation prices: 62
Vectorization...
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_8 (LSTM)                (None, 512)               1189888   
_________________________________________________________________
repeat_vector_5 (RepeatVecto (None, 10, 512)           0         
_________________________________________________________________
batch_normalization (BatchNo (None, 10, 512)           2048      
_________________________________________________________________
lstm_9 (LSTM)                (None, 10, 512)           2099200   
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 68)            34884     
Total params: 3,326,020
Trainable params: 3,324,996
Non-trainable params: 1,024
_________________________________________________________________

-----------------------------------

Q $1,888,888                     T 1888888    ☒ G 8888888   
Q $166,666                       T 166666     ☒ G 666666    
Q $177,777                       T 177777     ☒ G 777777    
Q $177,777                       T 177777     ☒ G 777777    
Q $3,333,333                     T 3333333    ☒ G 333333    
Q $833,999                       T 833999     ☒ G 223333    
Q $37,777 - $388,577             T 37777      ☒ G 77777     
Q $1,711,111                     T 1711111    ☒ G 777711    

--------------------------------------------------
Iteration 10
Train on 2300 samples, validate on 62 samples
Q $119,333                       T 119333     ☒ G 343333    
Q $69,111                        T 69111      ☒ G 611111    
Q $7,877,777                     T 7877777    ☒ G 7777777   
Q $979,798                       T 979798     ☒ G 749999    
Q -                              T 2          ☒ G           
Q $636,666                       T 636666     ☒ G 666666    
Q $167,777                       T 

Q -                              T 9          ☒ G           
Q $1,122,222                     T 1122222    ☒ G 2222222   
Q $888,888                       T 888888     ☑ G 888888    
Q $8,444,444                     T 8444444    ☒ G 4444444   
Q $8,639,999                     T 8639999    ☒ G 99999999  
Q $474,444                       T 474444     ☒ G 444444    
Q $9,999,999                     T 9999999    ☑ G 9999999   
Q $6,277,777                     T 6277777    ☒ G 66777777  
Q $733,333                       T 733333     ☒ G 333333    
Q $1,711,111                     T 1711111    ☒ G 4451111   

--------------------------------------------------
Iteration 20
Train on 2300 samples, validate on 62 samples
Q $833,999                       T 833999     ☒ G 333999    
Q $376,266                       T 376266     ☒ G 666666    
Q $376,266                       T 376266     ☒ G 666666    
Q -                              T 5          ☒ G           
Q -                              T 

Q $832,999                       T 832999     ☒ G 333999    
Q $37,777 - $388,577             T 37777      ☒ G 77777     

--------------------------------------------------
Iteration 29
Train on 2300 samples, validate on 62 samples
Q $334,466                       T 334466     ☒ G 33366     
Q $4,444,444                     T 4444444    ☑ G 4444444   
Q $39,913,333                    T 39913333   ☒ G 933333333 
Q $2,127,111                     T 2127111    ☒ G 22211111  
Q $4,222,222                     T 4222222    ☒ G 2222222   
Q $7,877,777                     T 7877777    ☒ G 2777777   
Q -                              T 2          ☒ G           
Q $334,466                       T 334466     ☒ G 33366     
Q -                              T 2          ☒ G           
Q $3,936,666                     T 3936666    ☒ G 3336666   

--------------------------------------------------
Iteration 30
Train on 2300 samples, validate on 62 samples
Q $832,999                       T 832999     

Q $37,777 - $388,577             T 37777      ☒ G 377777    
Q $8,033,300 - $3,800,000        T 8033300    ☒ G 33330000  
Q $376,266                       T 376266     ☒ G 7366000   
Q $3,344,444                     T 3344444    ☒ G 33334444  
Q $655,111                       T 655111     ☒ G 6555111   

--------------------------------------------------
Iteration 39
Train on 2300 samples, validate on 62 samples
Q $4,999,999                     T 4999999    ☒ G 49999999  
Q $119,333                       T 119333     ☒ G 1119333   
Q $832,999                       T 832999     ☒ G 8888999   
Q -                              T 2          ☒ G 1         
Q $3,371,135                     T 3371135    ☒ G 3321111   
Q $744,444                       T 744444     ☒ G 4444444   
Q $177,777                       T 177777     ☒ G 117777    
Q $8,444,444                     T 8444444    ☒ G 84444444  
Q $272,222                       T 272222     ☒ G 2222222   
Q -                              T 

Q $3,344,444                     T 3344444    ☒ G 3334444   
Q $474,444                       T 474444     ☒ G 447444    
Q $668,222                       T 668222     ☒ G 666222    
Q $421,111                       T 421111     ☒ G 221111    
Q $225,222                       T 225222     ☒ G 255222    
Q -                              T 9          ☒ G           
Q $445,522                       T 445522     ☑ G 445522    
Q $3,936,666                     T 3936666    ☒ G 3336666   

--------------------------------------------------
Iteration 49
Train on 2300 samples, validate on 62 samples
Q $979,798                       T 979798     ☒ G 777999    
Q $37,777 - $388,577             T 37777      ☑ G 37777     
Q $22,299                        T 22299      ☑ G 22299     
Q $167,777                       T 167777     ☒ G 766777    
Q $3,666,666                     T 3666666    ☑ G 3666666   
Q $1,888,888                     T 1888888    ☒ G 888888    
Q $6,199,333                     T 

Q $69,111                        T 69111      ☒ G 66911     
Q $3,936,666                     T 3936666    ☒ G 3636666   
Q $39,913,333                    T 39913333   ☒ G 99993333  
Q $8,639,999                     T 8639999    ☒ G 9889999   
Q $989,999                       T 989999     ☑ G 989999    
Q $3,936,666                     T 3936666    ☒ G 3636666   
Q $636,666                       T 636666     ☒ G 6666666   
Q $733,333                       T 733333     ☑ G 733333    
Q $77,777                        T 77777      ☑ G 77777     
Q $9,999,999                     T 9999999    ☑ G 9999999   

--------------------------------------------------
Iteration 59
Train on 2300 samples, validate on 62 samples
Q -                              T 2          ☒ G 8         
Q $39,913,333                    T 39913333   ☒ G 39993333  
Q $3,666,666                     T 3666666    ☑ G 3666666   
Q $989,999                       T 989999     ☑ G 989999    
Q $2,127,111                     T 

Q $333,333                       T 333333     ☑ G 333333    
Q $421,111                       T 421111     ☑ G 421111    

--------------------------------------------------
Iteration 68
Train on 2300 samples, validate on 62 samples
Q $636,666                       T 636666     ☒ G 666666    
Q $733,333                       T 733333     ☑ G 733333    
Q $166,666                       T 166666     ☑ G 166666    
Q $22,233,333                    T 22233333   ☒ G 22323333  
Q $8,639,999                     T 8639999    ☒ G 89699999  
Q $936,666                       T 936666     ☒ G 996666    
Q $9,900,000                     T 9900000    ☒ G 9909000   
Q $9,999,999                     T 9999999    ☒ G 99999999  
Q $3,936,666                     T 3936666    ☑ G 3936666   
Q $733,333                       T 733333     ☑ G 733333    

--------------------------------------------------
Iteration 69
Train on 2300 samples, validate on 62 samples
Q $4,949,999                     T 4949999    

Q $22,233,333                    T 22233333   ☒ G 22223333  
Q $72,222                        T 72222      ☑ G 72222     
Q $9,999,999                     T 9999999    ☑ G 9999999   
Q $3,333,333                     T 3333333    ☑ G 3333333   
Q $655,111                       T 655111     ☑ G 655111    

--------------------------------------------------
Iteration 78
Train on 2300 samples, validate on 62 samples
Q $333,333                       T 333333     ☑ G 333333    
Q $1,888,888                     T 1888888    ☑ G 1888888   
Q $177,777                       T 177777     ☒ G 117777    
Q $39,913,333                    T 39913333   ☒ G 31913333  
Q $72,222                        T 72222      ☑ G 72222     
Q $77,777                        T 77777      ☑ G 77777     
Q $744,444                       T 744444     ☑ G 744444    
Q $655,111                       T 655111     ☑ G 655111    
Q $445,522                       T 445522     ☑ G 445522    
Q -                              T 

Q $376,266                       T 376266     ☒ G 372666    
Q $9,999,999                     T 9999999    ☑ G 9999999   
Q $4,444,444                     T 4444444    ☑ G 4444444   
Q $668,222                       T 668222     ☑ G 668222    
Q $9,793,471                     T 9793471    ☒ G 9977777   
Q $77,777                        T 77777      ☑ G 77777     
Q $668,222                       T 668222     ☑ G 668222    
Q $7,877,777                     T 7877777    ☒ G 7887777   

--------------------------------------------------
Iteration 88
Train on 2300 samples, validate on 62 samples
Q $832,999                       T 832999     ☒ G 822999    
Q $4,949,999                     T 4949999    ☑ G 4949999   
Q $3,371,135                     T 3371135    ☒ G 3331111   
Q $8,884,444                     T 8884444    ☒ G 8888444   
Q $668,222                       T 668222     ☑ G 668222    
Q $119,333                       T 119333     ☒ G 1191333   
Q $3,344,444                     T 

Q -                              T 1          ☒ G 3         
Q $22,233,333                    T 22233333   ☒ G 2223333   
Q $6,277,777                     T 6277777    ☑ G 6277777   
Q $119,333                       T 119333     ☑ G 119333    
Q $8,884,444                     T 8884444    ☑ G 8884444   
Q $7,877,777                     T 7877777    ☑ G 7877777   
Q -                              T 9          ☒ G 3         
Q $888,888                       T 888888     ☑ G 888888    
Q $8,884,444                     T 8884444    ☑ G 8884444   
Q $655,111                       T 655111     ☑ G 655111    

--------------------------------------------------
Iteration 98
Train on 2300 samples, validate on 62 samples
Q $167,777                       T 167777     ☑ G 167777    
Q $4,949,999                     T 4949999    ☑ G 4949999   
Q $444,666 - $4,434,666          T 444666     ☑ G 444666    
Q $77,777                        T 77777      ☑ G 77777     
Q $9,999,999                     T 

Q $22,299                        T 22299      ☑ G 22299     
Q $166,666                       T 166666     ☑ G 166666    

--------------------------------------------------
Iteration 107
Train on 2300 samples, validate on 62 samples
Q $474,444                       T 474444     ☑ G 474444    
Q -                              T 5          ☒ G 7         
Q $69,111                        T 69111      ☑ G 69111     
Q $22,233,333                    T 22233333   ☑ G 22233333  
Q $1,122,222                     T 1122222    ☑ G 1122222   
Q $77,777                        T 77777      ☑ G 77777     
Q $979,798                       T 979798     ☒ G 999977    
Q $4,455,555                     T 4455555    ☑ G 4455555   
Q $376,266                       T 376266     ☒ G 372666    
Q $744,444                       T 744444     ☑ G 744444    

--------------------------------------------------
Iteration 108
Train on 2300 samples, validate on 62 samples
Q $39,913,333                    T 39913333 

Q $188,888                       T 188888     ☒ G 118888    
Q $3,371,135                     T 3371135    ☒ G 3331111   
Q $833,999                       T 833999     ☒ G 333999    
Q $8,033,300 - $3,800,000        T 8033300    ☒ G 0033003   
Q $4,222,222                     T 4222222    ☑ G 4222222   

--------------------------------------------------
Iteration 117
Train on 2300 samples, validate on 62 samples
Q $9,999,999                     T 9999999    ☑ G 9999999   
Q $9,999,999                     T 9999999    ☑ G 9999999   
Q $2,127,111                     T 2127111    ☒ G 2122111   
Q $744,444                       T 744444     ☑ G 744444    
Q $3,371,135                     T 3371135    ☒ G 333111    
Q $8,033,300 - $3,800,000        T 8033300    ☒ G 9333333   
Q $733,333                       T 733333     ☑ G 733333    
Q -                              T 1          ☒ G 4         
Q $8,033,300 - $3,800,000        T 8033300    ☒ G 9333333   
Q $333,333                       T