In [6]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, TimeDistributed, RepeatVector, Dense, BatchNormalization
import numpy as np
import wandb
from wandb.keras import WandbCallback
import random

wandb.init()
config = wandb.config

class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, C, num_rows):
        """One hot encode given string C.
        # Arguments
            num_rows: Number of rows in the returned one hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in x)

# Parameters for the model and dataset.
config.hidden_size = 128
config.batch_size = 32

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
maxlen = 25
output_len = 10
# All the numbers, plus sign and space for padding.
chars = '0123456789+$-,/.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuwxyv '
ctable = CharacterTable(chars)


import pandas as pd
df = pd.read_csv("rent_prices.csv")
df = df.fillna(0)
from sklearn.utils import shuffle
df = shuffle(df)


raw_prices = df["Price on site"].tolist()
parced_prices = []

float_min_price = [float(price) for price in df["Min price"]]
float_max_price = [float(price) for price in df["Max price"]]

#MIN_PRICE MODEL

#train data
#augmentation v2

split_at = len(float_min_price) - len(float_min_price) // 10

raw_prices_train = []
parced_prices_train = []

for i in range(1000):  #augmentation x100 times (from ~500 to 500000 ~datapoints)
    for idx in range(split_at):
        raw_price = str(raw_prices[idx])
        parsed_price = str(float_min_price[idx])
        for i in range(len(parsed_price)):
            to_replace = parsed_price[i]
            replace_with = to_replace
            if str(float_min_price[idx])[0] != '0':
                replace_with = str(random.randint(1,9))
            parsed_price = parsed_price.replace(to_replace,replace_with)
            raw_price = raw_price.replace(to_replace,replace_with)

        parsed_price+=' '*(output_len - len(str(float_min_price[idx])))
        parced_prices_train.append(parsed_price)
        raw_prices_train.append(raw_price)
        
#val data
raw_prices_val = []
parced_prices_val = []
for idx in range(split_at,len(raw_prices)-1):
    raw_price = str(raw_prices[idx])
    parsed_price = str(float_min_price[idx])

    parsed_price+=' '*(output_len - len(str(float_min_price[idx])))
    parced_prices_val.append(parsed_price)
    raw_prices_val.append(raw_price)
        
        
print('Total train prices:', len(raw_prices_train))
print('Total validation prices:', len(raw_prices_val))

print('Vectorization...')
x_train = np.zeros((len(raw_prices_train), maxlen, len(chars)), dtype=np.bool)
y_train = np.zeros((len(parced_prices_train), output_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(raw_prices_train):
    x_train[i] = ctable.encode(str(sentence), maxlen)
for i, sentence in enumerate(parced_prices_train):
    y_train[i] = ctable.encode(str(sentence), output_len)
    
x_val = np.zeros((len(raw_prices_val), maxlen, len(chars)), dtype=np.bool)
y_val = np.zeros((len(parced_prices_val), output_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(raw_prices_val):
    x_val[i] = ctable.encode(str(sentence), maxlen)
for i, sentence in enumerate(parced_prices_val):
    y_val[i] = ctable.encode(str(sentence), output_len)

model = Sequential()
model.add(LSTM(config.hidden_size, input_shape=(maxlen, len(chars))))
model.add(RepeatVector(output_len))

model.add(BatchNormalization())

model.add(LSTM(config.hidden_size, return_sequences=True))
model.add(TimeDistributed(Dense(len(chars), activation='softmax')))
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
             metrics=['accuracy'])
model.summary()

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_train, y_train,
              batch_size=config.batch_size,
              epochs=1,
              validation_data=(x_val, y_val),callbacks=[WandbCallback()])
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print('☑', end=' ')
        else:
            print('☒', end=' ')
        print('G',guess, end='\n')


Total train prices: 522000
Total validation prices: 57
Vectorization...
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_6 (LSTM)                (None, 128)               100864    
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 10, 128)           0         
_________________________________________________________________
batch_normalization_3 (Batch (None, 10, 128)           512       
_________________________________________________________________
lstm_7 (LSTM)                (None, 10, 128)           131584    
_________________________________________________________________
time_distributed_3 (TimeDist (None, 10, 68)            8772      
Total params: 241,732
Trainable params: 241,476
Non-trainable params: 256
_________________________________________________________________

---------------------------------------

In [None]:
for iteration in range(1, 10):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    model.fit(x_train, y_train,
              batch_size=config.batch_size,
              epochs=1,
              validation_data=(x_val, y_val),callbacks=[WandbCallback()])
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print('☑', end=' ')
        else:
            print('☒', end=' ')
        print('G',guess, end='\n')


--------------------------------------------------
Iteration 1
Train on 522000 samples, validate on 57 samples
 20800/522000 [>.............................] - ETA: 9:27 - loss: 0.2754 - acc: 0.9054

In [2]:
print('Min price prediction ----------------------------------------')
for ind in range(len(x_val)):
    rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
    preds = model.predict_classes(rowx, verbose=0)
    q = ctable.decode(rowx[0])
    correct = ctable.decode(rowy[0])
    guess = ctable.decode(preds[0], calc_argmax=False)
    print('Q', q, end=' ')
    print('T', correct, end=' ')
    if correct == guess:
        print('☑', end=' ')
    else:
        print('☒', end=' ')
    print('G',guess, end='\n')

Min price prediction ----------------------------------------
Q $0,000 /month             T 0000       ☑ G 0000      
Q $9,555/mo                 T 9555       ☑ G 9555      
Q $6,638/mo                 T 6638       ☑ G 6638      
Q $1,332 /month             T 1332       ☑ G 1332      
Q $8,247+ /month            T 8247       ☑ G 8247      
Q $2,196 /month             T 2196       ☑ G 2196      
Q $7,161+ /month            T 7161       ☑ G 7161      
Q $1,153+ /month            T 1153       ☑ G 1153      
Q $25.55 SF/YR              T 25         ☑ G 25        
Q $1,444+ /month            T 1444       ☑ G 1444      
Q $2,000/mo                 T 2000       ☑ G 2000      
Q $48,000/mo                T 48000      ☑ G 48000     
Q $7,676/mo                 T 7676       ☑ G 7676      
Q $7,429 /month             T 7429       ☑ G 7429      
Q $93.00 SF/YR              T 93         ☑ G 93        
Q $1,733 /month             T 1733       ☑ G 1733      
Q $4,377/mo                 T 4377       ☑

In [None]:
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, TimeDistributed, RepeatVector, Dense, BatchNormalization
import numpy as np
import wandb
from wandb.keras import WandbCallback
import random

wandb.init()
config = wandb.config

class CharacterTable(object):
    """Given a set of characters:
    + Encode them to a one hot integer representation
    + Decode the one hot integer representation to their character output
    + Decode a vector of probabilities to their character output
    """
    def __init__(self, chars):
        """Initialize character table.
        # Arguments
            chars: Characters that can appear in the input.
        """
        self.chars = sorted(set(chars))
        self.char_indices = dict((c, i) for i, c in enumerate(self.chars))
        self.indices_char = dict((i, c) for i, c in enumerate(self.chars))

    def encode(self, C, num_rows):
        """One hot encode given string C.
        # Arguments
            num_rows: Number of rows in the returned one hot encoding. This is
                used to keep the # of rows for each data the same.
        """
        x = np.zeros((num_rows, len(self.chars)))
        for i, c in enumerate(C):
            x[i, self.char_indices[c]] = 1
        return x

    def decode(self, x, calc_argmax=True):
        if calc_argmax:
            x = x.argmax(axis=-1)
        return ''.join(self.indices_char[x] for x in x)

# Parameters for the model and dataset.
config.hidden_size = 128
config.batch_size = 32

# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of
# int is DIGITS.
maxlen = 25
output_len = 10
# All the numbers, plus sign and space for padding.
chars = '0123456789+$-,/.ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuwxyv '
ctable = CharacterTable(chars)


import pandas as pd
df = pd.read_csv("rent_prices.csv")
df = df.fillna(0)
from sklearn.utils import shuffle
df = shuffle(df)


raw_prices = df["Price on site"].tolist()
parced_prices = []

float_min_price = [float(price) for price in df["Min price"]]
float_max_price = [float(price) for price in df["Max price"]]



In [3]:
#MAX_PRICE MODEL

#train data
#augmentation v2

split_at = len(float_max_price) - len(float_max_price) // 10

raw_prices_train = []
parced_prices_train = []

for i in range(1000):  #augmentation x100 times (from ~500 to 500000 ~datapoints)
    for idx in range(split_at):
        raw_price = str(raw_prices[idx])
        parsed_price = str(float_max_price[idx])
        for i in range(len(parsed_price)):
            to_replace = parsed_price[i]
            replace_with = to_replace
            if i == 0: # to do not make first digit as zero
                if str(float_max_price[idx])[0] != '0':
                    replace_with = str(random.randint(1,9))
            else:
                replace_with = str(random.randint(0,9))
            parsed_price = parsed_price.replace(to_replace,replace_with)
            raw_price = raw_price.replace(to_replace,replace_with)

        parsed_price+=' '*(output_len - len(str(float_max_price[idx])))
        parced_prices_train.append(parsed_price)
        raw_prices_train.append(raw_price)
        
#val data
raw_prices_val = []
parced_prices_val = []
for idx in range(split_at,len(raw_prices)-1):
    raw_price = str(raw_prices[idx])
    parsed_price = str(float_max_price[idx])

    parsed_price+=' '*(output_len - len(str(float_max_price[idx])))
    parced_prices_val.append(parsed_price)
    raw_prices_val.append(raw_price)
        
        
print('Total train prices:', len(raw_prices_train))
print('Total validation prices:', len(raw_prices_val))

print('Vectorization...')
x_train = np.zeros((len(raw_prices_train), maxlen, len(chars)), dtype=np.bool)
y_train = np.zeros((len(parced_prices_train), output_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(raw_prices_train):
    x_train[i] = ctable.encode(str(sentence), maxlen)
for i, sentence in enumerate(parced_prices_train):
    y_train[i] = ctable.encode(str(sentence), output_len)
    
x_val = np.zeros((len(raw_prices_val), maxlen, len(chars)), dtype=np.bool)
y_val = np.zeros((len(parced_prices_val), output_len, len(chars)), dtype=np.bool)
for i, sentence in enumerate(raw_prices_val):
    x_val[i] = ctable.encode(str(sentence), maxlen)
for i, sentence in enumerate(parced_prices_val):
    y_val[i] = ctable.encode(str(sentence), output_len)

max_price_model = Sequential()
max_price_model.add(LSTM(config.hidden_size, input_shape=(maxlen, len(chars))))
max_price_model.add(RepeatVector(output_len))

max_price_model.add(BatchNormalization())

max_price_model.add(LSTM(config.hidden_size, return_sequences=True))
max_price_model.add(TimeDistributed(Dense(len(chars), activation='softmax')))
max_price_model.compile(loss='categorical_crossentropy',
              optimizer='adam',
             metrics=['accuracy'])
max_price_model.summary()

# Train the model each generation and show predictions against the validation
# dataset.
for iteration in range(1, 5):
    print()
    print('-' * 50)
    print('Iteration', iteration)
    max_price_model.fit(x_train, y_train,
              batch_size=config.batch_size,
              epochs=1,
              validation_data=(x_val, y_val),callbacks=[WandbCallback()])
    # Select 10 samples from the validation set at random so we can visualize
    # errors.
    for i in range(10):
        ind = np.random.randint(0, len(x_val))
        rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
        preds = max_price_model.predict_classes(rowx, verbose=0)
        q = ctable.decode(rowx[0])
        correct = ctable.decode(rowy[0])
        guess = ctable.decode(preds[0], calc_argmax=False)
        print('Q', q, end=' ')
        print('T', correct, end=' ')
        if correct == guess:
            print('☑', end=' ')
        else:
            print('☒', end=' ')
        print('G',guess, end='\n')


Total train prices: 522000
Total validation prices: 57
Vectorization...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
lstm_2 (LSTM)                (None, 256)               332800    
_________________________________________________________________
repeat_vector_1 (RepeatVecto (None, 10, 256)           0         
_________________________________________________________________
batch_normalization_1 (Batch (None, 10, 256)           1024      
_________________________________________________________________
lstm_3 (LSTM)                (None, 10, 256)           525312    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 10, 68)            17476     
Total params: 876,612
Trainable params: 876,100
Non-trainable params: 512
_________________________________________________________________

---------------------------------------

In [4]:
print('Max price prediction ----------------------------------------')
for ind in range(len(x_val)):
    rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])]
    preds = max_price_model.predict_classes(rowx, verbose=0)
    q = ctable.decode(rowx[0])
    correct = ctable.decode(rowy[0])
    guess = ctable.decode(preds[0], calc_argmax=False)
    print('Q', q, end=' ')
    print('T', correct, end=' ')
    if correct == guess:
        print('☑', end=' ')
    else:
        print('☒', end=' ')
    print('G',guess, end='\n')

Max price prediction ----------------------------------------
Q $7,777 /month             T 7777       ☑ G 7777      
Q $7,111/mo                 T 7111       ☑ G 7111      
Q $1,136/mo                 T 1136       ☑ G 1136      
Q $3,667 /month             T 3667       ☑ G 3667      
Q $2,195+ /month            T 0          ☑ G 0         
Q $2,637 /month             T 2637       ☑ G 2637      
Q $2,848+ /month            T 0          ☑ G 0         
Q $1,196+ /month            T 0          ☑ G 0         
Q $75.55 SF/YR              T 75         ☑ G 75        
Q $2,450+ /month            T 0          ☑ G 0         
Q $6,000/mo                 T 6000       ☑ G 6000      
Q $11,669/mo                T 11669      ☑ G 11669     
Q $1,773/mo                 T 1773       ☑ G 1773      
Q $2,645 /month             T 2645       ☑ G 2645      
Q $17.00 SF/YR              T 17         ☑ G 17        
Q $5,822 /month             T 5822       ☑ G 5822      
Q $1,220/mo                 T 1220       ☑