In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from tensorflow import keras

from tensorflow.keras.preprocessing import text_dataset_from_directory
from tensorflow.strings import regex_replace
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from keras.layers import Dense, SimpleRNN, RNN, LSTM,Dropout,Bidirectional,Attention,Embedding
from keras.optimizers import Adam,SGD,RMSprop,schedules
import matplotlib.pyplot as plt
# Select the 'fivethirtyeight' matplotlib style sheet for any figures drawn after this point.
plt.style.use('fivethirtyeight')

In [2]:
# Configuration for the text vectorizer and embedding.
# (An earlier vocabulary cap of 5000 was replaced by the smaller value below.)
max_features = 200  # Maximum vocabulary size; the adapted vocabulary has 192 entries (see the vocabulary cell below).
max_len = 9  # Sequence length the vectorizer output is padded to.
embedding_dims = 2  # NOTE(review): not referenced by the Embedding cell below, which hard-codes 128 - confirm which value is intended.

In [3]:
# Read the text part of the data set: one specification string per line.
# Each string is wrapped in its own single-element list so that every sample
# becomes one row with a single string column.
# The `with` block closes the file even if reading raises part-way through.
with open("hack_txt_reform.txt", "r") as f:
    x_train = [[line.replace("\n", "")] for line in f]
x_train

[['CPU1 RAM7 S23 GPU1 DT DT DT Desktop UK'],
 ['CPU3a RAM4 S17 GPU1 DT DT DT Desktop IN'],
 ['CPU3a RAM4 S6 GPU1 15 HDP NonXTouch Laptop DE'],
 ['CPU3a RAM4 S17 GPU1 15 HDP NonXTouch Laptop IN'],
 ['CPU3a RAM7 S23 GPU1 15 HDP NonXTouch Laptop IN'],
 ['CPU3c RAM7 S6 GPU1 13 HDP NonXTouch Laptop US'],
 ['CPU3c RAM7 S18 GPU1 13 FHD NonXTouch Laptop US'],
 ['CPU3c RAM7 S6 GPU1 13 HD NonXTouch Laptop US'],
 ['CPU3c RAM7 S18 GPU1 13 HDP NonXTouch Laptop DE'],
 ['CPU3c RAM7 S11 GPU1 13 HDP NonXTouch Laptop DE'],
 ['CPU3c RAM7 S18 GPU1 13 FHDP Touch Laptop US'],
 ['CPU5b RAM10 S30 GPU4 DT DT DT Desktop US'],
 ['CPU5b RAM4 S6 GPU1 12 QHDP Touch Laptop US'],
 ['CPU5b RAM4 S6 GPU1 14 HDP NonXTouch Laptop BR'],
 ['CPU5b RAM4 S17 GPU1 14 FHD NonXTouch Laptop BR'],
 ['CPU5b RAM4 S17 GPU1 DT DT DT Desktop IN'],
 ['CPU5b RAM4 S23 GPU1 15 FHD NonXTouch Laptop IN'],
 ['CPU5b RAM4 S11 GPU1 12 HDP NonXTouch Laptop CN'],
 ['CPU5b RAM4 S23 GPU1 15 HDP NonXTouch Laptop IN'],
 ['CPU5b RAM4 S11 GPU1 13 HDP Non

In [4]:
# Read the prices of the data set: one float per line, each wrapped in its own
# single-element list so the result is a column of shape (n_samples, 1).
# The `with` block guarantees the file is closed even when float() raises
# ValueError on a malformed line.
with open("hack_price_reform.txt", "r") as f:
    prices = [[float(line.replace("\n", ""))] for line in f]
prices

[[283.37],
 [559.86],
 [447.22],
 [461.86],
 [615.99],
 [899.99],
 [1549.99],
 [899.99],
 [1804.22],
 [1615.42],
 [1099.99],
 [1149.0],
 [719.95],
 [1043.82],
 [1043.82],
 [763.0],
 [798.0],
 [885.0],
 [545.86],
 [1343.99],
 [539.0],
 [602.0],
 [1948.7],
 [869.0],
 [1999.11],
 [795.0],
 [854.85],
 [839.85],
 [1518.3],
 [1882.98],
 [1010.64],
 [459.0],
 [1768.82],
 [1223.82],
 [867.86],
 [849.66],
 [702.8],
 [1217.99],
 [1844.18],
 [949.0],
 [849.0],
 [566.82],
 [749.0],
 [1007.3],
 [919.0],
 [1499.99],
 [2210.14],
 [2728.7],
 [990.0],
 [2232.53],
 [1899.99],
 [1699.99],
 [1408.84],
 [1680.0],
 [1899.99],
 [1048.6],
 [1700.8],
 [1231.86],
 [2519.86],
 [116.82],
 [2122.82],
 [1532.82],
 [1899.99],
 [1550.0],
 [1245.0],
 [1608.46],
 [1680.0],
 [1499.99],
 [1301.88],
 [1829.0],
 [1129.99],
 [1999.0],
 [1363.63],
 [2605.09],
 [2594.82],
 [3768.7],
 [1899.0],
 [733.35],
 [809.85],
 [899.85],
 [1858.2],
 [1079.82],
 [826.67],
 [1251.99],
 [1049.0],
 [1063.86],
 [1949.99],
 [1801.66],
 [1008.0

In [5]:
# Rescale the prices into the [0, 1] range. The fitted `scaler` is kept so
# that predictions can be mapped back to real prices with inverse_transform.
scaler = MinMaxScaler(feature_range=(0,1))
scaler.fit(prices)
y_train = scaler.transform(prices)

# The scaled training targets
y_train

array([[0.01499391],
       [0.03823846],
       [0.0287688 ],
       ...,
       [0.16770563],
       [0.24336876],
       [0.50416694]])

In [6]:
# Keras' model.fit is fed NumPy arrays here, so convert both the inputs and
# the targets before training.
x_train = np.array(x_train)
y_train = np.array(y_train)
x_train.shape

(3414, 1)

In [7]:
# Create the text-vectorization layer: it maps every token of an input string
# to an integer id and pads the result to `max_len` ids per sample.
vectorize_layer = TextVectorization(
    output_sequence_length=max_len,
    output_mode='int',
    max_tokens=max_features,
)

# adapt() builds the vocabulary from the training strings; the layer uses it
# for all later vectorization.
vectorize_layer.adapt(x_train)

In [8]:
# Inspect the learned vocabulary: print its size, then display the tokens.
vocabulary = vectorize_layer.get_vocabulary()
print(len(vocabulary))
vocabulary

192


['',
 '[UNK]',
 'dt',
 'laptop',
 'nonxtouch',
 'fhd',
 'ram7',
 'cpu10c',
 '15',
 'gpu1',
 'us',
 'ram10',
 'desktop',
 's23',
 'cpu10b',
 'de',
 'cn',
 'touch',
 '17',
 's11',
 'ram14',
 's18',
 'ram4',
 'cpu11c',
 'uhd',
 's33',
 '13',
 'gpu6',
 's30',
 'gpu14',
 'gpu18',
 '14',
 'uk',
 'hdp',
 's45',
 'cpu8b',
 'br',
 'cpu18c',
 'in',
 's6',
 'gpu4',
 'gpu21',
 'fr',
 'gpu26',
 'gpu25',
 'jp',
 'qhdp',
 's36',
 'cpu11b',
 's17',
 'gpu27',
 'cpu10d',
 'gpu16',
 'ram9',
 'cpu8a',
 'au',
 's53',
 'gpu5',
 '12',
 's56',
 'cpu21b',
 'gpu29',
 'fhdp',
 'gpu30',
 's26',
 'gpu15',
 'hd',
 'cpu5c',
 'gpu13',
 'cpu20',
 's25',
 'qhd',
 's2',
 'cpu18a',
 'ram6',
 '23',
 's50',
 'cpu12b',
 'gpu22',
 '27',
 'cpu5b',
 's10',
 'ram2',
 'gpu3',
 'ram15',
 's37',
 's3',
 'ram12',
 's31',
 'cpu12a',
 's51',
 'gpu2',
 's64',
 'gpu24',
 '10',
 'cpu17',
 '21',
 'cpu21a',
 'gpu17',
 'cpu14',
 's43',
 'gpu31',
 'cpu16',
 'cpu15',
 'gpu12',
 's68',
 's57',
 'gpu19',
 'cpu9',
 's9',
 's71',
 's58',
 's40',

In [9]:
# Start building the model: a Sequential stack that receives one raw string
# per sample.
text_input = tf.keras.Input(shape=(1,), dtype=tf.string)
model = tf.keras.models.Sequential()
model.add(text_input)
# The first layer is the adapted text vectorizer.
model.add(vectorize_layer)

In [10]:
# Preview the vectorizer: with only this layer in the model, predict() returns
# the integer token ids produced for each training string.
model.predict(x_train)

array([[191,   6,  13, ...,   2,  12,  32],
       [147,  22,  49, ...,   2,  12,  38],
       [147,  22,  39, ...,   4,   3,  15],
       ...,
       [153,  11,  56, ...,   4,  12,  16],
       [153,  20,  34, ...,   4,  12,  16],
       [153,  20,  49, ...,   4,  12,  36]])

In [11]:
# Show what the model looks like at this point (text vectorizer only).
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 9)                 0         
Total params: 0
Trainable params: 0
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Add the embedding layer: every integer token id is mapped to a trainable
# 128-element vector.
model.add(Embedding(input_dim=max_features + 1, output_dim=128))

In [13]:
# Show what the model looks like at this point.
# The embedding is trainable and has not been trained yet, so calling
# model.predict here would not give meaningful output.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 9)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 9, 128)            25728     
Total params: 25,728
Trainable params: 25,728
Non-trainable params: 0
_________________________________________________________________


In [14]:
# Recurrent encoder followed by the regression head, appended in order:
# LSTM -> dropout -> dense -> dropout -> dense -> single-value output.
# NOTE(review): no activation is passed to the Dense layers, so Keras' default
# (linear) applies; consecutive linear Dense layers add no non-linearity.
# Consider whether an activation such as 'relu' was intended.
regression_stack = (
    LSTM(128),
    Dropout(0.1),
    Dense(128),
    Dropout(0.1),
    Dense(64),
    Dense(1),
)
for layer in regression_stack:
    model.add(layer)

In [15]:
# Show the complete architecture now that all layers have been added.
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_vectorization (TextVect (None, 9)                 0         
_________________________________________________________________
embedding (Embedding)        (None, 9, 128)            25728     
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               16512     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)                8

In [16]:
# Compile the model.
# Learning rate: starts at 3e-3 and decays exponentially with a rate of 0.8
# per 10000 optimizer steps.
lr_schedule = schedules.ExponentialDecay(
    decay_rate=0.8,
    decay_steps=10000,
    initial_learning_rate=3e-3,
)
# Adam driven by the schedule above; the regression loss is mean squared error.
opt = Adam(learning_rate=lr_schedule)
model.compile(loss='mean_squared_error', optimizer=opt)

In [17]:
# Train the model; the returned History object is kept in `model_info`.
# NOTE(review): per the Keras docs, validation_split holds out the *last* 10%
# of the samples before shuffling. If the input file is ordered (the listing
# above looks grouped by CPU), the validation set may not be representative -
# confirm, or shuffle the data beforehand.
model_info = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=4,
    validation_split=0.1,
    shuffle=True,
    verbose=1,
)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [19]:
# Predict the price of a single configuration string.
sample_spec = [['CPU3a RAM4 S17 GPU1 DT DT DT Desktop IN']]
scaled_prediction = model.predict(sample_spec)
# The model was trained on scaled targets, so undo the scaling to obtain a
# price in the original units.
pred_price = scaler.inverse_transform(scaled_prediction)
print(pred_price)

[[495.52524]]


In [20]:
# Persist the trained model under the directory "hack_model/lstm_01".
model.save("hack_model/lstm_01")

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: hack_model/lstm_01/assets
