In [21]:
import pandas as pd
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.svm import LinearSVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error
from sklearn.model_selection import KFold, RandomizedSearchCV, GridSearchCV ,cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from tensorflow.keras import layers, models, optimizers, losses
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
from xgboost import XGBRegressor
import tensorflow as tf
import pickle
import string
import re
import numpy as np
import statistics
import math
import time
import warnings

warnings.filterwarnings('ignore')

In [2]:
# Load the preprocessed listings; the `flag` column marks each row as 'train' or 'test'.
# NOTE(review): read_pickle can execute arbitrary code - only load a file you produced yourself.
data = pd.read_pickle('preprocess_data.pkl')

# Take explicit copies so later column edits never act on a view of `data`.
train = data.loc[data.flag == 'train', :].copy()
print(train.shape)
test = data.loc[data.flag == 'test', :].copy()
print(test.shape)

# Re-assign instead of dropping in place, so re-running the cell is safe.
# (The shapes printed above still include the `flag` column.)
train = train.drop(columns=['flag'])
test = test.drop(columns=['flag'])


(55284, 30)
(29769, 30)


In [3]:
# Count missing values per column of the training split, largest count first.
train.isnull().sum().sort_values(ascending=False)

neighborhood_overview          19506
host_response_time             17802
host_response_rate             17802
space                          16881
summary                         2954
host_has_profile_pic             111
host_identity_verified           111
host_since                       111
host_is_superhost                111
name                              14
guests_included                    0
review_scores_value                0
review_scores_location             0
review_scores_communication        0
reviews_per_month                  0
review_scores_checkin              0
review_scores_cleanliness          0
review_scores_accuracy             0
review_scores_rating               0
bedrooms                           0
price                              0
amenities                          0
bed_type                           0
beds                               0
bathrooms                          0
accommodates                       0
property_type                      0
h

In [4]:
##### Null treatment

def treat_null(df):
    """Fill the missing values of the host/text columns of `df` and return `df`.

    - host flag/response columns: missing -> 0
    - free-text columns (`neighborhood_overview`, `space`, `summary`): missing -> ''
    - `host_since`: missing -> median of the non-missing values of this same frame

    The frame passed in is modified (columns are re-assigned) and also returned.
    Columns not listed above (e.g. `name`) are left untouched.
    """
    zero_fill_cols = ['host_has_profile_pic', 'host_is_superhost', 'host_identity_verified',
                      'host_response_rate', 'host_response_time']
    # Assign the filled column back: `df[c].fillna(..., inplace=True)` works on a temporary
    # Series and silently does nothing under pandas copy-on-write.
    for c in zero_fill_cols:
        df[c] = df[c].fillna(0)
    for c in ['neighborhood_overview', 'space', 'summary']:
        df[c] = df[c].fillna('')

    known_host_since = df['host_since'].dropna().tolist()
    # statistics.median raises on an empty list; leave the column as is when nothing is known.
    if known_host_since:
        df['host_since'] = df['host_since'].fillna(statistics.median(known_host_since))
    return df


# Apply the null treatment to each split separately.
train = treat_null(train)
test = treat_null(test)

# Re-check the remaining missing values per column of the training split.
train.isnull().sum().sort_values(ascending=False)
    

name                           14
beds                            0
reviews_per_month               0
review_scores_value             0
review_scores_location          0
review_scores_communication     0
review_scores_checkin           0
review_scores_cleanliness       0
review_scores_accuracy          0
review_scores_rating            0
guests_included                 0
price                           0
amenities                       0
bed_type                        0
bedrooms                        0
summary                         0
bathrooms                       0
accommodates                    0
property_type                   0
host_identity_verified          0
host_has_profile_pic            0
host_total_listings_count       0
host_is_superhost               0
host_response_rate              0
host_response_time              0
host_since                      0
neighborhood_overview           0
space                           0
listing_id                      0
dtype: int64

In [5]:
###### Feature identification

# Column roles: prediction target, row identifier, free-text fields; every other column
# of `train` is collected in `numeric_cols`.
target_col, id_col = 'price', 'listing_id'
text_cols = ['name', 'summary', 'space', 'neighborhood_overview']
numeric_cols = [
    col for col in train.columns
    if col != target_col and col != id_col and col not in text_cols
]


### Modeling with only numeric features

In [6]:
# Keep only rows without any missing value, then separate predictors from the target.
features = train.dropna(axis=0, how='any')
X = features.loc[:, list(numeric_cols)]
y = features.loc[:, target_col]

In [7]:
# Confirm that the predictor matrix has no missing values left.
X.isnull().sum().sort_values(ascending=False)


host_since                     0
bed_type                       0
review_scores_value            0
review_scores_location         0
review_scores_communication    0
review_scores_checkin          0
review_scores_cleanliness      0
review_scores_accuracy         0
review_scores_rating           0
guests_included                0
amenities                      0
beds                           0
host_response_time             0
bedrooms                       0
bathrooms                      0
accommodates                   0
property_type                  0
host_identity_verified         0
host_has_profile_pic           0
host_total_listings_count      0
host_is_superhost              0
host_response_rate             0
reviews_per_month              0
dtype: int64

In [8]:
# The number of predictor rows and the number of target values should match.
print(X.shape)
print(len(y))

(55270, 23)
55270


In [9]:
# Hold out 20% of the rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_val,y_train,y_val = train_test_split(X,y, test_size=0.2, random_state=1)
print(X_train.shape, X_val.shape)

(44216, 23) (11054, 23)


### Models with only numeric features

##### Baseline Model preparation

- models : Linear Regression, Support Vector Machine, Decision Tree, Random Forest, Adaptive Boosting, Extreme Gradient Boosting 
- included all the features with no hyperparameter tuning and no outlier removal
- evaluation metrics: RMSE, R-squared, mean absolute error (MAE), mean absolute percentage error (MAPE)

In [10]:


# Untuned estimators compared in the baseline run. Deliberately NOT named `models`: that name
# is the Keras `models` module imported at the top of the notebook and used by later cells.
baseline_models = [LinearRegression(), LinearSVR(), DecisionTreeRegressor(),
        RandomForestRegressor(), AdaBoostRegressor(), XGBRegressor()]

cv = KFold(n_splits=10)


def multiple_scoring(actual, pred):
    """Return ``[rmse, r2, mae, mape]`` of `pred` against `actual`, each rounded to 3 decimals."""
    # Take the square root explicitly: the `squared=False` argument of mean_squared_error
    # is deprecated and absent from recent scikit-learn releases.
    rmse = round(math.sqrt(mean_squared_error(actual, pred)), 3)
    r2 = round(r2_score(actual, pred), 3)
    mae = round(mean_absolute_error(actual, pred), 3)
    mape = round(mean_absolute_percentage_error(actual, pred), 3)
    return [rmse, r2, mae, mape]


def show_baseline_performance(Xtrain, ytrain, Xtest, ytest, estimators=None):
    """Fit every baseline estimator and tabulate its train / validation scores.

    Args:
        Xtrain, ytrain: data each estimator is fitted on (and scored on for the train_* columns).
        Xtest, ytest: held-out data used for the val_* columns.
        estimators: optional iterable of unfitted regressors; defaults to `baseline_models`.

    Returns:
        DataFrame with one row per estimator: model_name, the four train metrics, the four
        validation metrics and `time_taken` (seconds for fit + both predictions + scoring).
    """
    metric_names = ['rmse', 'r2', 'mae', 'mape']
    columns = (['model_name']
               + ['train_' + m for m in metric_names]
               + ['val_' + m for m in metric_names]
               + ['time_taken'])
    rows = []
    for model in (baseline_models if estimators is None else estimators):
        start = time.time()
        model.fit(Xtrain, ytrain)
        train_scores = multiple_scoring(ytrain, model.predict(Xtrain))
        val_scores = multiple_scoring(ytest, model.predict(Xtest))
        elapsed = round(time.time() - start, 2)
        rows.append([str(model).split('(')[0], *train_scores, *val_scores, elapsed])
    # Build the frame once instead of appending row by row.
    return pd.DataFrame(rows, columns=columns)


In [11]:
# Run all baseline estimators and rank them by validation RMSE (ties broken by validation MAPE).
baseline_df = show_baseline_performance(X_train, y_train, X_val, y_val)
baseline_df.sort_values(by=['val_rmse','val_mape'], ascending=True)

Unnamed: 0,model_name,train_rmse,train_r2,train_mae,train_mape,val_rmse,val_r2,val_mae,val_mape,time_taken
5,XGBRegressor,46.941,0.687,30.535,0.389,54.994,0.561,35.407,0.431,1.86
3,RandomForestRegressor,25.452,0.908,15.239,0.199,56.173,0.542,36.47,0.453,39.63
0,LinearRegression,62.607,0.444,41.07,0.551,62.16,0.439,41.346,0.54,0.09
4,AdaBoostRegressor,65.074,0.399,48.066,0.736,64.293,0.4,47.719,0.717,1.25
1,LinearSVR,67.029,0.362,40.145,0.408,66.753,0.353,40.524,0.399,10.36
2,DecisionTreeRegressor,16.157,0.963,3.488,0.055,76.907,0.141,48.177,0.567,0.68


#### Linear Regression

In [12]:
# feature selection

def select_linear_features(X_train, y_train, X_val, score_func, k=15):
    """Standardize the features and keep the `k` best ones according to `score_func`.

    The scaler and the selector are both fitted on the training data only and then
    applied to the validation data.

    Args:
        X_train, y_train: training predictors and target.
        X_val: validation predictors (same columns as `X_train`).
        score_func: scoring function handed to SelectKBest (e.g. mutual_info_regression).
        k: number of features to keep (default 15).

    Returns:
        (X_train_fs, X_val_fs): the scaled, column-reduced arrays.
    """
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    # Transform the validation set itself; the earlier code re-transformed the already
    # scaled training data here, so "validation" features were really training rows.
    X_val_scaled = scaler.transform(X_val)
    selector = SelectKBest(score_func=score_func, k=k)
    selector.fit(X_train_scaled, y_train)
    return selector.transform(X_train_scaled), selector.transform(X_val_scaled)



cv = KFold(n_splits=10)
# feature selection
# NOTE(review): the selector is fitted on all of X_train before cross-validation, so every
# fold has seen the selection step - consider moving it into a Pipeline.
X_train_k, X_val_k = select_linear_features(X_train, y_train, X_val, mutual_info_regression)
model = LinearRegression()
cv_score = cross_val_score(model,X_train_k,y_train,scoring='neg_mean_absolute_percentage_error',cv=cv)
# The scorer is negative MAPE, so flip the sign of the mean and label the output as MAPE.
print("MAPE: mean {:.2f} std {:.3f}".format(statistics.mean(cv_score)*-1, statistics.stdev(cv_score)))

# model.fit(X_train_fs, y_train)
# # evaluate the model
# yhat = model.predict(X_test_fs)
# # evaluate predictions
# mae = mean_absolute_error(y_test, yhat)
# print('MAE: %.3f' % mae)





MSE: mean 0.55 std 0.008


##### Decision Tree Regressor

In [13]:

# Grid for the decision tree. np.arange(0, 1.0, 0.1) starts at 0.0, which scikit-learn rejects
# as a fractional `min_samples_leaf` (every fit using it fails), so the first element is dropped.
params = {'max_depth':np.arange(2,6),
        'min_samples_leaf':np.arange(0,1.0,0.1)[1:],
        }


def tune_dt(param_grid=params):
    """Grid-search a DecisionTreeRegressor and score the refitted best model.

    The search uses 10-fold CV on the notebook-level `X_train` / `y_train`, ranked by
    negative MAPE; the best estimator is then scored on `X_train` and `X_val`.

    Args:
        param_grid: grid handed to GridSearchCV. Bound to this cell's `params` when the
            function is defined, so a later cell re-assigning `params` cannot change it.

    Returns:
        DataFrame with a 'train' and a 'val' row and the columns rmse, r2, mae, mape.
    """
    tuner = GridSearchCV(DecisionTreeRegressor(), param_grid, refit=True,
                            scoring='neg_mean_absolute_percentage_error',
                            cv=10)
    tuner.fit(X_train, y_train)
    print(f"Best Parameters: \n{tuner.best_params_}")
    best_model = tuner.best_estimator_
    rows = [
        ['train', *multiple_scoring(y_train, best_model.predict(X_train))],
        ['val', *multiple_scoring(y_val, best_model.predict(X_val))],
    ]
    return pd.DataFrame(rows, columns=['type','rmse','r2','mae','mape'])

dt_results = tune_dt()
dt_results.head()


Best Parameters: 
{'max_depth': 5, 'min_samples_leaf': 0.1}


Unnamed: 0,type,rmse,r2,mae,mape
0,train,65.749,0.386,43.055,0.545
1,val,64.486,0.396,42.584,0.537


##### RandomForest Regression

In [14]:

# Search space for the random forest. The leading 0.0 produced by np.arange is not a valid
# fractional `min_samples_leaf`, so it is dropped; 0.0 IS valid for `min_weight_fraction_leaf`.
params = {'n_estimators':[100,200,500],
        'max_depth':np.arange(2,6),
        'min_samples_leaf':np.arange(0,0.5,0.1)[1:],
        'min_weight_fraction_leaf':np.arange(0,0.5,0.1),
        }


# NOTE(review): a later cell defines another function called `tune` (XGBoost); once that
# cell has run, this definition is shadowed.
def tune(param_distributions=params, random_state=1):
    """Random-search a RandomForestRegressor and score the refitted best model.

    Draws 50 candidates, each evaluated with 10-fold CV on the notebook-level
    `X_train` / `y_train` and ranked by negative MAPE.

    Args:
        param_distributions: search space; bound to this cell's `params` at definition time
            so re-assigning the global `params` later has no effect.
        random_state: seed for the candidate sampling, so the search is repeatable.

    Returns:
        DataFrame with a 'train' and a 'val' row and the columns rmse, r2, mae, mape.
    """
    tuner = RandomizedSearchCV(RandomForestRegressor(n_jobs=-1), param_distributions, refit=True,
                            scoring='neg_mean_absolute_percentage_error',
                            verbose=10, cv=10, n_iter=50, random_state=random_state)
    tuner.fit(X_train, y_train)
    print(f"Best Parameters: \n{tuner.best_params_}")
    best_model = tuner.best_estimator_
    rows = [
        ['train', *multiple_scoring(y_train, best_model.predict(X_train))],
        ['val', *multiple_scoring(y_val, best_model.predict(X_val))],
    ]
    return pd.DataFrame(rows, columns=['type','rmse','r2','mae','mape'])

rf_results = tune()
rf_results.head()


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV 1/10; 1/50] START max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200
[CV 1/10; 1/50] END max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200;, score=-0.956 total time=   4.7s
[CV 2/10; 1/50] START max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200
[CV 2/10; 1/50] END max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200;, score=-0.921 total time=   0.5s
[CV 3/10; 1/50] START max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200
[CV 3/10; 1/50] END max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200;, score=-0.927 total time=   0.6s
[CV 4/10; 1/50] START max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200
[CV 4/10; 1/50] END max_depth=4, min_samples_leaf=0.4, min_weight_fraction_leaf=0.1, n_estimators=200;, score=-0.

Unnamed: 0,type,rmse,r2,mae,mape
0,train,67.904,0.345,45.115,0.592
1,val,66.325,0.361,44.498,0.58


##### Extreme Gradient Boosting Regression

objective


In [19]:

# Search space for the gradient-boosted trees.
params = {'n_estimators':np.arange(100,600,100),
        'max_depth':np.arange(2,7),
        'min_child_weight':np.arange(0,0.5,0.1),
        'colsample_bytree':np.arange(0.5,1.,0.1)
        }


# NOTE(review): this re-uses the name `tune` from the random-forest cell and replaces it.
def tune(param_distributions=params, random_state=1):
    """Random-search an XGBRegressor (gbtree booster) and score the refitted best model.

    Draws 50 candidates, each evaluated with 10-fold CV on the notebook-level
    `X_train` / `y_train` and ranked by negative MAPE.

    Args:
        param_distributions: search space; bound to this cell's `params` at definition time
            so re-assigning the global `params` later has no effect.
        random_state: seed for the candidate sampling, so the search is repeatable.

    Returns:
        DataFrame with a 'train' and a 'val' row and the columns rmse, r2, mae, mape.
    """
    tuner = RandomizedSearchCV(XGBRegressor(booster='gbtree',n_jobs=-1), param_distributions, refit=True,
                            scoring='neg_mean_absolute_percentage_error',
                            verbose=2, cv=10, n_iter=50, random_state=random_state)
    tuner.fit(X_train, y_train)
    print(f"Best Parameters: \n{tuner.best_params_}")
    best_model = tuner.best_estimator_
    rows = [
        ['train', *multiple_scoring(y_train, best_model.predict(X_train))],
        ['val', *multiple_scoring(y_val, best_model.predict(X_val))],
    ]
    return pd.DataFrame(rows, columns=['type','rmse','r2','mae','mape'])

xgb_results = tune()
xgb_results.head()


Fitting 10 folds for each of 50 candidates, totalling 500 fits
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   4.2s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.9s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.8s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.8s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.8s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.7s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.7s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.9s
[CV] END colsample_bytree=0.7, max_depth=4, min_child_weight=0.1, n_estimators=500; total time=   3.8s
[CV] END c

Unnamed: 0,type,rmse,r2,mae,mape
0,train,47.063,0.686,30.602,0.39
1,val,54.971,0.561,35.5,0.432


In [31]:

def create_ann():
    """Build and compile a small dense regression network.

    Layers, in order: Dense(16, relu) -> Dropout(0.1) -> Dense(16, relu) -> Dense(1, linear).
    Compiled with Adam (learning rate 1e-4), MSE loss and MAE as the reported metric.
    """
    stack = [
        layers.Dense(16, activation='relu'),
        layers.Dropout(0.1),
        layers.Dense(16, activation='relu'),
        layers.Dense(1, activation='linear'),
    ]
    network = models.Sequential()
    for layer in stack:
        network.add(layer)
    network.compile(optimizer=optimizers.Adam(learning_rate=1e-4), loss='mse', metrics=['mae'])
    return network

# Train the dense network for 30 epochs, monitoring the held-out split after each epoch.
# NOTE(review): X_train is used exactly as returned by train_test_split - no scaling is applied in the visible cells; confirm that is intended.
ann_model = create_ann()
ann_model.fit(X_train, y_train, epochs=30, batch_size=32, validation_data=(X_val, y_val), verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x219749dc280>

### Modeling with Text data included

In [129]:
#### load word2vec embeddings


from gensim.models import Word2Vec, KeyedVectors
from gensim.models.phrases import Phrases, Phraser

# Saved bigram phrase model.
phrases = Phrases.load('bigram_model.pkl')

# Word vectors in word2vec text format; `vocab` lists the keys in the file's own order and
# `embedding_matrix` is the matching array of vectors.
weights = KeyedVectors.load_word2vec_format('w2v_tmp.model')
vocab_size = len(weights)
vocab = [weights.index_to_key[position] for position in range(vocab_size)]
embedding_matrix = weights.vectors




In [106]:
# Work on a copy so the joined strings are not written into a slice of `features`.
text_features = features[text_cols].copy()
for c in text_cols:
    # Join the items of each cell with single spaces (column-wise, no row-wise apply needed).
    text_features[c] = text_features[c].map(' '.join)

text_features.head(5)

Unnamed: 0,name,summary,space,neighborhood_overview
0,attic room historic greenwich,room door discreet staircase light airy open p...,double room available historic royal greenwich...,
1,lovely garden studio private access,garden studio private entrance minute crouch e...,beautiful studio king size bed sofa coffee tab...,crouch end hip friendly neighbourhood filled e...
2,comfi apartment close wimbledon tennis court,,,
3,luxury room heart london sw,luxury first floor victorian split level maiso...,room fully furnished include double bed mirror...,gail coffee shop round corner well local indep...
4,excellent city apartment private patio,new luxury apartment private outside patio gre...,modern well equipped cosy apartment close vict...,


In [75]:
# Percentiles of len(value) for each text column of `features`, printed in the same
# order as before: summary, space, name, neighborhood_overview.
length_percentiles = [25, 50, 75, 90, 99.7]
for text_col in ['summary', 'space', 'name', 'neighborhood_overview']:
    value_lengths = np.array([len(value) for value in features[text_col].tolist()])
    print(np.percentile(value_lengths, length_percentiles))






[ 231.     350.     473.     522.    1026.193]
[   0.  205.  565.  953. 1157.]
[42. 51. 59. 65. 79.]
[   0.  163.  423.  826. 1097.]


In [123]:
# Flatten every `summary` entry into one list, then count the distinct items in it.
vocab_temp = [item for entry in features['summary'].tolist() for item in entry]
len(set(vocab_temp))

20288

In [132]:

Vectorizer = tf.keras.layers.TextVectorization()


#fit the vectorizer on the text and extract the corpus vocabulary
Vectorizer.adapt(text_features['summary'])
# The embedding layer is indexed by the integer ids that `Vectorizer` emits, so the matrix
# rows must follow the vectorizer's vocabulary order, not the order of the word2vec file.
vectorizer_vocab = Vectorizer.get_vocabulary()
# NOTE(review): the default standardization strips punctuation, so tokens joined with '_'
# by the phrase model may not match a word2vec key - confirm.


#generate the embedding matrix
num_tokens = len(vectorizer_vocab)
embedding_dim = weights.vector_size
embedding_matrix = np.zeros((num_tokens, embedding_dim))
for i, word in enumerate(vectorizer_vocab):
    # Tokens without a pretrained vector (e.g. the reserved padding / OOV entries) keep a zero row.
    if word in weights.key_to_index:
        embedding_matrix[i] = weights.get_vector(word)

#Load the embedding matrix as the weights matrix for the embedding layer and set trainable to False
Embedding_layer=layers.Embedding(
    num_tokens,
    embedding_dim,
    embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
    trainable=False)

In [136]:
def create_custom_text_model():
    """Stack: string input -> `Vectorizer` -> `Embedding_layer` -> LSTM(16) -> Dense(1, relu).

    The vectorizer and the embedding layer are the notebook-level objects; the model is
    returned uncompiled.
    """
    stack = [
        layers.Input(shape=(1,), dtype=tf.string),
        Vectorizer,
        Embedding_layer,
        layers.LSTM(16, return_sequences=False),
        layers.Dense(1, activation="relu"),
    ]
    text_model = models.Sequential()
    for layer in stack:
        text_model.add(layer)
    return text_model

# Smoke-test the text model by checking the shape of one prediction.
temp_model = create_custom_text_model()
# NOTE(review): `t` is not defined in any visible cell - presumably a sample string left over from an earlier session; define it before re-running.
temp_model.predict([t]).shape

##### LSTM

In [None]:
# Text inputs for the LSTM model; missing values are replaced by the literal string 'empty'.
# NOTE(review): no `description` column appears in the column listing printed for `train` earlier in the notebook - confirm which frame this cell expects.
train_desc = train['description'].fillna('empty')
train_ngbr = train['neighborhood_overview'].fillna('empty')

# NOTE(review): this re-binds `y_train`, replacing the target produced by train_test_split above.
y_train = train['price']


# The three lengths should be equal.
print(len(train_desc), len(train_ngbr), len(y_train))


def our_standardization(text_data):
  """Normalize a string tensor: lowercase, drop '<br />', strip punctuation, collapse whitespace.

  Returns a string tensor of the same shape as `text_data`.
  """
  lowercase = tf.strings.lower(text_data)
  # Only the literal '<br />' tag is replaced (by a space); other markup is left alone.
  without_breaks = tf.strings.regex_replace(lowercase, '<br />', ' ')
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  without_punct = tf.strings.regex_replace(without_breaks, punctuation_pattern, '')
  # Raw string: '\s' in a normal literal is an invalid escape sequence (a warning in current
  # Python versions); the pattern itself is unchanged.
  return tf.strings.regex_replace(without_punct, r'\s+', ' ')



vocab_size = 10000
seq_length = 500

# Create a vectorization layer
vectorize_layer = TextVectorization(
    standardize = our_standardization,
    max_tokens = vocab_size,
    output_sequence_length = seq_length
    )

emb_size = 32
rnn_units = 16


def create_text_model(text_list):
    """Build a string -> scalar sub-model: vectorize, embed, LSTM, Dense(1, relu).

    Args:
        text_list: the corpus the branch's vectorizer vocabulary is learned from.

    Returns:
        An uncompiled Sequential model taking one string per sample.
    """
    # Each branch gets its own vectorizer: adapting one shared layer a second time would
    # replace the vocabulary the first branch was built with.
    vectorizer = TextVectorization(
        standardize=our_standardization,
        max_tokens=vocab_size,
        output_sequence_length=seq_length,
    )
    vectorizer.adapt(text_list)

    # `Sequential` lives in `tensorflow.keras.models`, not in `layers`.
    model = models.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=emb_size))
    model.add(layers.LSTM(rnn_units, return_sequences=False))
    model.add(layers.Dense(1, activation="relu"))
    return model

def create_combined_model(X):
    """Flatten `X`, project it onto one linear unit, print the output shape and return the tensor."""
    flattened = layers.Flatten()(X)
    prediction = layers.Dense(1, activation="linear")(flattened)
    print(prediction.shape)
    return prediction


def create_model():
    """Assemble and compile the two-branch text model (description + neighbourhood).

    Each branch comes from `create_text_model`; the branch outputs are concatenated and
    passed through `create_combined_model`. Compiled with Adam, MSE loss and MAE metric.
    """
    # One single-string input per text field, in the order [description, neighbourhood].
    text_inputs = [layers.Input(shape=(1,), dtype=tf.string) for _ in range(2)]

    # One sub-model per field, each built from its own training corpus.
    branches = [create_text_model(corpus) for corpus in (train_desc, train_ngbr)]

    # Wire each input through its branch and merge the results.
    branch_outputs = [branch(tensor) for branch, tensor in zip(branches, text_inputs)]
    merged = layers.Concatenate()(branch_outputs)
    print(merged.shape)
    prediction = create_combined_model(merged)

    combined = tf.keras.Model(inputs=text_inputs, outputs=prediction)
    combined.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return combined


model = create_model()
print(model.summary())

history = model.fit(
    [train_desc, train_ngbr],
    y_train,
    validation_split=0.2,
    epochs = 5,
    batch_size = 32,
    verbose =1)

##### CNN-LSTM

In [35]:
# Re-bind `train` / `test` to the CSV files; from here on they no longer refer to the frames loaded from the pickle above.
train = pd.read_csv('train.csv',encoding='utf-8')
test = pd.read_csv('test.csv',encoding='utf-8')

In [36]:
train_desc = train['summary'].fillna('')
train_ngbr = train['neighborhood_overview'].fillna('')

y_train = train['price']


print(len(train_desc), len(train_ngbr), len(y_train))


def our_standardization(text_data):
    """Normalize a string tensor: lowercase, drop '<br />', strip punctuation, collapse whitespace.

    Returns a string tensor of the same shape as `text_data`.
    """
    lowercase = tf.strings.lower(text_data)
    # Only the literal '<br />' tag is replaced (by a space); other markup is left alone.
    without_breaks = tf.strings.regex_replace(lowercase, '<br />', ' ')
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    without_punct = tf.strings.regex_replace(without_breaks, punctuation_pattern, '')
    # Raw string: '\s' in a normal literal is an invalid escape sequence (a warning in current
    # Python versions); the pattern itself is unchanged.
    return tf.strings.regex_replace(without_punct, r'\s+', ' ')



vocab_size = 10000
seq_length = 500

# Create a vectorization layer
vectorize_layer = TextVectorization(
    standardize = our_standardization,
    max_tokens = vocab_size,
    output_sequence_length = seq_length
    )

emb_size = 100
rnn_units = 64


def create_text_model(text_list):
    """Build a string -> per-timestep sub-model: vectorize, embed, Conv1D, pool, LSTM, Dense(1, relu).

    The LSTM returns the full sequence, so the final Dense layer is applied at every
    (pooled) time step.

    Args:
        text_list: the corpus the branch's vectorizer vocabulary is learned from.

    Returns:
        An uncompiled Sequential model taking one string per sample.
    """
    # Each branch gets its own vectorizer: adapting one shared layer a second time would
    # replace the vocabulary the first branch was built with.
    vectorizer = TextVectorization(
        standardize=our_standardization,
        max_tokens=vocab_size,
        output_sequence_length=seq_length,
    )
    vectorizer.adapt(text_list)

    model = models.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=emb_size))
    model.add(layers.Conv1D(filters=16,kernel_size=5))
    model.add(layers.AveragePooling1D(pool_size=2,strides=2))
    model.add(layers.LSTM(rnn_units, return_sequences=True))
    model.add(layers.Dense(1, activation="relu"))
    return model

def create_combined_model(X):
    """Flatten `X`, project it onto one linear unit, print the output shape and return the tensor."""
    flattened = layers.Flatten()(X)
    prediction = layers.Dense(1, activation="linear")(flattened)
    print(prediction.shape)
    return prediction


def create_model():
    """Assemble and compile the two-branch text model (summary + neighbourhood).

    Each branch comes from `create_text_model`; the branch outputs are concatenated and
    passed through `create_combined_model`. Compiled with Adam (learning rate 1e-5),
    log-cosh loss and MAE metric.
    """
    # One single-string input per text field, in the order [description, neighbourhood].
    text_inputs = [layers.Input(shape=(1,), dtype=tf.string) for _ in range(2)]

    # One sub-model per field, each built from its own training corpus.
    branches = [create_text_model(corpus) for corpus in (train_desc, train_ngbr)]

    # Wire each input through its branch and merge the results.
    branch_outputs = [branch(tensor) for branch, tensor in zip(branches, text_inputs)]
    merged = layers.Concatenate()(branch_outputs)
    print(merged.shape)
    prediction = create_combined_model(merged)

    combined = tf.keras.Model(inputs=text_inputs, outputs=prediction)
    combined.compile(
        optimizer=tf.optimizers.Adam(learning_rate=1e-5),
        loss=losses.LogCosh(),
        metrics=['mae'],
    )
    return combined


model = create_model()
print(model.summary())

history = model.fit(
    [train_desc, train_ngbr],
    y_train,
    validation_split=0.2,
    epochs = 5,
    batch_size = 64,
    verbose =1)

55284 55284 55284
(None, 248, 2)
(None, 1)
Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_7 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_8 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
sequential_12 (Sequential)      (None, 248, 1)       1028817     input_7[0][0]                    
__________________________________________________________________________________________________
sequential_13 (Sequential)      (None, 248, 1)       1028817     input_8[0][0]                    
___________________________________________________

##### Bidirectional LSTM

In [37]:
train_desc = train['description'].fillna('')
train_ngbr = train['neighborhood_overview'].fillna('')

y_train = train['price']


print(len(train_desc), len(train_ngbr), len(y_train))


def our_standardization(text_data):
    """Normalize a string tensor: lowercase, drop '<br />', strip punctuation, collapse whitespace.

    Returns a string tensor of the same shape as `text_data`.
    """
    lowercase = tf.strings.lower(text_data)
    # Only the literal '<br />' tag is replaced (by a space); other markup is left alone.
    without_breaks = tf.strings.regex_replace(lowercase, '<br />', ' ')
    punctuation_pattern = '[%s]' % re.escape(string.punctuation)
    without_punct = tf.strings.regex_replace(without_breaks, punctuation_pattern, '')
    # Raw string: '\s' in a normal literal is an invalid escape sequence (a warning in current
    # Python versions); the pattern itself is unchanged.
    return tf.strings.regex_replace(without_punct, r'\s+', ' ')



vocab_size = 10000
seq_length = 500

# Create a vectorization layer
vectorize_layer = TextVectorization(
    standardize = our_standardization,
    max_tokens = vocab_size,
    output_sequence_length = seq_length
    )

emb_size = 100
rnn_units = 32


def create_text_model(text_list):
    """Build a string -> per-timestep sub-model: vectorize, embed, bidirectional LSTM, Dense(1, relu).

    The LSTM returns the full sequence, so the final Dense layer is applied at every time step.

    Args:
        text_list: the corpus the branch's vectorizer vocabulary is learned from.

    Returns:
        An uncompiled Sequential model taking one string per sample.
    """
    # Each branch gets its own vectorizer: adapting one shared layer a second time would
    # replace the vocabulary the first branch was built with.
    vectorizer = TextVectorization(
        standardize=our_standardization,
        max_tokens=vocab_size,
        output_sequence_length=seq_length,
    )
    vectorizer.adapt(text_list)

    model = models.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=emb_size))
    model.add(layers.Bidirectional(layers.LSTM(rnn_units, return_sequences=True)))
    model.add(layers.Dense(1, activation="relu"))
    return model

def create_combined_model(X):
    """Flatten `X`, project it onto one linear unit, print the output shape and return the tensor."""
    flattened = layers.Flatten()(X)
    prediction = layers.Dense(1, activation="linear")(flattened)
    print(prediction.shape)
    return prediction


def create_model():
    """Assemble and compile the two-branch text model (description + neighbourhood).

    Each branch comes from `create_text_model`; the branch outputs are concatenated and
    passed through `create_combined_model`. Compiled with Adam (learning rate 1e-5),
    log-cosh loss and MAE metric.
    """
    # One single-string input per text field, in the order [description, neighbourhood].
    text_inputs = [layers.Input(shape=(1,), dtype=tf.string) for _ in range(2)]

    # One sub-model per field, each built from its own training corpus.
    branches = [create_text_model(corpus) for corpus in (train_desc, train_ngbr)]

    # Wire each input through its branch and merge the results.
    branch_outputs = [branch(tensor) for branch, tensor in zip(branches, text_inputs)]
    merged = layers.Concatenate()(branch_outputs)
    print(merged.shape)
    prediction = create_combined_model(merged)

    combined = tf.keras.Model(inputs=text_inputs, outputs=prediction)
    combined.compile(
        optimizer=tf.optimizers.Adam(learning_rate=1e-5),
        loss=losses.LogCosh(),
        metrics=['mae'],
    )
    return combined


model = create_model()
print(model.summary())

history = model.fit(
    [train_desc, train_ngbr],
    y_train,
    validation_split=0.2,
    epochs = 5,
    batch_size = 64,
    verbose =1)

55284 55284 55284
(None, 500, 2)
(None, 1)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           [(None, 1)]          0                                            
__________________________________________________________________________________________________
sequential_14 (Sequential)      (None, 500, 1)       1034113     input_11[0][0]                   
__________________________________________________________________________________________________
sequential_15 (Sequential)      (None, 500, 1)       1034113     input_12[0][0]                   
_________________________________________________

#### Combining numerical and text models

In [None]:
print(len(train_desc), len(train_ngbr), len(y_train))

In [None]:
def our_standardization(text_data):
  """Normalize a string tensor: lowercase, drop '<br />', strip punctuation, collapse whitespace.

  Returns a string tensor of the same shape as `text_data`.
  """
  lowercase = tf.strings.lower(text_data)
  # Only the literal '<br />' tag is replaced (by a space); other markup is left alone.
  without_breaks = tf.strings.regex_replace(lowercase, '<br />', ' ')
  punctuation_pattern = '[%s]' % re.escape(string.punctuation)
  without_punct = tf.strings.regex_replace(without_breaks, punctuation_pattern, '')
  # Raw string: '\s' in a normal literal is an invalid escape sequence (a warning in current
  # Python versions); the pattern itself is unchanged.
  return tf.strings.regex_replace(without_punct, r'\s+', ' ')

In [None]:
our_standardization(train_desc[0])

In [None]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

vocab_size = 10000
seq_length = 500

# Create a vectorization layer
vectorize_layer = TextVectorization(
    standardize = our_standardization,
    max_tokens = vocab_size,
    output_sequence_length = seq_length
    )

In [None]:
desc1, desc2 = train_desc[2], train_desc[3]
print(desc1+'\n\n'+desc2)

ngbr1, ngbr2 = train_ngbr[2], train_ngbr[3]
print(ngbr1+'\n\n'+ngbr2)

In [None]:
!nvidia-smi

tf.test.is_gpu_available()

In [None]:
emb_size = 32
rnn_units = 16


def create_text_model(text_list):
    """Build a string -> scalar sub-model: vectorize, embed, LSTM, Dense(1, relu).

    Args:
        text_list: the corpus the branch's vectorizer vocabulary is learned from.

    Returns:
        An uncompiled Sequential model taking one string per sample.
    """
    # Each branch gets its own vectorizer: adapting one shared layer a second time would
    # replace the vocabulary the first branch was built with.
    vectorizer = TextVectorization(
        standardize=our_standardization,
        max_tokens=vocab_size,
        output_sequence_length=seq_length,
    )
    vectorizer.adapt(text_list)

    # The bare names Sequential / Input / Embedding / LSTM / Dense are never imported in this
    # notebook; go through the imported `models` and `layers` modules like the other cells do.
    model = models.Sequential()
    model.add(layers.Input(shape=(1,), dtype=tf.string))
    model.add(vectorizer)
    model.add(layers.Embedding(input_dim=vocab_size, output_dim=emb_size))
    model.add(layers.LSTM(rnn_units, return_sequences=False))
    model.add(layers.Dense(1, activation="relu"))
    return model

def create_combined_model(X):
    """Flatten `X`, project it onto one linear unit, print the output shape and return the tensor."""
    # `Flatten` / `Dense` are not imported as bare names; use the `layers` module.
    X = layers.Flatten()(X)
    X = layers.Dense(1, activation="linear")(X)
    print(X.shape)
    return X

def create_model():
    """Assemble and compile the two-branch text model (description + neighbourhood).

    Each branch comes from `create_text_model`; the branch outputs are concatenated and
    passed through `create_combined_model`. Compiled with Adam, MSE loss and MAE metric.
    """
    ### Initialize Input layers
    # `Input`, `Concatenate` and `Model` are not imported as bare names in this notebook;
    # use `layers.*` / `tf.keras.Model` like the earlier cells.
    input_desc = layers.Input(shape=(1,), dtype=tf.string)
    input_ngbr = layers.Input(shape=(1,), dtype=tf.string)

    ### Create Vectorisation models from text features
    desc_model = create_text_model(train_desc)
    ngbr_model = create_text_model(train_ngbr)

    ### Create Data flow
    emb_desc = desc_model(input_desc)
    emb_ngbr = ngbr_model(input_ngbr)
    concat_combined = layers.Concatenate()([emb_desc,emb_ngbr])
    print(concat_combined.shape)
    output = create_combined_model(concat_combined)

    ### Finalize the model
    model = tf.keras.Model(inputs = [input_desc, input_ngbr], outputs = output)
    model.compile(optimizer='adam',loss='mse', metrics=['mae'])
    return model








# desc_embedded = desc_model.predict([desc1,desc2])
# ngbr_model = create_text_model(train_ngbr)

# dense_desc = desc_model([desc2])
# dense_ngbr = ngbr_model([ngbr2])

# concatenated_values = layers.Concatenate([dense_model, ngbr_model])

# print(concatenated_values.shape)

# def create_model(text_features_list):
#   vec_model = 
#   for feature in text_features_list:
    


In [None]:
# Create model with one GRU encoder per text field, fed with already-vectorized token ids
emb_size = 100
rnn_units = 64

# Each input is a sequence of `seq_length` integer token ids (not raw strings).
input_ngbr = tf.keras.Input(shape=(seq_length,), dtype="int64")
input_desc = tf.keras.Input(shape=(seq_length,), dtype="int64")
emb_desc = layers.Embedding(input_dim=vocab_size, output_dim=emb_size)(input_desc)
x_desc = layers.GRU(rnn_units)(emb_desc)
dense_desc = layers.Dense(1, activation="relu")(x_desc)

emb_ngbr = layers.Embedding(input_dim=vocab_size, output_dim=emb_size)(input_ngbr)
x_ngbr = layers.GRU(rnn_units)(emb_ngbr)
dense_ngbr = layers.Dense(1, activation="relu")(x_ngbr)

# `concatenate` and `Dense` are not imported as bare names; use the `layers` module.
concat = layers.concatenate([dense_desc, dense_ngbr])
norm = layers.BatchNormalization()(concat)
dense_full = layers.Dense(128, activation="relu")(norm)
#dense_full = Dense(64, activation="relu")(dense_full)
output_layer = layers.Dense(1, activation="relu")(dense_full)


model = tf.keras.Model(inputs = [input_desc, input_ngbr], outputs = output_layer)

model.compile(optimizer='adam',
    loss='mse',
    metrics=['mae'])
model.summary()

In [None]:
# tf.convert_to_tensor([zip(train_desc,train_ngbr))
y_train = tf.convert_to_tensor(y_train)

In [None]:
# Wrap the text features (as strings) and the targets (as int64) in tf.data datasets.
# NOTE(review): `train_features` and `train_targets` are not defined in any visible cell - confirm where they come from before running.
train_text_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(train_features.values, tf.string)
) 
train_cat_ds_raw = tf.data.Dataset.from_tensor_slices(
            tf.cast(train_targets.values, tf.int64),

) 


def convert_text_input(sample):
    """Add a trailing axis to `sample`, run it through `vectorize_layer`, and squeeze size-1 axes from the result."""
    expanded = tf.expand_dims(sample, -1)
    vectorized = vectorize_layer(expanded)
    return tf.squeeze(vectorized)

convert_text_input(["what is this misery","what is your story"]).shape

In [None]:
# model = create_model()
# desc_model = create_text_model(train_desc)

# print(desc_model.summary())
# desc_model.compile(optimizer='adam', loss='mse', metrics=['mae'])
# Fit on the two text inputs for 5 epochs without a validation split.
# NOTE(review): the `model` built in the cell above declares two int64 inputs of length `seq_length`, while `train_desc` / `train_ngbr` come straight from DataFrame columns - confirm the inputs are vectorized first.
story = model.fit(
   [train_desc, train_ngbr],
   y_train,
#     validation_split=0.2,
    epochs = 5,
    batch_size = 32,
    verbose =1)