In [1]:
from keras.models import Model
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from tensorflow.keras.optimizers import Adam

Using TensorFlow backend.


In [2]:
!unzip /content/268833_611395_compressed_AB_NYC_2019.csv.zip

Archive:  /content/268833_611395_compressed_AB_NYC_2019.csv.zip
replace AB_NYC_2019.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: AB_NYC_2019.csv         


In [3]:
# Load the CSV extracted by the unzip cell into a DataFrame (path is relative to the working directory).
df = pd.read_csv('AB_NYC_2019.csv')

In [4]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [5]:
df.shape

(48895, 16)

In [6]:
# Create the encoder objects; nothing is fitted or transformed in this cell.
# NOTE(review): only label_encoder is used in the cells shown here; `encoder` (OneHotEncoder) appears unused — confirm.

encoder = OneHotEncoder()
label_encoder = LabelEncoder()

In [7]:
# Drop (in place) every row whose last_review is NaN — rows are removed, not filled with 0

df.dropna(subset=['last_review'], inplace=True)

In [8]:
# Drop (in place) any row that still has a NaN in any column
df.dropna(inplace=True)

In [9]:
# Text-valued columns that the label-encoding loop later converts to integer codes
words = ['name', 'host_name', 'neighbourhood_group', 'neighbourhood', 'room_type']


In [10]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0
5,5099,Large Cozy 1 BR Apartment In Midtown East,7322,Chris,Manhattan,Murray Hill,40.74767,-73.975,Entire home/apt,200,3,74,2019-06-22,0.59,1,129


In [11]:
df.isna().sum()

id                                0
name                              0
host_id                           0
host_name                         0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
last_review                       0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [12]:
# Remove the last_review date column so it is not carried into the feature matrix.
# errors='ignore' keeps the cell idempotent: without it, running the cell a second
# time raises KeyError because the column has already been dropped.
df.drop(columns='last_review', inplace=True, errors='ignore')

In [13]:
# Replace each text column with integer category codes.
# The single shared label_encoder is refit on every column, so after the loop
# it only holds the mapping of the last column in `words`.
for text_col in words:
  codes = label_encoder.fit_transform(df[text_col])
  df[text_col] = codes.astype(int)

In [14]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,9933,2787,4303,1,107,40.64749,-73.97237,1,149,1,9,0.21,6,365
1,2595,30014,2845,4120,2,126,40.75362,-73.98377,0,225,1,45,0.38,2,355
3,3831,11945,4869,5363,1,41,40.68514,-73.95976,0,89,1,270,4.64,1,194
4,5022,15102,7192,5114,2,61,40.79851,-73.94399,0,80,10,9,0.1,1,0
5,5099,19649,7322,1666,2,136,40.74767,-73.975,0,200,3,74,0.59,1,129


In [15]:
# Hold out 4.5% of the rows as the test set; random_state pins the shuffle.
train, test = train_test_split(df, random_state=42, test_size=0.045)

print(f'train: {train.shape}')
print(f'test: {test.shape}\n')

# Separate the target column ('price') from the remaining feature columns.
y_train, y_test = train['price'], test['price']
X_train, X_test = train.drop(columns='price'), test.drop(columns='price')

print(f'X_train shape: {X_train.shape}')
print(f'y_train shape: {y_train.shape}')
print(f'X_test shape: {X_test.shape}')
print(f'y_test shape: {y_test.shape}')

train: (37074, 15)
test: (1747, 15)

X_train shape: (37074, 14)
y_train shape: (37074,)
X_test shape: (1747, 14)
y_test shape: (1747,)


In [16]:
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,9933,2787,4303,1,107,40.64749,-73.97237,1,149,1,9,0.21,6,365
1,2595,30014,2845,4120,2,126,40.75362,-73.98377,0,225,1,45,0.38,2,355
3,3831,11945,4869,5363,1,41,40.68514,-73.95976,0,89,1,270,4.64,1,194
4,5022,15102,7192,5114,2,61,40.79851,-73.94399,0,80,10,9,0.1,1,0
5,5099,19649,7322,1666,2,136,40.74767,-73.975,0,200,3,74,0.59,1,129


In [17]:
# Standardize the training features: StandardScaler rescales each column to
# zero mean and unit variance, with statistics fitted on X_train only.
# NOTE(review): X_test is not transformed in the cells shown here; it should go
# through sc.transform(X_test) (not fit_transform) before evaluation — confirm a later cell does this.
sc = StandardScaler()
X = sc.fit_transform(X_train)

In [28]:
# Feed-forward network: two ReLU hidden layers (32 -> 16 units) and a single linear output unit

inputs = X.shape[1]  # number of feature columns in the scaled training matrix

model = Sequential([
    Dense(32, activation='relu', input_dim=inputs),
    Dense(16, activation='relu'),
    Dense(1),
])

# Mean-squared-error loss with the RMSprop optimizer; mean absolute error is reported as a metric
model.compile(optimizer='RMSprop', loss='mse', metrics=['mae'])

# Train for 10 epochs in mini-batches of 32; the returned History object is the cell's output
model.fit(X, y_train, batch_size=32, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f7aee8815f8>

In [23]:
model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_12 (Dense)             (None, 32)                480       
_________________________________________________________________
dense_13 (Dense)             (None, 16)                528       
_________________________________________________________________
dense_14 (Dense)             (None, 1)                 17        
Total params: 1,025
Trainable params: 1,025
Non-trainable params: 0
_________________________________________________________________
