In [0]:
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pandas as pd
import nltk
import pickle
import tensorflow as tf
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn import datasets, linear_model
from sklearn import svm


# **Loading data from Github repo**

In [2]:
names = ['id',	'name',	'host_id',	'host_name',	'neighbourhood_group',	'neighbourhood',	'latitude',	
         'longitude',	'room_type',	'price',	'minimum_nights',	'number_of_reviews',	'last_review',	
         'reviews_per_month',	'calculated_host_listings_count',	'availability_365']
df = pd.read_csv('https://raw.githubusercontent.com/lmxy0212/ML_project/master/new-york-city-airbnb-open-data/AB_NYC_2019.csv',
                names=names,na_values='?',header=None)
df = df.dropna()
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
1,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
2,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
4,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
5,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0


# **Formating data to be np array**

In [3]:
airbnb_name = np.array(df['name'][1:])
reviews_per_month = np.array(df['reviews_per_month'][1:]).astype(np.float)
n = np.shape(airbnb_name)[0]
print(n)
print(airbnb_name)
print(np.array(reviews_per_month))
print(np.shape(airbnb_name))
print(np.shape(reviews_per_month))
print(np.sort(reviews_per_month)[-50])
print(np.mean(reviews_per_month), np.max(reviews_per_month), np.median(reviews_per_month))


38821
['Clean & quiet apt home by the park' 'Skylit Midtown Castle'
 'Cozy Entire Floor of Brownstone' ... 'Seas The Moment'
 '1B-1B apartment near by Metro' 'Cozy Private Room in Bushwick, Brooklyn']
[0.21 0.38 4.64 ... 1.   2.   1.  ]
(38821,)
(38821,)
11.16
1.3732291800829448 58.5 0.72


# **Creating bag of words for each airbnb name**

In [4]:
vectorizer = CountVectorizer(analyzer='word')
bow = vectorizer.fit(airbnb_name)
bow = vectorizer.transform(airbnb_name)
bow

<38821x6961 sparse matrix of type '<class 'numpy.int64'>'
	with 222889 stored elements in Compressed Sparse Row format>

CountVectorizer takes the words of each sentence and creates a vocabulary of all the unique words in the sentences. This vocabulary can then be used to create a feature vector of the count of the words.
CountVectorizer performs tokenization which separates the sentences into a set of tokens. It additionally removes punctuation and special characters and can apply other preprocessing to each word. 
We can use customed tokenizer from the NLTK library with the CountVectorizer or use any number of the customizations which you can explore to improve the performance of your model

In [0]:
# print(bow)
# # (seq_num, feature_num)  count
# print(vectorizer.vocabulary_.get("great")) #feature number of "great"

In [5]:
X = bow.toarray()
print(X)
print(np.shape(X))

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
(38821, 6961)


# **Train Test Split**

In [9]:
Xtr, Xts, ytr, yts = train_test_split(X, reviews_per_month, test_size = 1/3)
print("Xtr:\n", Xtr, "\nXts:\n",Xts,"\nytr:\n", ytr, "\nyts:\n", yts)
print("\nXtr.shape:", Xtr.shape, "Xts.shape", Xts.shape)
print("\nytr.shape:", ytr.shape, "yts.shape:", yts.shape)
ytr = ytr.astype(np.float)
yts = yts.astype(np.float)

Xtr:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 
Xts:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]] 
ytr:
 [0.12 0.18 0.43 ... 0.73 0.94 0.64] 
yts:
 [0.03 2.07 2.   ... 4.86 1.02 0.06]

Xtr.shape: (25880, 6961) Xts.shape (12941, 6961)

ytr.shape: (25880,) yts.shape: (12941,)


## **Using linear regression to fit the data and saving in linear_reg.p**

In [111]:
regr = linear_model.LinearRegression()
regr.fit(Xtr,ytr)
print(regr.intercept_)
print(regr.coef_)


0.7393837097616061
[ 1.18616409e+13 -1.31445312e+00 -2.49511719e+00 ...  0.00000000e+00
  0.00000000e+00  1.70352554e+00]


In [0]:
# with open( "linear_reg.p", "wb" ) as fp:
#     pickle.dump( [regr, Xtr, ytr, Xts, yts],  fp)

In [113]:
# with open( "linear_reg.p", "rb" ) as fp:
#     regr, Xtr, ytr, Xts, yts = pickle.load(fp)
ytr_pred = regr.predict(Xtr)
lossm = np.linalg.norm(ytr_pred - ytr)**2
lossm = (1/n) * np.linalg.norm(ytr_pred - ytr)**2
print("Average loss on training data:",lossm)
yts_pred = regr.predict(Xts)
lossm = (1/n) * np.linalg.norm(yts_pred - yts)**2
print("Average loss on test data:", lossm)


Average loss on training data: 1.3590508488822457
Average loss on test data: 3.8979136227124014e+23


# **Using Binary Classification**

In [0]:
# if review_per_month > 10, we classify the airbnb as popular
ytr = (ytr > 10).astype(int)
yts = (yts > 10).astype(int)

##**Setting base line** 

In [12]:
yhat_zeros = np.zeros(np.shape(Xtr)[0])
acc_zeros = np.mean(yhat_zeros == ytr)
print('Accuaracy for all zeros for training set = {0:f}'.format(acc_zeros))
yhat_zeros = np.zeros(np.shape(Xts)[0])
acc_zeros = np.mean(yhat_zeros == yts)
print('Accuaracy for all zeros for test set = {0:f}'.format(acc_zeros))

Accuaracy for all zeros for training set = 0.998145
Accuaracy for all zeros for test set = 0.997450


## **Logistics Regression**

In [0]:
from sklearn import preprocessing
Xtr = preprocessing.normalize(Xtr)
Xts = preprocessing.normalize(Xts)

In [60]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LogisticRegression

tscv = TimeSeriesSplit(n_splits=10)

tuned_parameters = { 'C': [10**-4, 10**-2, 10**-1, 10**0, 10**1, 10**2, 10**4],
              'penalty':['l1','l2']}
model_precision = RandomizedSearchCV(LogisticRegression(n_jobs = -1), tuned_parameters, cv = tscv,
                     scoring = "precision", n_jobs = -1)
model_precision.fit(Xtr, ytr)

print("Best C and penalty",model_precision.best_params_)
print("precision on train data",model_precision.best_score_*100)



Best C and penalty {'penalty': 'l2', 'C': 10000}
precision on train data 4.019230769230769


In [63]:
logreg = linear_model.LogisticRegression(solver='lbfgs',\
                                         multi_class='multinomial')
logreg.fit(Xtr,ytr)
# with open( "logreg.p", "wb" ) as fp:
#     pickle.dump( [logreg, Xtr, ytr, Xts, yts],  fp)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='multinomial', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [64]:
# with open( "logreg.p", "rb" ) as fp:
#     logreg, Xtr, ytr, Xts, yts = pickle.load(fp)

yhat_tr = logreg.predict(Xtr)
acc_tr = np.mean(yhat_tr == ytr)
print('Accuaracy on training set = {0:f}'.format(acc_tr))

yhat = logreg.predict(Xts)
acc = np.mean(yhat == yts)
print('Accuaracy on test set = {0:f}'.format(acc))

Accuaracy on training set = 0.998145
Accuaracy on test set = 0.997450


In [11]:
coef = logreg.coef_
sorted_coef = np.sort(coef)
top10 = sorted_coef[:,:20]
top10 = np.reshape(top10,(20,))
# print(top10)
print(np.shape(coef))
inds = np.where(coef == top10)
# print(np.reshape(coef,(6961,)))
inds = [np.where(coef==ind) for ind in top10]
ind = np.where(coef==top10[0])
# print(inds)
word_list = [vectorizer.get_feature_names()[int(i[1])] for i in inds]
print(word_list)

(1, 6961)
['subway', 'train', '15', 'spacious', 'large', 'and', 'sunny', 'house', 'master', 'astoria', 'apt', 'bath', 'village', 'br', 'location', 'loft', 'modern', 'midtown', 'charming', 'on']


  import sys


## **SVM**

In [65]:
svc = svm.SVC(probability=False,  kernel="rbf")
svc.fit(Xtr,ytr)
# with open( "svm.p", "wb" ) as fp:
#     pickle.dump( [svc, Xtr, ytr, Xts, yts],  fp)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [66]:
# with open( "svm.p", "rb" ) as fp:
#     svc, Xtr, ytr, Xts, yts = pickle.load(fp)


yhat_tr = svc.predict(Xtr)
acc_tr = np.mean(yhat_tr == ytr)
print('Accuaracy on training set = {0:f}'.format(acc_tr))

yhat = svc.predict(Xts)
acc = np.mean(yhat == yts)
print('Accuaracy on test set = {0:f}'.format(acc))

Accuaracy on training set = 0.998145
Accuaracy on test set = 0.997450


## **Neural Network**

In [0]:
import tensorflow as tf
import tensorflow.keras.backend as K
from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix

from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.preprocessing import text, sequence
from keras import utils
K.clear_session()

In [16]:
batch_size = 10
epochs = 20
model = Sequential()
model.add(Dense(200, input_shape=(np.shape(Xtr)[1],)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('softmax'))
model.summary()


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 200)               1392400   
_________________________________________________________________
activation_1 (Activation)    (None, 200)               0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 402       
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 1,392,802
Trainable params: 1,392,802
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
history = model.fit(Xtr, ytr,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(Xts,yts))

Train on 25880 samples, validate on 12941 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [18]:
score, acc = model.evaluate(Xtr, ytr, verbose=1)
print("Accuaracy on traing set = %f" % acc)
score, acc1 = model.evaluate(Xts, yts, verbose=1)
print("Accuaracy on test set = %f" % acc1)

Accuaracy on traing set = 0.999498
Accuaracy on test set = 0.996909


# **Keras BOW**

In [0]:
K.clear_session()

In [45]:
X1 = airbnb_name
y1 = (reviews_per_month > 10).astype(int)

Xtr1, Xts1, ytr1, yts1 = train_test_split(X1, y1, test_size = 1/3)

max_words = 1000
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

tokenize.fit_on_texts(Xtr1)
print("word index:\n",tokenize.word_index)

x_train = tokenize.texts_to_matrix(Xtr1)
x_test = tokenize.texts_to_matrix(Xts1)
print('x_train shape:', x_train.shape)
print('y_train shape:', ytr1.shape)
ytr1

word index:
 {'in': 1, 'room': 2, 'bedroom': 3, 'private': 4, 'apartment': 5, 'cozy': 6, 'apt': 7, 'brooklyn': 8, '1': 9, 'studio': 10, 'to': 11, 'the': 12, 'spacious': 13, '2': 14, 'manhattan': 15, 'park': 16, 'with': 17, 'sunny': 18, 'of': 19, 'east': 20, 'and': 21, 'williamsburg': 22, 'beautiful': 23, 'near': 24, 'village': 25, 'nyc': 26, 'loft': 27, 'bed': 28, 'heart': 29, 'large': 30, 'a': 31, 'w': 32, 'home': 33, 'central': 34, 'modern': 35, 'bright': 36, 'from': 37, 'location': 38, 'new': 39, 'charming': 40, 'luxury': 41, 'west': 42, '1br': 43, 'bushwick': 44, 'side': 45, 'brownstone': 46, 'upper': 47, 'quiet': 48, 'one': 49, '3': 50, 'great': 51, 'br': 52, 'for': 53, 'clean': 54, 'midtown': 55, 'harlem': 56, 'close': 57, 'garden': 58, 'subway': 59, 'square': 60, 'on': 61, 'bath': 62, 'huge': 63, 'heights': 64, 'min': 65, 'times': 66, 'duplex': 67, 'prime': 68, 'train': 69, 'house': 70, 'city': 71, 'amazing': 72, '2br': 73, 'renovated': 74, 'by': 75, 'suite': 76, 'lovely': 77, '

array([0, 0, 0, ..., 0, 0, 0])

In [0]:
batch_size = 10
epochs = 20
model = Sequential()
model.add(Dense(200, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2))
model.add(Activation('softmax'))

model.compile(loss='sparse_categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [48]:
history = model.fit(x_train, ytr1,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_data=(x_test,yts1))


Train on 25880 samples, validate on 12941 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


##Base line

In [46]:
ytr_zeros = np.zeros(np.shape(ytr1)[0])
acc_zeros = np.mean(ytr_zeros == ytr1)
print('Accuaracy for all zeros for training set = {0:f}'.format(acc_zeros))
yts_zeros = np.zeros(np.shape(yts1)[0])
acc_zeros = np.mean(yts_zeros == yts1)
print('Accuaracy for all zeros for test set = {0:f}'.format(acc_zeros))

Accuaracy for all zeros for training set = 0.998145
Accuaracy for all zeros for test set = 0.997450


## Calculated results

In [49]:
score, acc = model.evaluate(x_train, ytr1, verbose=1)
print("Accuaracy on train set = %f" % acc)
score, acc = model.evaluate(x_test, yts1, verbose=1)
print("Accuaracy on test set = %f" % acc)

Accuaracy on train set = 0.999730
Accuaracy on test set = 0.996909
