In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [66]:
# Only these columns are read from the CSV; everything else in the file is ignored.
columns = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'price',
]

df = pd.read_csv('bnb.csv', usecols=columns)
# Missing reviews_per_month values are replaced with 0.
df['reviews_per_month'] = df['reviews_per_month'].fillna(0)

In [67]:
# Inspect the raw price column.
df['price']

0        149
1        225
2        150
3         89
4         80
        ... 
48890     70
48891     40
48892    115
48893     55
48894     90
Name: price, Length: 48895, dtype: int64

In [68]:
# log(1 + price) of every listing.
price_logs = np.log1p(df['price'])


In [69]:
# Display the log1p-transformed prices computed in the previous cell.
price_logs

0        5.010635
1        5.420535
2        5.017280
3        4.499810
4        4.394449
           ...   
48890    4.262680
48891    3.713572
48892    4.753590
48893    4.025352
48894    4.510860
Name: price, Length: 48895, dtype: float64

In [70]:
from sklearn.model_selection import train_test_split

# 60/20/20 split: hold out 20% as the test set, then take 25% of the
# remaining 80% as the validation set. The fixed seed keeps it reproducible.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=11,
)
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=11,
)




In [71]:
# Row counts of the train / test / validation partitions (in that order).
tuple(len(part) for part in (df_train, df_test, df_val))

(29337, 9779, 9779)

In [106]:

# Binary target per partition: 1 when the listing is in Manhattan, else 0.
y_train, y_test, y_val = (
    part.neighbourhood_group.eq('Manhattan').astype('int').values
    for part in (df_train, df_test, df_val)
)

In [115]:
# Remove the column the target was derived from so it cannot be used as a feature.
# `drop(..., errors='ignore')` replaces `del`: `del` raises KeyError when the cell
# is executed a second time, whereas this form is safe to re-run.
df_train = df_train.drop(columns=['neighbourhood_group'], errors='ignore')
df_val = df_val.drop(columns=['neighbourhood_group'], errors='ignore')
df_test = df_test.drop(columns=['neighbourhood_group'], errors='ignore')

df_train

Unnamed: 0,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
44736,40.73070,-73.98299,Entire home/apt,120,365,4,1.90,5,305
29386,40.73767,-73.95732,Private room,60,6,3,0.16,1,0
39619,40.67683,-73.94702,Private room,30,5,2,0.32,1,220
8515,40.77323,-73.95567,Private room,85,20,2,0.06,3,128
11409,40.67381,-73.96496,Private room,115,2,2,0.92,1,362
...,...,...,...,...,...,...,...,...,...
47088,40.77300,-73.94759,Private room,90,8,0,0.00,2,77
36561,40.75049,-73.99809,Entire home/apt,69,1,2,0.24,1,1
14670,40.67170,-73.99247,Entire home/apt,295,4,2,0.07,1,326
42505,40.72141,-73.98866,Entire home/apt,200,3,8,2.03,1,319


In [124]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text

In [125]:
# One dict per training row (NaNs replaced with 0), the input format DictVectorizer expects.
train_dicts = df_train.fillna(0).to_dict(orient='records')


In [126]:
# Dense output (sparse=False) so x_train is a plain NumPy matrix.
dv = DictVectorizer(sparse=False)
# Learn the feature mapping from the training records and encode them in one step.
x_train = dv.fit_transform(train_dicts)

In [119]:
# `DictVectorizer.get_feature_names()` was deprecated in scikit-learn 1.0 and
# removed in 1.2. The fitted `feature_names_` attribute holds the same list and
# is available in both old and new releases.
dv.feature_names_

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'price',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [120]:
# Sanity check: number of training rows and the full feature-matrix shape.
(len(x_train), x_train.shape)


(29337, (29337, 11))

In [121]:
# Sanity check: the target vector must have one entry per training row.
(len(y_train), y_train.shape)



(29337, (29337,))

In [122]:
# Depth-1 tree (a single split), used as a simple baseline.
dt = DecisionTreeClassifier(max_depth=1)
# Fit on `y_train`: the previously used `y_train1` is not defined anywhere in the
# notebook, so the cell raised NameError after a kernel restart.
dt.fit(x_train, y_train)


DecisionTreeClassifier(max_depth=1)

In [127]:
# Probability of the positive class (column 1) for every training row.
train_proba = dt.predict_proba(x_train)
y_pred = train_proba[:, 1]

# ROC AUC of the depth-1 tree on the data it was trained on.
auc = roc_auc_score(y_train, y_pred)
print('train:', auc)


train: 0.8373221428103076


In [64]:
# Column dtypes of the loaded frame.
df.dtypes


neighbourhood_group                object
latitude                          float64
longitude                         float64
room_type                          object
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

In [130]:
# Print the learned split as text. Feature names come from the fitted
# `feature_names_` attribute because `get_feature_names()` was removed in
# scikit-learn 1.2.
print(export_text(dt, feature_names=dv.feature_names_))

|--- latitude <= 40.72
|   |--- class: 0
|--- latitude >  40.72
|   |--- class: 1



In [27]:
# Inspect the column the binary target is derived from.
df['neighbourhood_group']

0         Brooklyn
1        Manhattan
2        Manhattan
3         Brooklyn
4        Manhattan
           ...    
48890     Brooklyn
48891     Brooklyn
48892    Manhattan
48893    Manhattan
48894    Manhattan
Name: neighbourhood_group, Length: 48895, dtype: object

In [30]:
# Number of distinct neighbourhood groups in the data.
df['neighbourhood_group'].nunique()

5

In [100]:
# Display the full loaded frame (pandas truncates the rendering).
df


Unnamed: 0,neighbourhood_group,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Manhattan,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,Manhattan,40.80902,-73.94190,Private room,150,3,0,0.00,1,365
3,Brooklyn,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Manhattan,40.79851,-73.94399,Entire home/apt,80,10,9,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...
48890,Brooklyn,40.67853,-73.94995,Private room,70,2,0,0.00,2,9
48891,Brooklyn,40.70184,-73.93317,Private room,40,4,0,0.00,2,36
48892,Manhattan,40.81475,-73.94867,Entire home/apt,115,10,0,0.00,1,27
48893,Manhattan,40.75751,-73.99112,Shared room,55,1,0,0.00,6,2


In [134]:
from sklearn.ensemble import RandomForestClassifier

In [135]:
# Small forest of 10 trees; fixed seed for reproducibility, all CPU cores.
rf = RandomForestClassifier(
    n_estimators=10,
    random_state=1,
    n_jobs=-1,
)
rf.fit(x_train, y_train)


RandomForestClassifier(n_estimators=10, n_jobs=-1, random_state=1)

In [137]:
# Encode the validation rows with the vectorizer fitted on the training data
# (transform only, no re-fitting).
val_dicts = df_val.fillna(0).to_dict(orient='records')
x_val = dv.transform(val_dicts)

# Positive-class probabilities from the random forest.
y_pred = rf.predict_proba(x_val)[:, 1]

In [144]:
# Validation ROC AUC of the random forest.
# NOTE(review): the near-perfect score is presumably because latitude/longitude
# almost fully determine the borough — confirm this is the intended task.
roc_auc_score(y_true=y_val, y_score=y_pred)

0.9999969823663943

In [None]:
# Grid over tree depth and forest size; each entry of `scores` is
# (max_depth, n_estimators, validation AUC).
depth_grid = [10, 15, 20, 25]
tree_counts = range(10, 201, 10)

scores = []
for depth in depth_grid:
    for n_trees in tree_counts:
        rf = RandomForestClassifier(
            n_estimators=n_trees,
            max_depth=depth,
            random_state=1,
            n_jobs=-1,
        )
        rf.fit(x_train, y_train)
        y_pred = rf.predict_proba(x_val)[:, 1]
        auc = roc_auc_score(y_val, y_pred)
        scores.append((depth, n_trees, auc))


In [None]:
# Tabulate the grid-search results, one row per (max_depth, n_estimators) pair.
columns = ['max_depth', 'n_estimators', 'auc']
df_scores = pd.DataFrame(data=scores, columns=columns)
df_scores

In [None]:
# Gradient boosting with XGBoost, learning rate eta = 0.3.
import xgboost as xgb

# `get_feature_names()` was removed in scikit-learn 1.2; the fitted
# `feature_names_` attribute is the same list and works on every version.
features = dv.feature_names_

dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    # The XGBoost parameter is `nthread`; the previous key `nthreads` is not a
    # recognised parameter, so the thread count was never applied.
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = model.predict(dval)
# Validation ROC AUC after 10 boosting rounds.
roc_auc_score(y_val, y_pred)

In [None]:
# Gradient boosting with XGBoost, learning rate eta = 0.1.
import xgboost as xgb

# `get_feature_names()` was removed in scikit-learn 1.2; the fitted
# `feature_names_` attribute is the same list and works on every version.
features = dv.feature_names_

dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    # The XGBoost parameter is `nthread`; the previous key `nthreads` is not a
    # recognised parameter, so the thread count was never applied.
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = model.predict(dval)
# Validation ROC AUC after 10 boosting rounds.
roc_auc_score(y_val, y_pred)

In [None]:
# Gradient boosting with XGBoost, learning rate eta = 0.01.
import xgboost as xgb

# `get_feature_names()` was removed in scikit-learn 1.2; the fitted
# `feature_names_` attribute is the same list and works on every version.
features = dv.feature_names_

dtrain = xgb.DMatrix(x_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(x_val, label=y_val, feature_names=features)

xgb_params = {
    'eta': 0.01,
    'max_depth': 6,
    'min_child_weight': 1,

    'objective': 'binary:logistic',
    # The XGBoost parameter is `nthread`; the previous key `nthreads` is not a
    # recognised parameter, so the thread count was never applied.
    'nthread': 8,

    'seed': 1,
    'verbosity': 1,
}

model = xgb.train(xgb_params, dtrain, num_boost_round=10)
y_pred = model.predict(dval)
# Validation ROC AUC after 10 boosting rounds.
roc_auc_score(y_val, y_pred)