In [1]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv' 

In [2]:
import pandas as pd
import numpy as np


In [3]:
df_raw = pd.read_csv(data)

In [4]:
df_raw

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [5]:
features = ['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']

In [6]:
df = df_raw.copy()

In [7]:
df = df[features]

In [8]:
df.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365
3,Brooklyn,Entire home/apt,40.68514,-73.95976,89,1,270,4.64,1,194
4,Manhattan,Entire home/apt,40.79851,-73.94399,80,10,9,0.1,1,0


In [9]:
df.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [10]:
df.fillna(0,inplace=True)

In [11]:
df.neighbourhood_group.value_counts()

Manhattan        21661
Brooklyn         20104
Queens            5666
Bronx             1091
Staten Island      373
Name: neighbourhood_group, dtype: int64

In [12]:
df.neighbourhood_group.mode()[0]

'Manhattan'

In [13]:
from sklearn.model_selection import train_test_split


In [14]:
df_full_train, df_test = train_test_split(df,test_size = 0.2, random_state=42)

In [15]:
len(df_full_train), len(df_test)

(39116, 9779)

In [16]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=1)

In [17]:
df_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
2777,Brooklyn,Entire home/apt,40.68084,-73.92804,170,3,134,2.0,3,209
26060,Manhattan,Private room,40.8204,-73.93867,50,1,2,0.09,1,0
39854,Brooklyn,Private room,40.69007,-73.94326,40,7,13,2.39,1,93
31667,Manhattan,Private room,40.80931,-73.93982,65,4,8,0.64,4,0
3790,Brooklyn,Entire home/apt,40.71969,-73.9583,500,1,0,0.0,1,0


In [18]:
y_train = df_train.price
y_val = df_val.price
y_test = df_test.price


del df_train['price']
del df_val['price']
del df_test['price']


In [19]:
df_train.columns

Index(['neighbourhood_group', 'room_type', 'latitude', 'longitude',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [20]:
type(y_train)

pandas.core.series.Series

In [21]:
df_train.dtypes == 'int64'

neighbourhood_group               False
room_type                         False
latitude                          False
longitude                         False
minimum_nights                     True
number_of_reviews                  True
reviews_per_month                 False
calculated_host_listings_count     True
availability_365                   True
dtype: bool

In [22]:
strings_num = list(df_train.dtypes[df_train.dtypes != 'object'].index)
strings_cat = list(df_train.dtypes[df_train.dtypes == 'object'].index)

In [23]:
df_train_num = df_train[strings_num]
df_train_cat = df_train[strings_cat]

In [24]:
df_train_num.corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.077832,0.022611,-0.009668,-0.016642,0.018823,-0.008696
longitude,0.077832,1.0,-0.064854,0.057652,0.131194,-0.117221,0.08489
minimum_nights,0.022611,-0.064854,1.0,-0.078089,-0.122739,0.124147,0.142787
number_of_reviews,-0.009668,0.057652,-0.078089,1.0,0.578046,-0.072686,0.179343
reviews_per_month,-0.016642,0.131194,-0.122739,0.578046,1.0,-0.045882,0.166073
calculated_host_listings_count,0.018823,-0.117221,0.124147,-0.072686,-0.045882,1.0,0.224027
availability_365,-0.008696,0.08489,0.142787,0.179343,0.166073,0.224027,1.0


In [25]:
type(df_train_num.corr())

pandas.core.frame.DataFrame

In [26]:
df_train_num[(df_train_num.corr().max())&(df_train_num.corr()<1)]

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
2777,,,,,,,
26060,,,,,,,
39854,,,,,,,
31667,,,,,,,
3790,,,,,,,
...,...,...,...,...,...,...,...
37462,,,,,,,
35693,,,,,,,
2086,,,,,,,
26057,,,,,,,


In [27]:
type(df_train_num)

pandas.core.frame.DataFrame

In [28]:
y_train 

2777     170
26060     50
39854     40
31667     65
3790     500
        ... 
37462    100
35693    154
2086     179
26057     74
6559      90
Name: price, Length: 29337, dtype: int64

In [29]:
ydf_train = pd.DataFrame(y_train)
ydf_val = pd.DataFrame(y_val)


In [30]:
ydf_train

Unnamed: 0,price
2777,170
26060,50
39854,40
31667,65
3790,500
...,...
37462,100
35693,154
2086,179
26057,74


In [31]:
ydf_train['above_average'] = ydf_train.price.apply(lambda x: 1 if x >= 152 else  0 )
ydf_val['above_average'] = ydf_val.price.apply(lambda x: 1 if x >= 152 else  0 )

In [32]:
ydf_val

Unnamed: 0,price,above_average
2156,275,1
44936,160,1
31942,150,0
29084,200,1
39176,70,0
...,...,...
22121,51,0
39478,46,0
5795,73,0
9905,150,0


In [33]:
y_train_binary = ydf_train.above_average.values
y_val_binary = ydf_val.above_average.values

In [34]:
y_train_binary

array([1, 0, 0, ..., 1, 0, 0], dtype=int64)

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
mutual_info_score(df_train_cat.neighbourhood_group,ydf_train['above_average'])

0.0456848367671059

In [37]:
mutual_info_score(df_train_cat.room_type,ydf_train['above_average'])

0.14257297658111734

In [38]:
 def mutual_info_score_abav(series):
        return mutual_info_score(series, ydf_train['above_average'])

In [39]:
round(df_train_cat[strings_cat].apply(mutual_info_score_abav),2)

neighbourhood_group    0.05
room_type              0.14
dtype: float64

In [40]:
from sklearn.feature_extraction import DictVectorizer

In [41]:
dv = DictVectorizer(sparse=False)

train_dict = df_train[strings_num + strings_cat].to_dict(orient='records')
train_val = df_val[strings_num + strings_cat].to_dict(orient='records')

In [42]:
train_val[:3]

[{'latitude': 40.69248,
  'longitude': -73.96983,
  'minimum_nights': 4,
  'number_of_reviews': 81,
  'reviews_per_month': 1.07,
  'calculated_host_listings_count': 2,
  'availability_365': 310,
  'neighbourhood_group': 'Brooklyn',
  'room_type': 'Entire home/apt'},
 {'latitude': 40.81396,
  'longitude': -73.94488,
  'minimum_nights': 6,
  'number_of_reviews': 8,
  'reviews_per_month': 5.22,
  'calculated_host_listings_count': 1,
  'availability_365': 43,
  'neighbourhood_group': 'Manhattan',
  'room_type': 'Entire home/apt'},
 {'latitude': 40.63074,
  'longitude': -73.95849,
  'minimum_nights': 4,
  'number_of_reviews': 1,
  'reviews_per_month': 0.08,
  'calculated_host_listings_count': 1,
  'availability_365': 20,
  'neighbourhood_group': 'Brooklyn',
  'room_type': 'Entire home/apt'}]

In [43]:
X_train = dv.fit_transform(train_dict)


val_dict = df_val[strings_num + strings_cat].to_dict(orient='records')
X_val = dv.transform(val_dict)

In [44]:
X_train[0]

array([209.     ,   3.     ,  40.68084, -73.92804,   3.     ,   0.     ,
         1.     ,   0.     ,   0.     ,   0.     , 134.     ,   2.     ,
         1.     ,   0.     ,   0.     ])

In [45]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

In [46]:
from sklearn.linear_model import LogisticRegression

In [56]:
model = LogisticRegression(solver="liblinear", C=1.0, random_state=42)

In [57]:
model.fit(X_train, y_train_binary)

LogisticRegression(random_state=42, solver='liblinear')

In [58]:
model.coef_[0].round(3)

array([ 3.000e-03,  4.000e-03, -5.911e+00, -3.218e+00, -1.300e-02,
       -1.760e-01,  1.770e-01,  1.608e+00, -6.000e-03, -1.693e+00,
       -3.000e-03, -5.400e-02,  1.952e+00, -8.390e-01, -1.202e+00])

In [59]:
y_pred = model.predict_proba(X_val)[:, 1]

In [60]:
y_pred

array([0.53271482, 0.47532141, 0.463219  , ..., 0.3789714 , 0.2402133 ,
       0.35802305])

In [61]:
y_val_binary.mean()


0.3040188158298395

In [62]:
y_pred.mean()

0.30525622113713

In [63]:
price_level = (y_pred>= 0.5)

In [64]:
(y_val_binary == price_level).mean()

0.7948665507720626

In [65]:
df_chck = pd.DataFrame()

In [85]:
df_chck['y_val_bin'] = y_val_binary
df_chck['y_pred'] = y_pred
df_chck['y_pred_bin']= (df_chck.y_pred>=0.5).astype(int)
df_chck['true'] = (df_chck['y_val_bin'] == df_chck['y_pred_bin']).astype(int)

In [87]:
df_chck.describe()

Unnamed: 0,y_val_bin,y_pred,true,y_pred_bin
count,9779.0,9779.0,9779.0,9779.0
mean,0.304019,0.305256,0.794867,0.29093
std,0.460014,0.273983,0.40382,0.454214
min,0.0,2e-06,0.0,0.0
25%,0.0,0.04889,1.0,0.0
50%,0.0,0.225064,1.0,0.0
75%,1.0,0.557821,1.0,1.0
max,1.0,0.966042,1.0,1.0


In [125]:
for series in df_train:
    new_df_train = df_train.copy()
    new_df_val = df_val.copy()
    new_df_train = new_df_train.drop(series,1)
    new_df_val = new_df_val.drop(series,1)
    train_dict = new_df_train.to_dict(orient='records')
    val_dict =new_df_val.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dict)
    X_val = dv.transform(val_dict)
    model = LogisticRegression(solver="liblinear", C=1.0, random_state=42)
    model.fit(X_train, y_train_binary)
    y_pred = model.predict_proba(X_val)[:, 1]
    price_level = (y_pred>= 0.5)
    accu = (y_val_binary == price_level).mean()
    print(f"accuracy without {series} is: {accu}. Difference vs full-features-train is: {abs(accu-0.7948665507720626)}")
        

accuracy without neighbourhood_group is: 0.7457817772778402. Difference vs full-features-train is: 0.049084773494222356
accuracy without room_type is: 0.7374987217506902. Difference vs full-features-train is: 0.057367829021372385
accuracy without latitude is: 0.7909806728704366. Difference vs full-features-train is: 0.003885877901625978
accuracy without longitude is: 0.7907761529808774. Difference vs full-features-train is: 0.004090397791185141
accuracy without minimum_nights is: 0.7951733306064015. Difference vs full-features-train is: 0.00030677983433891054
accuracy without number_of_reviews is: 0.793639431434707. Difference vs full-features-train is: 0.0012271193373555311
accuracy without reviews_per_month is: 0.7954801104407404. Difference vs full-features-train is: 0.0006135596686778211
accuracy without calculated_host_listings_count is: 0.7945597709377237. Difference vs full-features-train is: 0.00030677983433891054
accuracy without availability_365 is: 0.7865834952449126. Differ

In [122]:
new_df_train

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
2777,Brooklyn,Entire home/apt,40.68084,-73.92804,3,134,2.00,3,209
26060,Manhattan,Private room,40.82040,-73.93867,1,2,0.09,1,0
39854,Brooklyn,Private room,40.69007,-73.94326,7,13,2.39,1,93
31667,Manhattan,Private room,40.80931,-73.93982,4,8,0.64,4,0
3790,Brooklyn,Entire home/apt,40.71969,-73.95830,1,0,0.00,1,0
...,...,...,...,...,...,...,...,...,...
37462,Brooklyn,Entire home/apt,40.69293,-73.90537,1,4,0.48,1,35
35693,Manhattan,Entire home/apt,40.73917,-73.99412,4,7,0.77,1,0
2086,Manhattan,Entire home/apt,40.76662,-73.99302,3,209,2.70,1,202
26057,Brooklyn,Entire home/apt,40.68462,-73.93724,4,1,0.05,1,0


In [142]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [132]:
y_train_log

2777     5.141664
26060    3.931826
39854    3.713572
31667    4.189655
3790     6.216606
           ...   
37462    4.615121
35693    5.043425
2086     5.192957
26057    4.317488
6559     4.510860
Name: price, Length: 29337, dtype: float64

In [133]:
from sklearn.linear_model import Ridge
model = Ridge()

In [137]:
model.fit(X_train, y_train_log)

Ridge()

In [139]:
y_pred = model.predict(X_val)

In [140]:
y_pred

array([5.01074603, 5.1851424 , 5.05577026, ..., 5.03188429, 4.60941028,
       5.00384015])

In [141]:
def rmse(y, y_pred):
    error = y_pred - y
    mse = (error ** 2).mean()
    return np.sqrt(mse)

In [143]:
rmse(y_val_log,y_pred)

0.511839889225903

In [151]:
for alfa in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alfa)
    model.fit(X_train, y_train_log)
    y_pred = model.predict(X_val)
    print(f"rmse with alpha =  {alfa:.3f} \t is: \t {round(rmse(y_val_log,y_pred),3)}")

rmse with alpha =  0.000 	 is: 	 0.512
rmse with alpha =  0.010 	 is: 	 0.512
rmse with alpha =  0.100 	 is: 	 0.512
rmse with alpha =  1.000 	 is: 	 0.512
rmse with alpha =  10.000 	 is: 	 0.513
