In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv("../../AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [3]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,2539,2595,3647,3831,5022
name,Clean & quiet apt home by the park,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park
host_id,2787,2845,4632,4869,7192
host_name,John,Jennifer,Elisabeth,LisaRoxanne,Laura
neighbourhood_group,Brooklyn,Manhattan,Manhattan,Brooklyn,Manhattan
neighbourhood,Kensington,Midtown,Harlem,Clinton Hill,East Harlem
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
room_type,Private room,Entire home/apt,Private room,Entire home/apt,Entire home/apt
price,149,225,150,89,80


In [4]:
columns = [
    'neighbourhood_group',
    'room_type',
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365'
]

In [5]:
df = df[columns]

In [6]:
df = df.fillna(0)

In [7]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [8]:
df.head().T

Unnamed: 0,0,1,2,3,4
neighbourhood_group,brooklyn,manhattan,manhattan,brooklyn,manhattan
room_type,private_room,entire_home/apt,private_room,entire_home/apt,entire_home/apt
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
price,149,225,150,89,80
minimum_nights,1,1,3,1,10
number_of_reviews,9,45,0,270,9
reviews_per_month,0.21,0.38,0.0,4.64,0.1
calculated_host_listings_count,6,2,1,1,1
availability_365,365,355,365,194,0


In [9]:
df.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
dtype: object

## EDA /Q1

In [10]:
df['neighbourhood_group'].value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

### data split

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

In [13]:
len(df_full_train), len(df_test)

(39116, 9779)

In [14]:
df_train, df_val = train_test_split(df_full_train, test_size = 0.25, random_state=42)

In [15]:
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [16]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [17]:
y_train = df_train['price']
y_val = df_val['price']
y_test = df_test['price']

In [18]:
y_train_price = df_train['price']
y_val_price = df_val['price']
y_test_price = df_test['price']

In [19]:
del df_train['price']
del df_val['price']
del df_test['price']

### EDA

In [20]:
df_full_train = df_full_train.reset_index(drop=True)

In [21]:
df_full_train.isnull().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [22]:
df_full_train["above_average"] = (df_full_train["price"] >= 152).astype(int) 

In [23]:
df_full_train["above_average"].value_counts(normalize=True)

0    0.693987
1    0.306013
Name: above_average, dtype: float64

In [24]:
above_average_mean = df_full_train["above_average"].mean()
above_average_mean

0.3060128847530422

In [25]:
df_full_train.dtypes

neighbourhood_group                object
room_type                          object
latitude                          float64
longitude                         float64
price                               int64
minimum_nights                      int64
number_of_reviews                   int64
reviews_per_month                 float64
calculated_host_listings_count      int64
availability_365                    int64
above_average                       int32
dtype: object

In [26]:
numerical = ["latitude", "longitude", 
             "minimum_nights", "number_of_reviews", 
             "reviews_per_month", "calculated_host_listings_count", 
             "availability_365"]

In [27]:
categorical = ["neighbourhood_group",
              "room_type"]

In [28]:
df_full_train[categorical].nunique()

neighbourhood_group    5
room_type              3
dtype: int64

In [29]:
df_full_train[numerical].describe()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,39116.0,39116.0,39116.0,39116.0,39116.0,39116.0,39116.0
mean,40.728927,-73.952311,7.144775,23.334978,1.094376,7.188925,113.445368
std,0.054555,0.046029,21.472973,44.499352,1.604249,33.172629,131.84643
min,40.49979,-74.24285,1.0,0.0,0.0,1.0,0.0
25%,40.690067,-73.98308,1.0,1.0,0.04,1.0,0.0
50%,40.723035,-73.95575,3.0,5.0,0.37,1.0,46.0
75%,40.76318,-73.936498,5.0,24.0,1.59,2.0,229.0
max,40.91306,-73.71299,1250.0,629.0,58.5,327.0,365.0


### Feature importance

In [30]:
df_full_train.head()

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,brooklyn,entire_home/apt,40.71577,-73.9553,295,3,11,0.87,1,1,1
1,manhattan,private_room,40.84917,-73.94048,70,2,2,0.16,1,0,0
2,brooklyn,private_room,40.68993,-73.95947,58,2,0,0.0,2,0,0
3,brooklyn,entire_home/apt,40.68427,-73.93118,75,3,87,4.91,1,267,0
4,queens,private_room,40.74705,-73.89564,38,5,13,0.25,1,0,0


In [31]:
brooklyn_price = df_full_train[df_full_train.neighbourhood_group == 
                             "brooklyn"]["above_average"].mean()
brooklyn_price

0.21278578528827038

In [32]:
manhattan_price = df_full_train[df_full_train.neighbourhood_group == 
                             "manhattan"]["above_average"].mean()
manhattan_price

0.4551783659378596

In [33]:
from IPython.display import display

In [34]:
for c in categorical:
    print(c)
    df_group = df_full_train.groupby(c)["above_average"].agg(['mean', 'count'])
    df_group['diff'] = df_group['mean'] - above_average_mean
    df_group['increase_ratio'] = df_group['mean'] / above_average_mean
    display(df_group)
    print()

neighbourhood_group


Unnamed: 0_level_0,mean,count,diff,increase_ratio
neighbourhood_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bronx,0.072917,864,-0.233096,0.23828
brooklyn,0.212786,16096,-0.093227,0.695349
manhattan,0.455178,17380,0.149165,1.487448
queens,0.119366,4482,-0.186647,0.39007
staten_island,0.122449,294,-0.183564,0.400143



room_type


Unnamed: 0_level_0,mean,count,diff,increase_ratio
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
entire_home/apt,0.528999,20380,0.222986,1.728682
private_room,0.063703,17817,-0.24231,0.208172
shared_room,0.05876,919,-0.247253,0.192016





### mutual information

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
mutual_info_score(df_full_train["above_average"], df_full_train["number_of_reviews"])

0.009805871458272327

In [37]:
mutual_info_score(df_full_train["above_average"], df_full_train["room_type"])

0.1423898076642953

In [38]:
mutual_info_score(df_full_train["above_average"], df_full_train["calculated_host_listings_count"])

0.03660486188096422

In [39]:
def mutual_info_ab_score(series):
    if series.isnull().sum() == 0:
        return mutual_info_score(series, df_full_train["above_average"])
    else:
        return 0

In [40]:
mi = df_full_train[categorical].apply(mutual_info_ab_score)
mi.sort_values(ascending=False)

room_type              0.142390
neighbourhood_group    0.046223
dtype: float64

### Correlation

In [41]:
df_full_train[numerical].corrwith(df_full_train["above_average"])

latitude                          0.056281
longitude                        -0.267852
minimum_nights                    0.032320
number_of_reviews                -0.053921
reviews_per_month                -0.055888
calculated_host_listings_count    0.171793
availability_365                  0.102623
dtype: float64

### Q2

In [42]:
df_train.corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


In [43]:
y_train = (y_train > 152).astype(int)
y_val = (y_val > 152).astype(int)
y_test = (y_test > 152).astype(int)

## Q3

In [44]:
round(mutual_info_score(df_train["neighbourhood_group"], y_train), 2)

0.05

In [45]:
round(mutual_info_score(df_train["room_type"], y_train), 2)

0.14

## One hot encoding

In [46]:
from sklearn.feature_extraction import DictVectorizer

In [47]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')

In [48]:
dv = DictVectorizer(sparse=False)

In [49]:
X_train = dv.fit_transform(train_dicts)

In [50]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [51]:
def get_encoded_df(df, dv):
    dicts = df[categorical + numerical].to_dict(orient='records')
    return dv.transform(dicts)

In [52]:
X_val = get_encoded_df(df_val, dv)

In [53]:
X_test = get_encoded_df(df_test, dv)

### Logistic regression

In [54]:
from sklearn.linear_model import LogisticRegression

In [55]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=42)

In [56]:
model.coef_[0].round(3)

array([ 0.003,  0.004, -0.228, -0.095, -0.011, -0.396,  0.124,  1.216,
       -0.817, -0.13 , -0.003, -0.041,  1.646, -1.179, -0.469])

In [57]:
model.intercept_[0]

-0.002606208874445309

In [58]:
model.predict_proba(X_train)

array([[0.61938225, 0.38061775],
       [0.90364173, 0.09635827],
       [0.82173025, 0.17826975],
       ...,
       [0.88621228, 0.11378772],
       [0.9825017 , 0.0174983 ],
       [0.45510561, 0.54489439]])

In [59]:
y_val_predict = model.predict(X_val)

### Q4

In [60]:
round((y_val_predict == y_val).mean(), 2)

0.79

### Model interpretation

In [61]:
dict(zip(dv.get_feature_names(), model.coef_[0]))

{'availability_365': 0.0029614639405289834,
 'calculated_host_listings_count': 0.004331309996018585,
 'latitude': -0.22821604110554924,
 'longitude': -0.09464556237664266,
 'minimum_nights': -0.011460662557817181,
 'neighbourhood_group=bronx': -0.39558707411181826,
 'neighbourhood_group=brooklyn': 0.12408532573887529,
 'neighbourhood_group=manhattan': 1.2157632203713544,
 'neighbourhood_group=queens': -0.8167132688942499,
 'neighbourhood_group=staten_island': -0.13014902314296029,
 'number_of_reviews': -0.0034099823383884394,
 'reviews_per_month': -0.04147831461993476,
 'room_type=entire_home/apt': 1.6456501944939808,
 'room_type=private_room': -1.1790347270848718,
 'room_type=shared_room': -0.46921628744943134}

### Q5

In [62]:
def get_model_performance(df, column_list):
    train_dicts = df[column_list].to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(X_train, y_train)
    y_train_predict = model.predict(X_train)
    accuracy = (y_train_predict == y_train).mean()
    return accuracy

In [63]:
original_accuracy = get_model_performance(df_train, categorical + numerical)
for column in categorical + numerical:
    column_list = categorical + numerical
    column_list.remove(column)
    accuracy = get_model_performance(df_train, column_list)
    print(f"Column {column} is removed, the accuracy difference is {abs(original_accuracy - accuracy)}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column neighbourhood_group is removed, the accuracy difference is 0.04066537137403281


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column room_type is removed, the accuracy difference is 0.07086614173228345


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column latitude is removed, the accuracy difference is 6.81732965197579e-05


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column longitude is removed, the accuracy difference is 0.0002726931860790316


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column minimum_nights is removed, the accuracy difference is 0.0010907727443161264


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column number_of_reviews is removed, the accuracy difference is 0.0008862528547568527


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column reviews_per_month is removed, the accuracy difference is 0.00047721307563819426


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Column calculated_host_listings_count is removed, the accuracy difference is 0.0011930326890957632
Column availability_365 is removed, the accuracy difference is 0.0038177046051062202


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Q6

In [64]:
from sklearn.linear_model import Ridge

In [66]:
y_train_log, y_val_log = np.log1p(y_train_price), np.log1p(y_val_price) 

In [67]:
train_dicts = df_train[categorical + numerical].to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(val_dicts)

In [70]:
from sklearn.metrics import mean_squared_error

for alpha in [0, 0.01, 0.1, 1, 10]:
    model = Ridge(alpha=alpha).fit(X_train, y_train_log)
    y_val_pred = model.predict(X_val)
    rmse = mean_squared_error(y_val_log, y_val_pred, squared=False)
    print(f"For alpha={alpha} rmse is {round(rmse, 3)}")

For alpha=0 rmse is 0.497
For alpha=0.01 rmse is 0.497
For alpha=0.1 rmse is 0.497
For alpha=1 rmse is 0.497
For alpha=10 rmse is 0.498


  return linalg.solve(A, Xy, sym_pos=True,
