In [119]:
import pandas as pd
import numpy as np
 
import seaborn as sns
from matplotlib import pyplot as plt

from sklearn.model_selection import train_test_split

from sklearn.metrics import mutual_info_score

from sklearn.feature_extraction import DictVectorizer

from sklearn.linear_model import LogisticRegression, Ridge

Get The Data: New York City Airbnb Open Data

In [2]:
!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv

--2021-09-25 01:30:42--  https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7077973 (6.8M) [text/plain]
Saving to: ‘AB_NYC_2019.csv’


2021-09-25 01:30:43 (147 MB/s) - ‘AB_NYC_2019.csv’ saved [7077973/7077973]



In [3]:
df = pd.read_csv('AB_NYC_2019.csv')
df.head(3)

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365


In [4]:
HW_columns = [
              'neighbourhood_group',
              'room_type',
              'latitude',
              'longitude',
              'price',
              'minimum_nights',
              'number_of_reviews',
              'reviews_per_month',
              'calculated_host_listings_count',
              'availability_365'
              ]
HW_df = df[HW_columns]
HW_df.head(3)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Brooklyn,Private room,40.64749,-73.97237,149,1,9,0.21,6,365
1,Manhattan,Entire home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,Manhattan,Private room,40.80902,-73.9419,150,3,0,,1,365


In [5]:
HW_df.columns

Index(['neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365'],
      dtype='object')

In [6]:
HW_df.isna().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

------------------------------------------------------------------------------
Fill Missing Values 

In [7]:
def X_fill_NA(df, base, val):
  """Return the ``base`` columns of ``df`` with missing values set to ``val``.

  Args:
    df: Source DataFrame; it is not modified.
    base: Column label(s) to select from ``df``.
    val: Replacement passed to ``fillna`` for every missing entry.

  Returns:
    A new DataFrame holding only the selected columns, with NaNs filled.
  """
  return df[base].fillna(val)

In [8]:
HW_df = X_fill_NA(HW_df, HW_columns, val=0)
HW_df.isna().sum()

neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

------------------------------------------------------------------------------
ANS Q1:

In [9]:
HW_df.neighbourhood_group.mode()

0    Manhattan
dtype: object

------------------------------------------------------------------------------
Split Data (Sklearn Library)

In [125]:
#80% Full Train, 20% Test
df_full_train, df_test = train_test_split(HW_df, test_size=0.2, random_state=42)

------------------------------------------------------------------------------
Make price binary

In [126]:
# Binary target: 1 when the listing price is at least 152, otherwise 0.
# NOTE(review): 152 is presumably the rounded mean price -- confirm and
# consider deriving it from the data instead of hard-coding it.
# .assign() builds a new frame rather than writing a column into the frame
# returned by train_test_split, which is what made pandas emit
# SettingWithCopyWarning.
df_full_train = df_full_train.assign(
    above_average=(df_full_train.price >= 152).astype(int)
)
df_full_train.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
32645,Brooklyn,Entire home/apt,40.71577,-73.9553,295,3,11,0.87,1,1,1
23615,Manhattan,Private room,40.84917,-73.94048,70,2,2,0.16,1,0,0
31183,Brooklyn,Private room,40.68993,-73.95947,58,2,0,0.0,2,0,0
29260,Brooklyn,Entire home/apt,40.68427,-73.93118,75,3,87,4.91,1,267,0
7275,Queens,Private room,40.74705,-73.89564,38,5,13,0.25,1,0,0
26011,Manhattan,Entire home/apt,40.76354,-73.99283,100,30,3,0.21,31,270,0
46572,Manhattan,Entire home/apt,40.76786,-73.95639,260,7,1,1.0,1,43,1
19902,Manhattan,Entire home/apt,40.73316,-74.00476,147,1,42,1.32,1,4,0
14159,Brooklyn,Private room,40.72527,-73.94803,34,1,0,0.0,1,0,0
28528,Brooklyn,Private room,40.6752,-73.94366,65,1,0,0.0,1,0,0


In [127]:
# Split the 80% full-train set 75/25 -> 60% train / 20% validation overall.
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

# Binary targets for the classification questions.
y_train, y_val = df_train.above_average.values, df_val.above_average.values

# log1p(price) targets for the regression question. pop() returns the column
# and removes it from the frame, so price cannot be picked up as a feature.
y_train_q6 = np.log1p(df_train.pop('price').values)
y_val_q6 = np.log1p(df_val.pop('price').values)
y_test = np.log1p(df_test.pop('price').values)

In [124]:
# Report the size of every split and its share of the full dataset.
for label, part in [('Train Full:', df_full_train),
                    ('Train:', df_train),
                    ('Validation:', df_val),
                    ('Test:', df_test)]:
  print(label, len(part),
        '(', (len(part)/len(HW_df))*100, '%)')

Train Full: 39116 ( 80.0 %)
Train: 29337 ( 60.0 %)
Validation: 9779 ( 20.0 %)
Test: 9779 ( 20.0 %)


------------------------------------------------------------------------------
ANS Q2:

In [14]:
# Pairwise Pearson correlation of the numeric columns only. Selecting them
# explicitly keeps this working on pandas versions where corr() no longer
# silently drops string columns (neighbourhood_group, room_type).
corr_train = df_full_train.select_dtypes(include='number').corr()
corr_train

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
latitude,1.0,0.080704,0.035015,0.025497,-0.011836,-0.013809,0.020299,-0.008341,0.056281
longitude,0.080704,1.0,-0.14908,-0.063498,0.05757,0.134401,-0.115289,0.082994,-0.267852
price,0.035015,-0.14908,1.0,0.04274,-0.048926,-0.051978,0.055336,0.080562,0.41125
minimum_nights,0.025497,-0.063498,0.04274,1.0,-0.07786,-0.121687,0.121748,0.140596,0.03232
number_of_reviews,-0.011836,0.05757,-0.048926,-0.07786,1.0,0.584935,-0.072603,0.175428,-0.053921
reviews_per_month,-0.013809,0.134401,-0.051978,-0.121687,0.584935,1.0,-0.047368,0.165565,-0.055888
calculated_host_listings_count,0.020299,-0.115289,0.055336,0.121748,-0.072603,-0.047368,1.0,0.223328,0.171793
availability_365,-0.008341,0.082994,0.080562,0.140596,0.175428,0.165565,0.223328,1.0,0.102623
above_average,0.056281,-0.267852,0.41125,0.03232,-0.053921,-0.055888,0.171793,0.102623,1.0


------------------------------------------------------------------------------
ANS Q3:

In [15]:
categorical = ['neighbourhood_group', 'room_type']

In [16]:
df_train[categorical].nunique() 

neighbourhood_group    5
room_type              3
dtype: int64

In [17]:
df_train[categorical]

Unnamed: 0,neighbourhood_group,room_type
13575,Brooklyn,Entire home/apt
48476,Manhattan,Private room
44499,Bronx,Entire home/apt
17382,Brooklyn,Entire home/apt
14638,Manhattan,Private room
...,...,...
13198,Brooklyn,Private room
14583,Brooklyn,Private room
6168,Manhattan,Private room
12248,Brooklyn,Private room


In [18]:
def mutual_info_price_scores(series):
  """Mutual information between ``series`` and the binary price target.

  The target is read from the notebook-level ``df_full_train.above_average``,
  so ``series`` has to come from that same frame.
  """
  target = df_full_train.above_average
  score = mutual_info_score(series, target)
  return score

In [19]:
# Mutual information of each categorical feature with above_average,
# highest first, rounded to two decimals.
MI = df_full_train[categorical].apply(mutual_info_price_scores)
MI.sort_values(ascending=False).round(2)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

In [20]:
numerical = ['latitude',
             'longitude',
             'minimum_nights',
             'number_of_reviews',
             'reviews_per_month',
             'calculated_host_listings_count',
             'availability_365']

In [21]:
dv = DictVectorizer(sparse=False)

------------------------------------------------------------------------------
OHE for Categorical Data, and Create X_train and X_val

In [81]:
def OHE_DV(df, fit=True):
  """Encode ``df`` into a feature matrix using the notebook-level ``dv``.

  Only the columns listed in ``categorical + numerical`` are used. Each row is
  turned into a dict and passed through the DictVectorizer.

  Args:
    df: DataFrame containing every column in ``categorical + numerical``.
    fit: If True (default, the original behaviour) ``dv`` is re-fitted on
      ``df`` before transforming. Pass ``fit=False`` for validation/test data
      so the columns learned from the training data are reused instead of
      being re-derived from the data under evaluation.

  Returns:
    The feature matrix produced by ``dv`` for the rows of ``df``.
  """
  dicts = df[categorical+numerical].to_dict(orient='records')
  if fit:
    return dv.fit_transform(dicts)
  return dv.transform(dicts)

In [82]:
X_train = OHE_DV(df_train)
X_train[0]

array([ 50.     ,  13.     ,  40.7276 , -73.94495,   3.     ,   0.     ,
         1.     ,   0.     ,   0.     ,   0.     ,  29.     ,   0.7    ,
         1.     ,   0.     ,   0.     ])

In [83]:
# Encode the validation rows with the vocabulary `dv` learned from df_train in
# the previous cell. Refitting on df_val would let the validation data define
# the feature columns, which can silently misalign them with X_train.
X_val = dv.transform(df_val[categorical+numerical].to_dict(orient='records'))
X_val.shape

(9779, 15)

In [84]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=Bronx',
 'neighbourhood_group=Brooklyn',
 'neighbourhood_group=Manhattan',
 'neighbourhood_group=Queens',
 'neighbourhood_group=Staten Island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=Entire home/apt',
 'room_type=Private room',
 'room_type=Shared room']

------------------------------------------------------------------------------
ANS Q4:

In [85]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42, max_iter=1000)
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [86]:
y_pred_val = model.predict(X_val)
y_pred_val

array([0, 1, 0, ..., 0, 0, 1])

In [87]:
from sklearn.metrics import accuracy_score

# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# Accuracy is symmetric so the value is unchanged, but the correct order
# matters as soon as this is swapped for an asymmetric metric.
acc = accuracy_score(y_val, y_pred_val)
print('Model Accuracy : ', round(acc, 2))

Model Accuracy :  0.79


-------------------------------------------------------------------------------
ANS Q5:

In [101]:
df_train.head(1)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
13575,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50,0


In [100]:
def OHE_DV_q5(df, elim, fit=True):
  """Encode ``df`` with the notebook-level ``dv`` after dropping ``elim``.

  Args:
    df: DataFrame containing every column in ``categorical + numerical``.
    elim: Column label (or list of labels) to leave out of the features.
      A label that is not among those columns raises ``KeyError``.
    fit: If True (default, the original behaviour) ``dv`` is re-fitted on
      the reduced data. Pass ``fit=False`` to encode validation/test data
      with the columns learned from the training data.

  Returns:
    The feature matrix produced by ``dv`` for the remaining columns.
  """
  # drop() returns a new frame, so the selection is never mutated in place.
  features = df[categorical+numerical].drop(columns=elim)
  dicts = features.to_dict(orient='records')

  if fit:
    return dv.fit_transform(dicts)
  return dv.transform(dicts)

In [102]:
elim = ['neighbourhood_group', 'room_type',
        'number_of_reviews', 'reviews_per_month'
        ]

In [103]:
for e in elim:
  print(e)

neighbourhood_group
room_type
number_of_reviews
reviews_per_month


In [107]:
  df_train_q5 = df_train
  df_val_q5 = df_val

In [108]:
df_train_q5.head(1)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,above_average
13575,Brooklyn,Entire home/apt,40.7276,-73.94495,3,29,0.7,13,50,0


In [118]:
# Feature elimination: retrain without one feature at a time and record the
# validation accuracy (rounded to two decimals) for each removed feature.
acc_list_q5 = []

for e in elim:

  # Fit the vectorizer on the training rows only ...
  X_train_q5 = OHE_DV_q5(df_train, e)

  # ... and reuse that fitted vocabulary for validation instead of refitting
  # `dv` on df_val.
  val_dicts_q5 = df_val[categorical+numerical].drop(columns=e).to_dict(orient='records')
  X_val_q5 = dv.transform(val_dicts_q5)

  model_q5 = LogisticRegression(solver='liblinear', C=1.0, random_state=42, max_iter=1000)
  model_q5.fit(X_train_q5, y_train)

  y_pred_val_q5 = model_q5.predict(X_val_q5)

  # accuracy_score expects (y_true, y_pred).
  acc_q5 = accuracy_score(y_val, y_pred_val_q5)

  print(e, X_train_q5.shape, X_val_q5.shape, round(acc_q5, 2))

  acc_list_q5.append(round(acc_q5, 2))

neighbourhood_group (29337, 10) (9779, 10) 0.75
room_type (29337, 12) (9779, 12) 0.73
number_of_reviews (29337, 14) (9779, 14) 0.79
reviews_per_month (29337, 14) (9779, 14) 0.79


In [110]:
acc_list_q5

[0.75, 0.73, 0.79, 0.79]

In [117]:
# Difference between the baseline accuracy and each reduced-model accuracy.
# The list is converted to an array explicitly: `scalar - list` only works
# when `acc` happens to be a NumPy scalar and raises TypeError for a plain
# Python float.
acc_diff = round(acc, 2) - np.asarray(acc_list_q5)
acc_diff = acc_diff.tolist()
acc_diff

[0.040000000000000036, 0.06000000000000005, 0.0, 0.0]

------------------------------------------------------------------------------
ANS Q6:

In [129]:
def RMSE(y, y_pred):
  """Root mean squared error between targets ``y`` and predictions ``y_pred``.

  Both arguments must support element-wise subtraction (e.g. NumPy arrays of
  the same shape).
  """
  residual = y_pred - y
  return np.sqrt(np.square(residual).mean())

In [134]:
# Ridge regression on log1p(price): validation RMSE for several alphas.
RMSE_list = []

# The feature matrices do not depend on alpha, so build them once.
# The vectorizer is fitted on the training rows only and then reused for the
# validation rows rather than being refitted on df_val.
X_train_q6 = OHE_DV(df_train)
X_val_q6 = dv.transform(df_val[categorical+numerical].to_dict(orient='records'))

for alpha in [0, 0.01, 0.1, 1, 10]:
  model_q6 = Ridge(alpha=alpha, solver='svd')
  model_q6.fit(X_train_q6, y_train_q6)

  y_pred_val_q6 = model_q6.predict(X_val_q6)

  RMSE_score = round(RMSE(y_val_q6, y_pred_val_q6), 3)

  RMSE_list.append(RMSE_score)

  print('Alpha = %4s' %alpha, '-> RMSE value = ', RMSE_score)

Alpha =    0 -> RMSE value =  0.497
Alpha = 0.01 -> RMSE value =  0.497
Alpha =  0.1 -> RMSE value =  0.497
Alpha =    1 -> RMSE value =  0.497
Alpha =   10 -> RMSE value =  0.498
