In [42]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn import preprocessing

%matplotlib inline

In [29]:
# Load the AB_NYC_2019 listings CSV (path is relative to the notebook's working directory)
# and preview the first five rows.
df = pd.read_csv("data/AB_NYC_2019.csv")
df.head()

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [35]:
# Summary statistics for the numeric columns; a `count` below the row total flags missing values.
df.describe()

Unnamed: 0,id,host_id,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,19017140.0,67620010.0,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,10983110.0,78610970.0,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,2539.0,2438.0,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,9471945.0,7822033.0,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,19677280.0,30793820.0,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,29152180.0,107434400.0,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,36487240.0,274321300.0,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


Some of these columns are irrelevant to the number of reviews being posted. For example, there is no mechanism by which `id`, `host_name`, or `host_id` could affect the number of reviews. `last_review` is also not helpful because it is based on a date from 18 months ago (18-07-2019).

In [30]:
# Initial feature set: remove the identifier columns and the last-review date,
# none of which are used to model the number of reviews.
irrelevant_columns = ['id', 'host_id', 'host_name', 'last_review']
features = df.drop(columns=irrelevant_columns)

features.head()

Unnamed: 0,name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,Clean & quiet apt home by the park,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365
1,Skylit Midtown Castle,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355
2,THE VILLAGE OF HARLEM....NEW YORK !,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,1,365
3,Cozy Entire Floor of Brownstone,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194
4,Entire Apt: Spacious Studio/Loft by central park,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0


## Handling missing values

In [31]:
# Count the missing values in each column to see which features need imputing or dropping.

features.isna().sum()

name                                 16
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [32]:
# Which review counts occur on the rows where `reviews_per_month` is missing?
missing_rate_mask = features['reviews_per_month'].isna()
features.loc[missing_rate_mask, 'number_of_reviews'].unique()

array([0], dtype=int64)

Missing values in the `reviews_per_month` column occur only where `number_of_reviews` is 0. They can safely be replaced with zero: if a listing has no reviews at all, its number of reviews per month must also be zero.

In [34]:
# A listing without any reviews has a review rate of zero, so fill the gaps with 0
# and re-check the per-column missing counts.
features = features.assign(reviews_per_month=features['reviews_per_month'].fillna(0))
features.isna().sum()

name                              16
neighbourhood_group                0
neighbourhood                      0
latitude                           0
longitude                          0
room_type                          0
price                              0
minimum_nights                     0
number_of_reviews                  0
reviews_per_month                  0
calculated_host_listings_count     0
availability_365                   0
dtype: int64

Since only 16 rows have missing names out of more than 48000 observations, we can safely drop them.

In [36]:
# Drop only the rows whose `name` is missing. Restricting the drop to `subset=['name']`
# keeps this step aligned with its stated purpose: a bare dropna() would also discard
# rows that have a gap in any other column. The original row labels are kept (no index reset).
features = features.dropna(subset=['name'])
features.isna().sum()

name                              0
neighbourhood_group               0
neighbourhood                     0
latitude                          0
longitude                         0
room_type                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

## Feature engineering

In [37]:
# One-hot encode the neighbourhood group. The dummies are built from `features` rather than
# the raw `df`, so they cover exactly the cleaned rows instead of relying on implicit index
# alignment to discard the rows that were removed earlier.
neighbourhood_groups = pd.get_dummies(features['neighbourhood_group'])
# Replace the categorical column with its indicator columns (appended at the end).
features = features.drop(columns=['neighbourhood_group']).join(neighbourhood_groups)
features.head()

Unnamed: 0,name,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Bronx,Brooklyn,Manhattan,Queens,Staten Island
0,Clean & quiet apt home by the park,Kensington,40.64749,-73.97237,Private room,149,1,9,0.21,6,365,0,1,0,0,0
1,Skylit Midtown Castle,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,0.38,2,355,0,0,1,0,0
2,THE VILLAGE OF HARLEM....NEW YORK !,Harlem,40.80902,-73.9419,Private room,150,3,0,0.0,1,365,0,0,1,0,0
3,Cozy Entire Floor of Brownstone,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,4.64,1,194,0,1,0,0,0
4,Entire Apt: Spacious Studio/Loft by central park,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,0.1,1,0,0,0,1,0,0


In [38]:
# One-hot encode the room type. The dummies are built from `features` rather than the raw
# `df`, so they cover exactly the cleaned rows instead of relying on implicit index
# alignment to discard the rows that were removed earlier.
room_types = pd.get_dummies(features['room_type'])
# Replace the categorical column with its indicator columns (appended at the end).
features = features.drop(columns=['room_type']).join(room_types)
features.head()

Unnamed: 0,name,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Bronx,Brooklyn,Manhattan,Queens,Staten Island,Entire home/apt,Private room,Shared room
0,Clean & quiet apt home by the park,Kensington,40.64749,-73.97237,149,1,9,0.21,6,365,0,1,0,0,0,0,1,0
1,Skylit Midtown Castle,Midtown,40.75362,-73.98377,225,1,45,0.38,2,355,0,0,1,0,0,1,0,0
2,THE VILLAGE OF HARLEM....NEW YORK !,Harlem,40.80902,-73.9419,150,3,0,0.0,1,365,0,0,1,0,0,0,1,0
3,Cozy Entire Floor of Brownstone,Clinton Hill,40.68514,-73.95976,89,1,270,4.64,1,194,0,1,0,0,0,1,0,0
4,Entire Apt: Spacious Studio/Loft by central park,East Harlem,40.79851,-73.94399,80,10,9,0.1,1,0,0,0,1,0,0,1,0,0


In [39]:
# One indicator column per one-hot group is redundant, because it is implied by the others:
#     - a listing in none of Bronx / Brooklyn / Manhattan / Queens must be in Staten Island
#     - a listing that is neither entire home/apt nor private room must be a shared room

redundant_dummies = ['Staten Island', 'Shared room']
features = features.drop(columns=redundant_dummies)

In [43]:
# extract some information from the name - how does the owner describe the listing?
def extract_adjectives(list_of_tagged_words):
    """Collect the distinct words tagged 'JJ', in order of first appearance.

    Parameters
    ----------
    list_of_tagged_words : iterable of (word, tag) pairs
        Tagged tokens; only pairs whose tag is exactly 'JJ' are kept.

    Returns
    -------
    list
        Each 'JJ' word once, ordered by its first occurrence.
    """
    jj_tokens = (token for token, pos in list_of_tagged_words if pos == 'JJ')
    unique_adjectives = []
    for token in jj_tokens:
        # Keep only the first occurrence of a repeated adjective.
        if token not in unique_adjectives:
            unique_adjectives.append(token)
    return unique_adjectives

def entity_extract(sentence):
    """Return the distinct adjectives found in a piece of text.

    The text is lower-cased, tokenized with ``word_tokenize``, part-of-speech
    tagged with ``pos_tag`` and then filtered through ``extract_adjectives``.

    Parameters
    ----------
    sentence : str
        Text to analyse. Any non-string value (for example NaN or None) is
        treated as containing no adjectives.

    Returns
    -------
    list
        The adjectives returned by ``extract_adjectives``; an empty list when
        ``sentence`` is not a string.
    """
    if not isinstance(sentence, str):
        # Return an empty list rather than '' so every result has the same type.
        return []
    lower_sentence = sentence.lower()
    tokenized_sentence = word_tokenize(lower_sentence)
    pos_tagged_sentence = pos_tag(tokenized_sentence)
    return extract_adjectives(pos_tagged_sentence)

# Run the adjective extraction on every listing name, store the resulting lists in a new
# column and preview them next to the names they came from.
name_adjectives = features['name'].apply(entity_extract)
features['adjectives'] = name_adjectives
features[['name', 'adjectives']].head(20)

Unnamed: 0,name,adjectives
0,Clean & quiet apt home by the park,"[clean, quiet, apt]"
1,Skylit Midtown Castle,[skylit]
2,THE VILLAGE OF HARLEM....NEW YORK !,[new]
3,Cozy Entire Floor of Brownstone,[entire]
4,Entire Apt: Spacious Studio/Loft by central park,"[entire, spacious, central]"
5,Large Cozy 1 BR Apartment In Midtown East,"[large, midtown]"
6,BlissArtsSpace!,[]
7,Large Furnished Room Near B'way,[large]
8,Cozy Clean Guest Room - Family Apt,"[cozy, clean]"
9,Cute & Cozy Lower East Side 1 bdrm,[east]
