In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sn

In [2]:
# Load the listings CSV (path is relative to the notebook's working directory) into `df`,
# the frame every later cell operates on.
df = pd.read_csv('nyc_listings.csv')

In [3]:
# Show the dtype of every column in the frame as loaded from the CSV.
df.dtypes

listing_id                                        int64
listing_url                                      object
scrape_id                                       float64
last_scraped                                     object
name                                             object
                                                 ...   
calculated_host_listings_count                    int64
calculated_host_listings_count_entire_homes       int64
calculated_host_listings_count_private_rooms      int64
calculated_host_listings_count_shared_rooms       int64
reviews_per_month                               float64
Length: 74, dtype: object

In [4]:
# Count the missing values in each column to decide which columns are usable.
df.isnull().sum()
# There is missing data, so we need to decide how to handle it.
# host_response_time, host_response_rate, and host_acceptance_rate are each missing in
# more than 15,000 rows, which is enough to warrant getting rid of these columns.

# review_scores_rating, review_scores_accuracy, review_scores_cleanliness,
# review_scores_checkin, review_scores_communication, review_scores_location,
# review_scores_value, and reviews_per_month are each missing in more than 9,000 rows,
# which is also enough to warrant getting rid of these columns.

listing_id                                         0
listing_url                                        0
scrape_id                                          0
last_scraped                                       0
name                                              13
                                                ... 
calculated_host_listings_count                     0
calculated_host_listings_count_entire_homes        0
calculated_host_listings_count_private_rooms       0
calculated_host_listings_count_shared_rooms        0
reviews_per_month                               9296
Length: 74, dtype: int64

In [5]:
# Columns removed before analysis, listed in one place so the selection is easy to audit.
# A single drop() call replaces one call per column: the labels are validated together,
# so a missing/misspelled column raises KeyError before anything has been removed
# (instead of leaving the frame half-dropped).
COLUMNS_TO_DROP = [
    # Listing metadata, URLs, and free text
    'listing_url',
    'scrape_id',
    'last_scraped',
    'name',
    'description',  # don't have the resources/time/knowledge to extract anything from this
    'neighborhood_overview',
    'picture_url',

    # Host profile fields
    'host_url',
    'host_name',
    'host_since',
    'host_location',
    'host_about',
    'host_response_time',    # the notebook's null check notes > 15,000 missing rows
    'host_response_rate',    # (same)
    'host_acceptance_rate',  # (same)
    'host_thumbnail_url',
    'host_picture_url',
    'host_neighbourhood',

    # Location fields
    'neighbourhood',
    'latitude',
    'longitude',

    'bathrooms',  # entire column missing

    # Min/max-night variants and calendar fields
    'minimum_minimum_nights',
    'maximum_minimum_nights',
    'minimum_maximum_nights',
    'maximum_maximum_nights',
    'calendar_updated',
    'calendar_last_scraped',

    # Review fields; the notebook's null check notes > 9,000 missing rows for the
    # review_scores_* columns and reviews_per_month
    'first_review',
    'last_review',
    'review_scores_rating',
    'review_scores_accuracy',
    'review_scores_cleanliness',
    'review_scores_checkin',
    'review_scores_communication',
    'review_scores_location',
    'review_scores_value',
    'reviews_per_month',

    'license',

    # TODO: this may be able to be parsed in the same way that the amenities column is
    'host_verifications',

    # TODO: find out how korosh is parsing through amenities
    'amenities',
]

# Rebind `df` to the reduced frame (same resulting columns as the original in-place drops).
df = df.drop(columns=COLUMNS_TO_DROP)

In [6]:
# Re-check the dtypes of the columns that remain in `df`.
df.dtypes
# TODO: do we care about the identifier columns? listing_id and host_id are still present;
# name and host_name are among the columns dropped in the cell above.

listing_id                                        int64
host_id                                           int64
host_is_superhost                                object
host_listings_count                             float64
host_total_listings_count                       float64
host_has_profile_pic                             object
host_identity_verified                           object
neighbourhood_cleansed                           object
neighbourhood_group_cleansed                     object
property_type                                    object
room_type                                        object
accommodates                                      int64
bathrooms_text                                   object
bedrooms                                        float64
beds                                            float64
price                                            object
minimum_nights                                    int64
maximum_nights                                  

In [7]:
# Count the missing values in each of the columns that remain in `df`.
df.isnull().sum()

listing_id                                         0
host_id                                            0
host_is_superhost                                111
host_listings_count                              111
host_total_listings_count                        111
host_has_profile_pic                             111
host_identity_verified                           111
neighbourhood_cleansed                             0
neighbourhood_group_cleansed                       0
property_type                                      0
room_type                                          0
accommodates                                       0
bathrooms_text                                   105
bedrooms                                        3936
beds                                             611
price                                              0
minimum_nights                                     0
maximum_nights                                     0
minimum_nights_avg_ntm                        

In [8]:
# --- 't'/'f' flag columns -> 1/0 ------------------------------------------------------
# Series.map() yields NaN for anything that is not a key of the mapping (including
# existing NaN), so flag columns that contain missing values come out as float64.
# NOTE: this cell is not re-runnable on the same `df`: a second run would map the
# already-converted 1/0 values to NaN. Reload the data before re-executing it.
TRUE_FALSE_TO_INT = {'t' : 1, 'f' : 0}
FLAG_COLUMNS = [
    'has_availability',
    'instant_bookable',
    'host_is_superhost',
    'host_has_profile_pic',
    'host_identity_verified',
]
for flag_col in FLAG_COLUMNS:
    df[flag_col] = df[flag_col].map(TRUE_FALSE_TO_INT)

# --- price: string -> number ----------------------------------------------------------
# Remove the thousands separator (literal comma, hence regex=False) and parse the rest.
# pd.to_numeric raises if any value still contains non-numeric characters.
df['price'] = pd.to_numeric(df['price'].str.replace(',', '', regex=False))

# --- categorical text columns -> integer codes ----------------------------------------
# Each distinct value is replaced by its category code.
# NOTE(review): missing values receive the code -1 rather than staying NaN, so after this
# step isnull() reports 0 missing for these columns even where the source value was
# missing (e.g. bathrooms_text). The codes are arbitrary labels, not an ordering - keep
# that in mind when reading correlations computed on them.
CATEGORY_COLUMNS = [
    'bathrooms_text',
    'neighbourhood_cleansed',
    'neighbourhood_group_cleansed',
    'property_type',
    'room_type',
]
for cat_col in CATEGORY_COLUMNS:
    df[cat_col] = df[cat_col].astype('category').cat.codes

In [9]:
# Check the column dtypes again after the flag/price/category conversions.
df.dtypes

listing_id                                        int64
host_id                                           int64
host_is_superhost                               float64
host_listings_count                             float64
host_total_listings_count                       float64
host_has_profile_pic                            float64
host_identity_verified                          float64
neighbourhood_cleansed                            int16
neighbourhood_group_cleansed                       int8
property_type                                      int8
room_type                                          int8
accommodates                                      int64
bathrooms_text                                     int8
bedrooms                                        float64
beds                                            float64
price                                           float64
minimum_nights                                    int64
maximum_nights                                  

In [11]:
# Fill gaps in the two *_avg_ntm columns by carrying the previous row's value forward.
# Series.ffill() is the direct equivalent of fillna(method="ffill"), whose `method`
# argument is deprecated in current pandas.
# A missing value in the very first row has nothing before it and stays NaN.
# NOTE(review): forward-fill borrows the value from whichever row happens to precede the
# gap; presumably each row is an independent listing, so confirm this is the intended
# imputation (vs. e.g. falling back to minimum_nights / maximum_nights).
for nights_col in ('minimum_nights_avg_ntm', 'maximum_nights_avg_ntm'):
    df[nights_col] = df[nights_col].ffill()

In [12]:
# Count the missing values per column again after the conversions and forward-fill.
df.isnull().sum()

listing_id                                         0
host_id                                            0
host_is_superhost                                111
host_listings_count                              111
host_total_listings_count                        111
host_has_profile_pic                             111
host_identity_verified                           111
neighbourhood_cleansed                             0
neighbourhood_group_cleansed                       0
property_type                                      0
room_type                                          0
accommodates                                       0
bathrooms_text                                     0
bedrooms                                        3936
beds                                             611
price                                              0
minimum_nights                                     0
maximum_nights                                     0
minimum_nights_avg_ntm                        

In [None]:
# Pairwise correlation of the columns in `df`, computed once and reused below
# (previously df.corr() was evaluated three times, and the first result was discarded).
corr_matrix = df.corr()

# Large canvas so the per-cell annotations stay legible. This changes the global
# matplotlib default, so it also applies to figures created after this cell.
plt.rcParams['figure.figsize'] = [25, 30]

# Colour scale runs from the smallest observed correlation up to 1.
# DataFrame.min() skips NaN, so a NaN entry in the matrix does not turn vmin into NaN;
# when the matrix has no NaN this equals corr_matrix.values.min().
sn.heatmap(corr_matrix, vmin=corr_matrix.min().min(), vmax=1, cmap="YlGnBu", square=True,
           linewidths=0.1, annot=True, annot_kws={"size":8})