# Load and Inspect Data

In [26]:
# Ingest config file and create python variables for parameters in config file
import yaml

# Open the file and parse it.
# A yaml.YAMLError is intentionally left uncaught: printing it and carrying on
# would leave `config` undefined, so the assignments below would then fail with
# an unrelated NameError that hides the real cause.
with open("prepare_data_config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

# An empty or non-mapping document cannot supply the keys read below,
# so stop here with a clear message instead of a TypeError further down.
if not isinstance(config, dict):
    raise ValueError(
        "prepare_data_config.yaml must contain a top-level mapping, "
        "got {}".format(type(config).__name__)
    )

# Assign variables from the config file
# -- general switches
load_from_scratch = config["general"]["load_from_scratch"]
save_raw_dataframe = config["general"]["save_raw_dataframe"]
save_transformed_dataframe = config["general"]["save_transformed_dataframe"]
remove_bad_values = config["general"]["remove_bad_values"]

# -- column groupings
categorical_columns = config["columns"]["categorical"]
continuous_columns = config["columns"]["continuous"]
date_columns = config["columns"]["date"]
text_columns = config["columns"]["text"]
excluded_columns = config["columns"]["excluded"]

# -- geographic limits
bounding_box = config["bounding_box"]
newark_bounding_box = config["newark_bounding_box"]

geo_columns = config["geo_columns"]

file_names = config["file_names"]


In [8]:
# Load the listings CSV into a DataFrame
import pandas as pd

csv_path = 'AB_NYC_2019.csv'
df = pd.read_csv(csv_path)

# Preview the top rows as the cell output
df.head()


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.9419,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.1,1,0


In [11]:
# List the column labels of the loaded DataFrame
df.columns

Index(['id', 'name', 'host_id', 'host_name', 'neighbourhood_group',
       'neighbourhood', 'latitude', 'longitude', 'room_type', 'price',
       'minimum_nights', 'number_of_reviews', 'last_review',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365'],
      dtype='object')

Categorical Columns:
- neighbourhood_group
- neighbourhood
- room_type

Continuous Columns:
- latitude
- longitude
- price
- minimum_nights
- number_of_reviews
- reviews_per_month
- calculated_host_listings_count
- availability_365

Text Columns:
- name
- host_name

In [51]:
# Report the number of null entries in every column.
# Each line is printed as: Missing values in '<column name>' : <count of missing values>
row_template = "Missing values in '{}' : {}"
for column in df.columns:
    n_missing = df[column].isna().sum()
    print(row_template.format(column, n_missing))


Missing values in 'id' : 0
Missing values in 'name' : 16
Missing values in 'host_id' : 0
Missing values in 'host_name' : 21
Missing values in 'neighbourhood_group' : 0
Missing values in 'neighbourhood' : 0
Missing values in 'latitude' : 0
Missing values in 'longitude' : 0
Missing values in 'room_type' : 0
Missing values in 'price' : 0
Missing values in 'minimum_nights' : 0
Missing values in 'number_of_reviews' : 0
Missing values in 'last_review' : 10052
Missing values in 'reviews_per_month' : 10052
Missing values in 'calculated_host_listings_count' : 0
Missing values in 'availability_365' : 0


The `last_review` and `reviews_per_month` columns each have 10,052 missing values (about 21% of the 48,895 listings).

If a listing (a property available for rent) has not yet received any reviews, then these fields will be missing. 

These listings might be new, less popular, located in less frequented areas, priced unattractively, or simply have not been reviewed by the guests who stayed there.

In [27]:
# Sanity check: count rows with a value below zero in each continuous column.
# latitude and longitude are deliberately not in this list.
continuous_columns_no_lat_long = [
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
]

for col in continuous_columns_no_lat_long:
    # Rows where this column is strictly negative
    negative_values = df.loc[df[col].lt(0)]
    n_negative = len(negative_values)
    print(f"Column: {col} has {n_negative} negative values")


Column: price has 0 negative values
Column: minimum_nights has 0 negative values
Column: number_of_reviews has 0 negative values
Column: reviews_per_month has 0 negative values
Column: calculated_host_listings_count has 0 negative values
Column: availability_365 has 0 negative values


In [42]:
# Find listings whose latitude/longitude fall outside the New York City bounding box
NYC_bounding_box = config['bounding_box']

# A coordinate is out of range when it is strictly below the minimum
# or strictly above the maximum of the box.
latitude_out_of_range = (
    df['latitude'].lt(NYC_bounding_box['min_lat'])
    | df['latitude'].gt(NYC_bounding_box['max_lat'])
)
longitude_out_of_range = (
    df['longitude'].lt(NYC_bounding_box['min_long'])
    | df['longitude'].gt(NYC_bounding_box['max_long'])
)
invalid_values_geo = df.loc[latitude_out_of_range | longitude_out_of_range]

# Show the box limits, then the offending rows (if any) as the cell output
print(NYC_bounding_box)
invalid_values_geo

{'max_long': -73.70018092, 'max_lat': 40.91617849, 'min_long': -74.25909008, 'min_lat': 40.47739894}


Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365


In [31]:
# Print how many distinct values each categorical column holds
# (dropna=False so a missing value is counted as its own entry)
for col in categorical_columns:
    distinct_count = df[col].nunique(dropna=False)
    print(col, distinct_count)


neighbourhood_group 5
neighbourhood 221
room_type 3


In [38]:
# Summary statistics for the numeric columns.
# 'id' and 'host_id' are left out of the summary.
identifier_columns = ['id', 'host_id']
df.drop(columns=identifier_columns, errors='ignore').describe()

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,38843.0,48895.0,48895.0
mean,40.728949,-73.95217,152.720687,7.029962,23.274466,1.373221,7.143982,112.781327
std,0.05453,0.046157,240.15417,20.51055,44.550582,1.680442,32.952519,131.622289
min,40.49979,-74.24442,0.0,1.0,0.0,0.01,1.0,0.0
25%,40.6901,-73.98307,69.0,1.0,1.0,0.19,1.0,0.0
50%,40.72307,-73.95568,106.0,3.0,5.0,0.72,1.0,45.0
75%,40.763115,-73.936275,175.0,5.0,24.0,2.02,2.0,227.0
max,40.91306,-73.71299,10000.0,1250.0,629.0,58.5,327.0,365.0


In [46]:
# Persist the DataFrame to disk as a pickle file.
# NOTE(review): `save_raw_dataframe` and `file_names` are read from the config
# earlier in this notebook but are not consulted here; confirm that is intended.
pickle_path = 'data.pkl'
df.to_pickle(pickle_path)