In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from matplotlib import cm
from tqdm import tqdm

from mpl_toolkits.basemap import Basemap

# To install Basemap from source: pip install https://github.com/matplotlib/basemap/archive/master.zip

In [None]:
# Location of the listings CSV, relative to the notebook's working directory.
data_path = "../data/AB_NYC_2019.csv"
# IPython shell escape: run `wc` on the file to report its line/word/byte
# counts before loading it.
!wc {data_path}

In [None]:
# Read only the first few rows to list the column names cheaply.
df_tmp = pd.read_csv(
    data_path,
    nrows=5
)

columns = list(df_tmp.columns)

print("\nFeatures in dataset:")
for column in columns:
    print(column)

# Explicit, narrow dtypes for every column so the parser does not fall back to
# 64-bit defaults. `last_review` is read as text and converted to datetimes
# in the loop below.
traintypes = {
    'id': 'int32',
    'name': 'str',
    'host_id': 'int32',
    'host_name': 'str',
    'neighbourhood_group': 'str',
    'neighbourhood': 'str',
    'latitude': 'float32',
    'longitude': 'float32',
    'room_type': 'str',
    'price': 'uint16',
    'minimum_nights': 'uint16',
    'number_of_reviews': 'uint16',
    'last_review': 'str',
    'reviews_per_month': 'float16',
    'calculated_host_listings_count': 'uint16',
    'availability_365': 'uint16',
}
cols = list(traintypes.keys())

df_list = []

# Number of rows parsed per chunk.
chunksize = 1_000_000

for df_chunk in tqdm(
    pd.read_csv(
        data_path,
        dtype=traintypes,
        chunksize=chunksize
    )
):
    # Keep at most the first 16 characters of the timestamp text, then parse
    # it as a timezone-aware (UTC) datetime.
    # NOTE(review): the format expects "YYYY-MM-DD HH:MM"; confirm that this
    # matches how `last_review` is actually stored in the CSV.
    df_chunk['last_review'] = df_chunk['last_review'].str.slice(0, 16)
    df_chunk['last_review'] = pd.to_datetime(df_chunk['last_review'], utc=True, format='%Y-%m-%d %H:%M')

    df_list.append(df_chunk)

# Combine every chunk that was read. The previous `df_list[0:1]` slice kept
# only the first chunk and silently dropped the rest of the file.
bnb_df = pd.concat(df_list)

# Drop every row that contains NaN or +/-inf in any column. `axis` is passed by
# keyword because newer pandas no longer accepts it positionally.
bnb_df = bnb_df[~bnb_df.isin([np.nan, np.inf, -np.inf]).any(axis=1)]

# Restrict the analysis to listings in Manhattan.
bnb_df = bnb_df[bnb_df["neighbourhood_group"] == "Manhattan"]

# The per-chunk frames are no longer needed once concatenated.
del df_list

bnb_df.head()

In [None]:
# Distinct neighbourhood names, in order of first appearance in the frame.
neighbourhoods = bnb_df["neighbourhood"].unique()

print("There are {} AirBnB listings in Manhattan.".format(bnb_df.shape[0]))

fig, ax = plt.subplots(figsize=(20, 10))

# Pass the column as `x=` (positional data vectors are deprecated in seaborn)
# and pin the bar order explicitly so each bar is guaranteed to carry the label
# of the neighbourhood it counts.
n_plot = sns.countplot(x=bnb_df["neighbourhood"], order=neighbourhoods, ax=ax)

# Rotate the labels seaborn generated instead of overwriting their text with a
# separately computed list, which could mislabel bars if the orders diverged.
n_plot.tick_params(axis="x", labelrotation=80)
plt.show()

In [None]:
# Keep only the listings whose neighbourhood is Harlem.
harlem_df = bnb_df.loc[bnb_df["neighbourhood"] == "Harlem"]

# Extract the coordinates as plain NumPy arrays.
harlem_lat = harlem_df["latitude"].values
harlem_lon = harlem_df["longitude"].values

# Preview the first ten longitudes, then the first ten latitudes.
print(harlem_lon[:10], harlem_lat[:10], sep="\n")

harlem_df.head()

In [None]:
fig = plt.figure(figsize=(20,10))

# Manhattan bounding box as a closed ring of [longitude, latitude] pairs.
bbox_raw = [
    [-74.0551665059,40.6825717946], # lower-left
    [-73.8642790547,40.6825717946], # lower-right
    [-73.8642790547,40.8988332574], # upper-right
    [-74.0551665059,40.8988332574], # upper-left
    [-74.0551665059,40.6825717946]  # lower-left (closes the ring)
]

min_lat = min([coord[1] for coord in bbox_raw])
min_lon = min([coord[0] for coord in bbox_raw])

max_lat = max([coord[1] for coord in bbox_raw])
max_lon = max([coord[0] for coord in bbox_raw])

# Midpoint of the box. The sum must be parenthesised: `min + max / 2` divides
# only `max` and yields a point far outside the bounding box.
center_lat = (min_lat + max_lat) / 2
center_lon = (min_lon + max_lon) / 2

# Corner points as (latitude, longitude) tuples.
ll_crnr = (min_lat, min_lon)
lr_crnr = (min_lat, max_lon)

ur_crnr = (max_lat, max_lon)
ul_crnr = (max_lat, min_lon)

shape_path = "../data/shapefiles/nyc_neighborhoods.shp"

# Margin, in degrees, added on every side of the bounding box.
expansion = 0.0

m = Basemap(
    llcrnrlon=min_lon - expansion,
    llcrnrlat=min_lat - expansion,
    urcrnrlon=max_lon + expansion,
    urcrnrlat=max_lat + expansion,
    epsg=2263
    #projection="mill",
    #resolution="f"
)

# Fetch street-map tiles from the ArcGIS service as the background image.
m.arcgisimage(service='World_Street_Map', xpixels = 3000)

# m.drawmapboundary()
# m.drawcoastlines()
# m.drawcountries()
# m.drawrivers()
# m.drawstates()

# Project listing longitude/latitude into the map's coordinate system.
x, y = m(bnb_df["longitude"].values, bnb_df["latitude"].values)

# One translucent dot per listing, coloured by log10(price + 1); the +1 keeps
# listings priced at 0 from producing log10(0).
m.scatter(
    x,y,
    s=5,
    alpha=0.2,
    c=np.log10(np.add(bnb_df["price"].values, 1)),
    cmap="viridis"
)

#m.hexbin(x, y, gridsize=1000, bins='log', cmap=cm.YlOrRd_r)

plt.show()