## Import the database from MongoDB

In [1]:
from pymongo import MongoClient
import pandas as pd
from json import loads, dumps

In [2]:
mongo = MongoClient(port=27017)

In [3]:
print(mongo.list_database_names())

['admin', 'classDB', 'config', 'epa', 'fruits_db', 'listings_db', 'local', 'travel_db', 'uk_food']


In [4]:
listings = mongo['listings_db'].nyc_listings
print(listings)

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'listings_db'), 'nyc_listings')


In [5]:
listings_arr = listings.find()
listings_df = pd.DataFrame(listings_arr)
listings_df.head()

Unnamed: 0,_id,id,listing_url,name,neighborhood_overview,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,first_review,last_review,review_scores_rating,reviews_per_month
0,666c99367177588cd31b62d7,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,Centrally located in the heart of Manhattan ju...,Midtown,Manhattan,40.75356,-73.98559,Entire rental unit,...,1 bath,,1.0,"[Long term stays allowed, Cooking basics, Ethe...",$240.00,49,2009-11-21,2022-06-21,4.68,0.29
1,666c99367177588cd31b62d8,5121,https://www.airbnb.com/rooms/5121,Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...,,Bedford-Stuyvesant,Brooklyn,40.68535,-73.95512,Private room in rental unit,...,,,1.0,"[Air conditioning, Wifi, Kitchen, Heating]",$66.00,50,2009-05-28,2019-12-02,4.52,0.28
2,666c99367177588cd31b62d9,6848,https://www.airbnb.com/rooms/6848,Rental unit in Brooklyn · ★4.58 · 2 bedrooms ·...,,Williamsburg,Brooklyn,40.70935,-73.95342,Entire rental unit,...,1 bath,,1.0,"[Cooking basics, Microwave, Extra pillows and ...",$81.00,191,2009-05-25,2023-08-14,4.58,1.08
3,666c99367177588cd31b62da,6990,https://www.airbnb.com/rooms/6990,Rental unit in New York · ★4.88 · 1 bedroom · ...,"Location: Five minutes to Central Park, Museum...",East Harlem,Manhattan,40.78778,-73.94759,Private room in rental unit,...,1 shared bath,,1.0,"[Hair dryer, Air conditioning, Fire extinguish...",$70.00,246,2009-10-28,2023-08-14,4.88,1.43
4,666c99367177588cd31b62db,6872,https://www.airbnb.com/rooms/6872,Condo in New York · 1 bedroom · 1 bed · 1 shar...,This sweet Harlem sanctuary is a 10-20 minute ...,East Harlem,Manhattan,40.80107,-73.94255,Private room in condo,...,1 shared bath,,1.0,"[Fire extinguisher, Long term stays allowed, H...",$65.00,1,2022-06-05,2022-06-05,5.0,0.05


## Clean the data

In [6]:
listings_df.dtypes

_id                                     object
id                                       int64
listing_url                             object
name                                    object
neighborhood_overview                   object
neighbourhood_cleansed                  object
neighbourhood_group_cleansed            object
latitude                               float64
longitude                              float64
property_type                           object
room_type                               object
accommodates                             int64
bathrooms                               object
bathrooms_text                          object
bedrooms                               float64
beds                                   float64
amenities                               object
price                                   object
number_of_reviews                        int64
first_review                    datetime64[ns]
last_review                     datetime64[ns]
review_scores

In [7]:
reduced_listings = listings_df[['name', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed'\
                               ,'latitude','longitude','room_type', 'amenities', 'price'\
                                , 'number_of_reviews', 'review_scores_rating']]
reduced_listings.dtypes

name                             object
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
latitude                        float64
longitude                       float64
room_type                        object
amenities                        object
price                            object
number_of_reviews                 int64
review_scores_rating            float64
dtype: object

In [8]:
reduced_listings['price'] = reduced_listings['price'].str.replace(",", "")
reduced_listings['price'] = reduced_listings['price'].str.replace("$", "")

reduced_listings['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_listings['price'] = reduced_listings['price'].str.replace(",", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_listings['price'] = reduced_listings['price'].str.replace("$", "")


0        240.00
1         66.00
2         81.00
3         70.00
4         65.00
          ...  
28127    225.00
28128    101.00
28129    105.00
28130    118.00
28131    285.00
Name: price, Length: 28132, dtype: object

In [9]:
reduced_listings = reduced_listings.astype({
    "price": float
})

reduced_listings.dtypes

name                             object
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
latitude                        float64
longitude                       float64
room_type                        object
amenities                        object
price                           float64
number_of_reviews                 int64
review_scores_rating            float64
dtype: object

In [10]:
reduced_listings.to_json("data/cleaned_listings.json", orient="records")


# Aggregate Functions

In [11]:
boroughs = reduced_listings["neighbourhood_group_cleansed"].unique()

## Find Average Price and Count of Room Types:
### Grouped by borough, neighbourhood, and room type

In [12]:
# avg price
def nbhd_price_avg(df):
    avg_price = df.groupby(['neighbourhood_group_cleansed', "neighbourhood_cleansed", 'room_type'])["price"].mean()
    avg_price_boro = avg_price.round(2).reset_index() 
    return avg_price_boro

price_per_nbhd = nbhd_price_avg(reduced_listings)

In [13]:
# room counts
def nbhd_room_type_count(df):
    roomtype_grouped = df.groupby(['neighbourhood_group_cleansed','neighbourhood_cleansed', 'room_type']).size().reset_index(name='room_count')
    return roomtype_grouped

num_rooms_per_nbhd = nbhd_room_type_count(reduced_listings)

In [14]:
# Merge both DataFrames
combined_df = price_per_nbhd.merge(num_rooms_per_nbhd, how='inner', on=['neighbourhood_group_cleansed','neighbourhood_cleansed', 'room_type'])

combined_df = combined_df.rename(columns={'neighbourhood_group_cleansed': 'Borough', 'neighbourhood_cleansed': 'Neighbourhood', 'room_type': "Room Type", 'price': "Average Price", 'room_count': 'Room Count'})

combined_df.to_json("data/group_aggregates.json", orient="records")

combined_df.head()

Unnamed: 0,Borough,Neighbourhood,Room Type,Average Price,Room Count
0,Bronx,Allerton,Entire home/apt,122.36,22
1,Bronx,Allerton,Private room,71.53,15
2,Bronx,Baychester,Entire home/apt,112.8,10
3,Bronx,Baychester,Private room,76.31,13
4,Bronx,Baychester,Shared room,180.0,2


## Specific Amenity counts per borough: 

In [51]:
keywords = ["Wifi", "Kitchen", "Air conditioning", "Backyard", "Pets allowed", "Washer", "Dryer", "Long term stays allowed", "Elevator", "Dedicated workspace"]

def amenities_by_nbhd(df, keywords):

    # Explode the amenities column:
    
    listings_exploded = reduced_listings.explode('amenities').copy()
    
    # Count the occurrences of each keyword in the amenities column by neighborhood

    listings_exploded["amenities"] = listings_exploded["amenities"].apply(lambda x : x.lower() if type(x)==str else x) 
    keywords = [x.lower() for x in keywords]
    listings_exploded = listings_exploded[listings_exploded["amenities"].isin(keywords)].reset_index(drop=1)
    listings_exploded = listings_exploded.groupby(["neighbourhood_cleansed", "neighbourhood_group_cleansed", "amenities"]).size().reset_index(name='count')
    listings_exploded = listings_exploded.rename(columns={'neighbourhood_group_cleansed':'Borough', 'neighbourhood_cleansed':'Neighbourhood'})
    return listings_exploded

amenities_per_nbhd = amenities_by_nbhd(reduced_listings,keywords)
amenities_per_nbhd.to_json("data/amenities_aggregates.json", orient="records")

## Create the Endpoints

In [16]:
from flask import Flask, jsonify
from flask_cors import CORS
from pymongo import MongoClient

app = Flask(__name__)
CORS(app)

results = combined_df.to_json(orient='records')

@app.route('/aggregates', methods=['GET'])
def get_aggregates():
    # aggregates =   # Use find() instead of findAll()
    # data = []
    # for listings in reduced_listings:
    #     data.append({
    #         'type': feature.get('type'),
    #         'properties': {
    #             'name': listings['name'],
    #             'latitude': properties.get('latitude'),
    #             'longitude': properties.get('longitude'),
    #             'operator': properties.get('operator'),
    #             'status': properties.get('status'),
    #             'location_type': properties.get('location_type'),
    #             'facility_name': properties.get('facility_name'),
    #             'hours_of_operation': properties.get('hours_of_operation')
    #         },
    #         'geometry': {
    #             'type': geometry.get('type'),
    #             'coordinates': geometry.get('coordinates')
    #         }
    #     })
    return jsonify(results)

if __name__ == '__main__':
    app.run(debug=True)

 * Serving Flask app '__main__'
 * Debug mode: on


 * Running on http://127.0.0.1:5000
[33mPress CTRL+C to quit[0m
 * Restarting with stat
0.00s - make the debugger miss breakpoints. Please pass -Xfrozen_modules=off
0.00s - to python to disable frozen modules.
0.00s - Note: Debugging will proceed. Set PYDEVD_DISABLE_FILE_VALIDATION=1 to disable this validation.
Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "/opt/anaconda3/envs/dev/lib/python3.12/site-packages/ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "/opt/anaconda3/envs/dev/lib/python3.12/site-packages/traitlets/config/application.py", line 991, in launch_instance
    app.initialize(argv)
  File "/opt/anaconda3/envs/dev/lib/python3.12/site-packages/traitlets/config/application.py", line 113, in inner
    return method(app, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/envs/dev/lib/python3.12/site-packages/ipyke

SystemExit: 1

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


## ToDo Graphing:

bar chart
    stacked amenities by neighbourhood
        (potential drop-down for specified amenities)

pie chart
    percent of each room type for the borough

scatter plot
    average price distribution by room type
        (a scatter plot for each room type with regression line)

box plot
    price by borough
        (the price of all)

