## Import the database from MongoDB

In [2]:
pwd

'/Users/jonathanmunoz/Desktop/Classwork/Projects/NYC-Airbnb-Listings/static'

In [3]:
from pymongo import MongoClient
import pandas as pd
from json import loads, dumps

In [4]:
mongo = MongoClient(port=27017)

In [5]:
print(mongo.list_database_names())

['admin', 'classDB', 'config', 'epa', 'fruits_db', 'listings_db', 'local', 'travel_db', 'uk_food']


In [6]:
listings = mongo['listings_db'].nyc_listings
print(listings)

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'listings_db'), 'nyc_listings')


In [7]:
listings_arr = listings.find()
listings_df = pd.DataFrame(listings_arr)
listings_df.head()

Unnamed: 0,_id,id,listing_url,name,neighborhood_overview,neighbourhood_cleansed,neighbourhood_group_cleansed,latitude,longitude,property_type,...,bathrooms_text,bedrooms,beds,amenities,price,number_of_reviews,first_review,last_review,review_scores_rating,reviews_per_month
0,666c99367177588cd31b62d7,2595,https://www.airbnb.com/rooms/2595,Rental unit in New York · ★4.68 · Studio · 1 b...,Centrally located in the heart of Manhattan ju...,Midtown,Manhattan,40.75356,-73.98559,Entire rental unit,...,1 bath,,1.0,"[Long term stays allowed, Cooking basics, Ethe...",$240.00,49,2009-11-21,2022-06-21,4.68,0.29
1,666c99367177588cd31b62d8,5121,https://www.airbnb.com/rooms/5121,Rental unit in Brooklyn · ★4.52 · 1 bedroom · ...,,Bedford-Stuyvesant,Brooklyn,40.68535,-73.95512,Private room in rental unit,...,,,1.0,"[Air conditioning, Wifi, Kitchen, Heating]",$66.00,50,2009-05-28,2019-12-02,4.52,0.28
2,666c99367177588cd31b62d9,6848,https://www.airbnb.com/rooms/6848,Rental unit in Brooklyn · ★4.58 · 2 bedrooms ·...,,Williamsburg,Brooklyn,40.70935,-73.95342,Entire rental unit,...,1 bath,,1.0,"[Cooking basics, Microwave, Extra pillows and ...",$81.00,191,2009-05-25,2023-08-14,4.58,1.08
3,666c99367177588cd31b62da,6990,https://www.airbnb.com/rooms/6990,Rental unit in New York · ★4.88 · 1 bedroom · ...,"Location: Five minutes to Central Park, Museum...",East Harlem,Manhattan,40.78778,-73.94759,Private room in rental unit,...,1 shared bath,,1.0,"[Hair dryer, Air conditioning, Fire extinguish...",$70.00,246,2009-10-28,2023-08-14,4.88,1.43
4,666c99367177588cd31b62db,6872,https://www.airbnb.com/rooms/6872,Condo in New York · 1 bedroom · 1 bed · 1 shar...,This sweet Harlem sanctuary is a 10-20 minute ...,East Harlem,Manhattan,40.80107,-73.94255,Private room in condo,...,1 shared bath,,1.0,"[Fire extinguisher, Long term stays allowed, H...",$65.00,1,2022-06-05,2022-06-05,5.0,0.05


## Clean the data

In [8]:
listings_df.dtypes

_id                                     object
id                                       int64
listing_url                             object
name                                    object
neighborhood_overview                   object
neighbourhood_cleansed                  object
neighbourhood_group_cleansed            object
latitude                               float64
longitude                              float64
property_type                           object
room_type                               object
accommodates                             int64
bathrooms                               object
bathrooms_text                          object
bedrooms                               float64
beds                                   float64
amenities                               object
price                                   object
number_of_reviews                        int64
first_review                    datetime64[ns]
last_review                     datetime64[ns]
review_scores

In [9]:
reduced_listings = listings_df[['name', 'neighbourhood_cleansed', 'neighbourhood_group_cleansed'\
                               ,'latitude','longitude','room_type', 'amenities', 'price'\
                                , 'number_of_reviews', 'review_scores_rating']]
reduced_listings.dtypes

name                             object
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
latitude                        float64
longitude                       float64
room_type                        object
amenities                        object
price                            object
number_of_reviews                 int64
review_scores_rating            float64
dtype: object

In [10]:
reduced_listings['price'] = reduced_listings['price'].str.replace(",", "")
reduced_listings['price'] = reduced_listings['price'].str.replace("$", "")

reduced_listings['price']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_listings['price'] = reduced_listings['price'].str.replace(",", "")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  reduced_listings['price'] = reduced_listings['price'].str.replace("$", "")


0        240.00
1         66.00
2         81.00
3         70.00
4         65.00
          ...  
28127    225.00
28128    101.00
28129    105.00
28130    118.00
28131    285.00
Name: price, Length: 28132, dtype: object

In [11]:
reduced_listings = reduced_listings.astype({
    "price": float
})

reduced_listings.dtypes

name                             object
neighbourhood_cleansed           object
neighbourhood_group_cleansed     object
latitude                        float64
longitude                       float64
room_type                        object
amenities                        object
price                           float64
number_of_reviews                 int64
review_scores_rating            float64
dtype: object

In [12]:
reduced_listings.to_json("data/cleaned_listings.json", orient="records")


# Aggregate Functions

In [13]:
boroughs = reduced_listings["neighbourhood_group_cleansed"].unique()

## Find Average Price and Count of Room Types:
### Grouped by borough, neighbourhood, and room type

In [14]:
# avg price
def nbhd_price_avg(df):
    avg_price = df.groupby(['neighbourhood_group_cleansed', "neighbourhood_cleansed", 'room_type'])["price"].mean()
    avg_price_boro = avg_price.round(2).reset_index() 
    return avg_price_boro

price_per_nbhd = nbhd_price_avg(reduced_listings)

In [15]:
# room counts
def nbhd_room_type_count(df):
    roomtype_grouped = df.groupby(['neighbourhood_group_cleansed','neighbourhood_cleansed', 'room_type']).size().reset_index(name='room_count')
    return roomtype_grouped

num_rooms_per_nbhd = nbhd_room_type_count(reduced_listings)

In [16]:
# Merge both DataFrames
combined_df = price_per_nbhd.merge(num_rooms_per_nbhd, how='inner', on=['neighbourhood_group_cleansed','neighbourhood_cleansed', 'room_type'])

combined_df = combined_df.rename(columns={'neighbourhood_group_cleansed': 'Borough', 'neighbourhood_cleansed': 'Neighbourhood', 'room_type': "Room Type", 'price': "Average Price", 'room_count': 'Room Count'})

combined_df.to_json("data/group_aggregates.json", orient="records")

combined_df.head()

Unnamed: 0,Borough,Neighbourhood,Room Type,Average Price,Room Count
0,Bronx,Allerton,Entire home/apt,122.36,22
1,Bronx,Allerton,Private room,71.53,15
2,Bronx,Baychester,Entire home/apt,112.8,10
3,Bronx,Baychester,Private room,76.31,13
4,Bronx,Baychester,Shared room,180.0,2


## Specific Amenity counts per borough: 

In [123]:
def amenities_by_nbhd(borough, keywords):
    
    listings = borough_listings(borough)
    keywords = ["Wifi", "Kitchen", "Air conditioning", "Backyard", "Pets allowed", "Washer", "Dryer", "Long term stays allowed", "Elevator", "Dedicated workspace"]
    
    # Convert amenities column to string and split into elements using 'strip':
    
    listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')
    
    # Explode the amenities column:
    
    listings_exploded = listings.explode('amenities').copy()
    
    # Initialize a DataFrame to store the counts
    
    neighborhood_counts = listings['neighbourhood_cleansed'].drop_duplicates().to_frame()
    
    # Count the occurrences of each keyword in the amenities column by neighborhood
    
    for keyword in keywords:
        keyword_count = listings_exploded[listings_exploded['amenities'].str.contains(keyword, case=False)]
        keyword_count = keyword_count.groupby('neighbourhood_cleansed').size().reset_index(name=keyword)
        neighborhood_counts = pd.merge(neighborhood_counts, keyword_count, on='neighbourhood_cleansed', how='left')
        
    # Fill NaN values with 0
    neighborhood_counts = neighborhood_counts.fillna(0)
    neighborhood_counts.to_json(f"data/amenities_counts_nbhd_{borough}.json", orient="records")
    return neighborhood_counts
    
for borough in boroughs:
    print(amenities_by_nbhd(borough, keywords))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')


   neighbourhood_cleansed  Wifi  Kitchen  Air conditioning  Backyard  \
0                 Midtown  1063      831               898      22.0   
1             East Harlem   550      506               416      53.0   
2          Hell's Kitchen  1042      910               885      56.0   
3            East Village   772      746               610      54.0   
4                 Chelsea   559      507               451      33.0   
5         Lower East Side   473      443               387      17.0   
6         Upper East Side  1040     1009               886      63.0   
7                Kips Bay   263      252               213       9.0   
8                  Harlem  1397     1295               951     158.0   
9            West Village   360      350               290      32.0   
10        Upper West Side   963      899               738      48.0   
11    Morningside Heights   129      129                95       1.0   
12              Chinatown   218      183               161      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')


       neighbourhood_cleansed  Wifi  Kitchen  Air conditioning  Backyard  \
0          Bedford-Stuyvesant  2061     2000              1466     419.0   
1                Williamsburg  1773     1708              1387     237.0   
2                 Fort Greene   251      239               181      40.0   
3                  Greenpoint   514      464               422      53.0   
4                Clinton Hill   274      265               216      56.0   
5             Carroll Gardens   129      126               105      25.0   
6                  Park Slope   246      233               183      60.0   
7                 South Slope   123      118                97      30.0   
8               East Flatbush   430      399               259     115.0   
9                 Boerum Hill    91       87                69      17.0   
10           Prospect Heights   162      157               124      27.0   
11           Brooklyn Heights    65       62                43       5.0   
12          

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')


   neighbourhood_cleansed  Wifi  Kitchen  Air conditioning  Backyard  \
0        Long Island City   327    285.0             255.0      61.0   
1                Woodside   253    235.0              90.0      14.0   
2               Sunnyside   219    201.0             128.0      21.0   
3               Ridgewood   221    210.0             136.0      35.0   
4          Middle Village    23     23.0              15.0       4.0   
5        Ditmars Steinway   160    150.0              90.0      32.0   
6                 Jamaica   258    223.0             157.0     100.0   
7                 Astoria   512    471.0             340.0      39.0   
8         Jackson Heights   170    156.0             104.0      24.0   
9          Rockaway Beach    57     55.0              32.0      20.0   
10              Rego Park    64     58.0              33.0      10.0   
11                Maspeth    79     71.0              48.0      16.0   
12              Briarwood    38     27.0              26.0      

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  listings['amenities'] = listings['amenities'].astype(str).str.strip('[]').str.split(', ')


## Create the Endpoints

In [None]:
from flask import Flask, jsonify
from flask_cors import CORS
from pymongo import MongoClient

app = Flask(__name__)
CORS(app)


client = MongoClient("mongodb://localhost:27017/")
db = client['listings_db']
collection = db['nyc_listings']

@app.route('/aggregates', methods=['GET'])
def get_aggregates():
    aggregates = collection.find()  # Use find() instead of findAll()
    data = []
    for data in aggregates:
        features = restroom.get('features', [])
        for feature in features:
            properties = feature.get('properties', {})
            geometry = feature.get('geometry', {})
            data.append({
                'type': feature.get('type'),
                'properties': {
                    'website': properties.get('website'),
                    'latitude': properties.get('latitude'),
                    'longitude': properties.get('longitude'),
                    'operator': properties.get('operator'),
                    'status': properties.get('status'),
                    'location_type': properties.get('location_type'),
                    'facility_name': properties.get('facility_name'),
                    'hours_of_operation': properties.get('hours_of_operation')
                },
                'geometry': {
                    'type': geometry.get('type'),
                    'coordinates': geometry.get('coordinates')
                }
            })
    return jsonify(data)

if __name__ == '__main__':
    app.run(debug=True)

## ToDo Graphing:

bar chart
    stacked amenities by neighbourhood
        (potential drop-down for specified amenities)

pie chart
    percent of each room type for the borough

scatter plot
    average price distribution by room type
        (a scatter plot for each room type with regression line)

box plot
    price by borough
        (the price of all)

