# NYC Apartment Search by Group 46

_[Project prompt](https://docs.google.com/document/d/1BYVyFBDcTywdUlanH0ysfOrNWPgl7UkqXA7NeewTzxA/edit#heading=h.bpxu7uvknnbk)_


Imagine that your apartment lease is nearing its end, and it's time to find  a new home in the heart of New York City! To guide us in this quest, we rely on a prudent budget, a preference for a serene neighborhood, and a desire for a touch of greenery. Leveraging the NYC Open Data, including 311 complaints, tree census, and Zillow's historic monthly rent averages, we embark on a data-driven exploration. Let's get started!

## Setup

We begin by importing the necessary libraries and modules. These include tools for data manipulation, visualization, and database interactions. Also, our project relies on a PostgreSQL database for storing and retrieving data. Below are the configuration details. We specify the locations for data files, such as shapefiles and CSVs, as well as constants like API tokens and base URLs for accessing external data sources.

In [1]:
# All import statements needed for the project, for example:

import json
import os
import pathlib
import subprocess
import urllib.parse
from math import ceil
import geoalchemy2 as gdb
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import seaborn as sns
import shapely
import sqlalchemy as db
from geopy.distance import geodesic
from shapely import wkt
from shapely.geometry import Point, Polygon, mapping

# SQLAlchemy imports for database interaction
from sqlalchemy import create_engine, Column, Integer, String, Float, DateTime, MetaData, Table, text, func
from sqlalchemy.ext.hybrid import hybrid_property
from sqlalchemy.orm import Session, sessionmaker, declarative_base, column_property
from geoalchemy2 import Geometry, WKTElement, functions as geo_func

# GeoAlchemy2 extensions for geospatial data in SQLAlchemy
from geoalchemy2.functions import ST_Point, ST_Distance

from sqlalchemy.dialects.postgresql import insert
from geoalchemy2 import WKTElement


from math import ceil
import pandas as pd
import geopandas as gpd
import requests
from pathlib import Path


from matplotlib.animation import FuncAnimation
from IPython.display import HTML

import psycopg2
from shapely import wkb

In [2]:
# Where data files will be read from/written to - this should already exist
DATA_DIR = pathlib.Path("data")
ZIPCODE_DATA_FILE = DATA_DIR / "nyc_zipcodes.shp"
ZILLOW_DATA_FILE = DATA_DIR / "zillow_rent_data.csv"

NYC_DATA_APP_TOKEN = "egpaU4U1YY3mBGHMmdNqtmvpv"
BASE_NYC_DATA_URL = "https://data.cityofnewyork.us/"
NYC_DATA_311 = "erm2-nwe9.geojson"
NYC_DATA_TREES = "5rq2-4hqu.geojson"

DB_NAME = "group46project"
DB_USER = "postgres"
DB_URL = f"postgresql+psycopg2://{DB_USER}@localhost/{DB_NAME}"
DB_SCHEMA_FILE = "schema.sql"

# directory where DB queries for Part 3 will be saved
QUERY_DIR = pathlib.Path("queries")

##add new one
# Create the data directory if it doesn't exist
DATA_DIR.mkdir(parents=True, exist_ok=True)

In [3]:
# Make sure the QUERY_DIRECTORY exists
if not QUERY_DIR.exists():
    QUERY_DIR.mkdir()

## Part 1: Data Preprocessing

In this stage, we take raw data and transform it into a clean and usable format. This process is important for making informed decisions about our future apartment because only with cleaned and organized data, can we analyze and visualize it. We are handling the following data:

**Zipcode Data:** We leverage data on zipcodes, ensuring that each area is accurately represented. This includes removing unnecessary information and aligning the data to a standardized coordinate system.

**311 Complaints Data:** We use the information provided by 311 complaints. By focusing on relevant details like the type of complaints and their locations, we gain insights into the quality of life in each area and make informed decisions about the desirability of potential neighborhoods for our new apartment.

**Tree Data:** We explore the distribution of trees across neighborhoods, looking at factors like health and species diversity. By organizing information of these trees and their location (zipcodes, latitude, and longitude), we gain valuable insights into the green landscape of New York City.

**Zillow's Rent Data:** We clean and organize this data to understand the rental prices over time, helping us make financially sound decisions. Through these steps, we ensure our data is accurate, complete, and ready for finding the apartment in the New York City.


In [4]:
def load_and_clean_zipcodes(zipcode_datafile):
    """
    Load and clean zipcode data from a shapefile.

    This function reads a shapefile containing zipcode data, normalizes its coordinate
    reference system to EPSG 4326 for consistency, retains only relevant columns,
    and converts the zipcode to an integer format.

    Parameters:
    - `zipcode_datafile` (pathlib.Path): The file path to the shapefile containing zipcode data.

    Returns:
    - `gdf_cleaned` (geopandas.GeoDataFrame): A cleaned GeoDataFrame containing zipcode data.
      The DataFrame includes columns 'zipcode' and 'geometry', with zipcode converted to integer.

    Raises:
    - `RuntimeError`: If an unexpected error occurs during the process.

    """
    
    try:
        # Load the shapefile into a GeoDataFrame
        gdf = gpd.read_file(zipcode_datafile)
        
        # Normalize to EPSG 4326 coordinate system for consistency across datasets
        gdf_crs_normalized = gdf.to_crs(epsg=4326)
        
        # Select and rename relevant columns for further analysis
        gdf_cleaned = gdf_crs_normalized[["ZIPCODE", "geometry"]].copy()
        gdf_cleaned.columns = ["zipcode", "geometry"]

        # Convert zipcode to integer for uniform data type
        gdf_cleaned['zipcode'] = gdf_cleaned['zipcode'].astype(int)
    except Exception as e:
        # General exception for unforeseen errors
        raise RuntimeError(f"An unexpected error occurred: {e}")

    return gdf_cleaned

In [5]:
def download_and_clean_311_data(start_date='2015-01-01', end_date='2023-10-01', chunk_size=29999999):
    """
    Download and clean 311 data from New York City for a specified date range.

    This function downloads 311 service requests data from a specified start date to an end date, 
    cleans the data by dropping missing values and converting data types, and then transforms 
    it into a geopandas GeoDataFrame.

    Parameters:
    - `start_date` (str): The start date of the data range. Defaults to '2015-01-01'.
    - `end_date` (str): The end date of the data range. Defaults to '2023-10-01'.
    - `chunk_size` (int, optional): The size of each data chunk to be retrieved. Defaults to 29999999.

    Returns:
    - `geodf_311_data` (geopandas.GeoDataFrame): A GeoDataFrame containing cleaned 311 data, with columns including
      'created_date', 'complaint_type', 'zipcode', and 'geometry'.
      
    Note:
    The 311 data is obtained from the New York City open data API. The resulting GeoDataFrame is saved 
    as a CSV file named '311_DATA.csv' in the 'data' directory.
    """
    
    url = 'https://data.cityofnewyork.us/resource/erm2-nwe9.json'
    full_data = pd.DataFrame()

    # Calculate the number of chunks needed based on the date range and chunk size
    num_chunks = ceil((pd.to_datetime(end_date) - pd.to_datetime(start_date)).days / chunk_size)

    for chunk in range(num_chunks):
        offset = chunk * chunk_size
        date_filter = f"created_date between '{start_date}' and '{end_date}'"
        params = {
            '$select': 'created_date, complaint_type, incident_zip, latitude, longitude',
            '$where': date_filter,
            '$limit': chunk_size,
            '$offset': offset
        }
        headers = {'X-App-Token': NYC_DATA_APP_TOKEN}
        
        #request data
        response = requests.get(url, headers=headers, params=params)

        data_chunk = pd.DataFrame(response.json())
        # Clean the data: drop missing values and convert data types
        data_chunk_cleaned = data_chunk.dropna(subset=['incident_zip', 'latitude', 'longitude'], how='any').copy()
        data_chunk_cleaned['created_date'] = pd.to_datetime(data_chunk_cleaned['created_date'])
        data_chunk_cleaned['zipcode'] = pd.to_numeric(data_chunk_cleaned['incident_zip'], errors='coerce').dropna().astype(int)
        data_chunk_cleaned = data_chunk_cleaned.drop(columns=['incident_zip'])
        full_data = pd.concat([full_data, data_chunk_cleaned], ignore_index=True)

    # Ensure the directory for data saving exists
    DATA_DIR = Path('data')
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    full_data.to_csv(DATA_DIR / '311_DATA.csv', index=False)
    
    # Convert to GeoDataFrame
    geometry = gpd.points_from_xy(full_data['longitude'], full_data['latitude'])
    geodf_311_data = gpd.GeoDataFrame(full_data, geometry=geometry, crs='EPSG:4326')
    geodf_311_data = geodf_311_data.drop(columns=['longitude', 'latitude'])

    return geodf_311_data

In [6]:
def download_and_clean_tree_data():
    """
    Download and clean tree data from New York City open data.

    This function downloads tree data from the New York City open data API, fills missing values 
    for certain columns, and transforms it into a GeoPandas GeoDataFrame, ready for further analysis.

    Returns:
    - `geodf_tree_data` (geopandas.GeoDataFrame): A GeoDataFrame containing cleaned tree data, 
      with columns including 'created_at', 'tree_id', 'health', 'status', 'spc_common', 
      'zipcode', 'latitude', 'longitude'.

    Raises:
    - `RuntimeError`: If an error occurs during the download process.

    Note:
    The resulting GeoDataFrame is saved as a CSV file named 'TREE_DATA.csv' in the 'data' directory.
 
    """   
    
    url = 'https://data.cityofnewyork.us/resource/5rq2-4hqu.json'
    params = {
        '$select': 'created_at, tree_id, health, status, spc_common, zipcode, latitude, longitude'
    }
    headers = {'X-App-Token': NYC_DATA_APP_TOKEN}

    # Handle network and request errors
    try:
        response = requests.get(url, headers=headers, params=params)
        response.raise_for_status()  # Will raise an HTTPError for unsuccessful status codes
    except requests.exceptions.RequestException as e:
        raise RuntimeError(f"Error downloading data: {e}")

    data = pd.DataFrame(response.json())
    # Fill missing values for specific columns
    columns_to_fillna = ['health', 'status', 'spc_common']
    data[columns_to_fillna] = data[columns_to_fillna].fillna('None')

    # Drop rows with missing zipcode, latitude, or longitude, and copy the dataframe
    data_cleaned = data.dropna(subset=['zipcode', 'latitude', 'longitude'], how='any').copy()

    # Convert 'created_at' to datetime and 'zipcode' to integer, handling errors
    data_cleaned['created_at'] = pd.to_datetime(data_cleaned['created_at'])
    data_cleaned['zipcode'] = pd.to_numeric(data_cleaned['zipcode'], errors='coerce').dropna().astype(int)

    # Ensure the directory for data saving exists
    DATA_DIR = Path('data')
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    data_cleaned.to_csv(DATA_DIR / 'TREE_DATA.csv', index=False)
    
    # Create GeoDataFrame with appropriate CRS
    geometry = gpd.points_from_xy(data_cleaned['longitude'].astype(float), data_cleaned['latitude'].astype(float))
    crs = 'EPSG:4326'
    geodf_tree_data = gpd.GeoDataFrame(data_cleaned, geometry=geometry, crs=crs)
    geodf_tree_data = geodf_tree_data.drop(columns=['longitude', 'latitude'])  # Remove original coordinate columns

    return geodf_tree_data

In [7]:
def load_and_clean_zillow_data():
    """
    Load and clean Zillow rent data for New York City.

    This function loads Zillow rent data, filters for New York City data, cleans and 
    transforms the data into a more usable format for analysis.

    Returns:
    - `zillow_ny` (pandas.DataFrame): Cleaned DataFrame with columns including 'zipcode', 'date', and 'rent'.

    Raises:
    - `FileNotFoundError`: If the file specified by `file_path` is not found.

    Note:
    The Zillow rent data is expected to be stored in a CSV file named 'zillow_rent_data.csv'
    in the 'data' directory. The cleaned data is saved as 'cleaned_zillow_data.csv' in the same directory.
    """
    
    file_path = DATA_DIR / 'zillow_rent_data.csv'

    # Ensure the directory exists before reading the file
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    try:
        df_zillow = pd.read_csv(file_path)
    except FileNotFoundError:
        raise FileNotFoundError(f"File not found: {file_path}")

    # Filter data for New York City
    zillow_ny = df_zillow[df_zillow['City'] == 'New York'].copy()

    # Drop unnecessary columns
    columns_to_delete = ['RegionID', 'SizeRank', 'RegionType', 'StateName', 'Metro']
    zillow_ny.drop(columns=columns_to_delete, axis=1, inplace=True)

    # Reset index and rename columns
    zillow_ny.reset_index(drop=True, inplace=True)
    zillow_ny.rename(columns={'RegionName': 'zipcode'}, inplace=True)

    # Replace 0 with NaN and convert column names to lowercase
    zillow_ny.replace(0, np.nan, inplace=True)
    zillow_ny.columns = zillow_ny.columns.str.lower()

    # Reshape the dataframe and convert 'date' to datetime format
    zillow_ny = pd.melt(zillow_ny, id_vars=['zipcode', 'state', 'city', 'countyname'], 
                        var_name='date', value_name='rent')
    zillow_ny['date'] = pd.to_datetime(zillow_ny['date'], errors='coerce', format='%Y-%m-%d')

    # Round 'rent' to 2 decimal places and drop rows with NaN in 'rent'
    zillow_ny['rent'] = zillow_ny['rent'].round(2)
    zillow_ny.dropna(subset=['rent'], inplace=True)

    # Save the cleaned data
    zillow_ny.to_csv(DATA_DIR / 'cleaned_zillow_data.csv', index=False)

    return zillow_ny.drop(columns=['state', 'city', 'countyname'])

In [8]:
def load_all_data():
    """
    Load all datasets that we cleaned.

    Returns:
    - `geodf_zipcode_data` (geopandas.GeoDataFrame): Cleaned GeoDataFrame containing zipcode data.
    - `geodf_311_data` (geopandas.GeoDataFrame): Cleaned GeoDataFrame containing 311 data.
    - `geodf_tree_data` (geopandas.GeoDataFrame): Cleaned GeoDataFrame containing tree data.
    - `df_zillow_data` (pandas.DataFrame): Cleaned DataFrame containing Zillow rent data.
    """
    geodf_zipcode_data = load_and_clean_zipcodes(ZIPCODE_DATA_FILE)
    geodf_311_data = download_and_clean_311_data()
    geodf_tree_data = download_and_clean_tree_data()
    df_zillow_data = load_and_clean_zillow_data()
    return (
        geodf_zipcode_data,
        geodf_311_data,
        geodf_tree_data,
        df_zillow_data
    )

In [9]:
geodf_zipcode_data, geodf_311_data, geodf_tree_data, df_zillow_data = load_all_data()

In [10]:
# Show basic info about each dataframe
geodf_zipcode_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype   
---  ------    --------------  -----   
 0   zipcode   263 non-null    int32   
 1   geometry  263 non-null    geometry
dtypes: geometry(1), int32(1)
memory usage: 3.2 KB


In [11]:
# Show first 5 entries about each dataframe
geodf_zipcode_data.head()

Unnamed: 0,zipcode,geometry
0,11436,"POLYGON ((-73.80585 40.68291, -73.80569 40.682..."
1,11213,"POLYGON ((-73.93740 40.67973, -73.93487 40.679..."
2,11212,"POLYGON ((-73.90294 40.67084, -73.90223 40.668..."
3,11225,"POLYGON ((-73.95797 40.67066, -73.95576 40.670..."
4,11218,"POLYGON ((-73.97208 40.65060, -73.97192 40.650..."


In [12]:
geodf_311_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 23030861 entries, 0 to 23030860
Data columns (total 4 columns):
 #   Column          Dtype         
---  ------          -----         
 0   created_date    datetime64[ns]
 1   complaint_type  object        
 2   zipcode         float64       
 3   geometry        geometry      
dtypes: datetime64[ns](1), float64(1), geometry(1), object(1)
memory usage: 702.8+ MB


In [13]:
geodf_311_data.head()

Unnamed: 0,created_date,complaint_type,zipcode,geometry
0,2023-09-30 23:59:58,Noise - Street/Sidewalk,11226.0,POINT (-73.95918 40.65567)
1,2023-09-30 23:59:38,Noise - Residential,11361.0,POINT (-73.78752 40.76676)
2,2023-09-30 23:59:35,Noise - Commercial,10002.0,POINT (-73.98487 40.71950)
3,2023-09-30 23:59:34,Noise - Residential,11435.0,POINT (-73.79729 40.68750)
4,2023-09-30 23:59:28,Noise - Residential,11226.0,POINT (-73.95795 40.65220)


In [14]:
geodf_tree_data.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype         
---  ------      --------------  -----         
 0   created_at  1000 non-null   datetime64[ns]
 1   tree_id     1000 non-null   object        
 2   health      1000 non-null   object        
 3   status      1000 non-null   object        
 4   spc_common  1000 non-null   object        
 5   zipcode     1000 non-null   int32         
 6   geometry    1000 non-null   geometry      
dtypes: datetime64[ns](1), geometry(1), int32(1), object(4)
memory usage: 50.9+ KB


In [15]:
geodf_tree_data.head()

Unnamed: 0,created_at,tree_id,health,status,spc_common,zipcode,geometry
0,2015-08-27,180683,Fair,Alive,red maple,11375,POINT (-73.84422 40.72309)
1,2015-09-03,200540,Fair,Alive,pin oak,11357,POINT (-73.81868 40.79411)
2,2015-09-05,204026,Good,Alive,honeylocust,11211,POINT (-73.93661 40.71758)
3,2015-09-05,204337,Good,Alive,honeylocust,11211,POINT (-73.93446 40.71354)
4,2015-08-30,189565,Good,Alive,American linden,11215,POINT (-73.97598 40.66678)


In [16]:
df_zillow_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9039 entries, 5 to 15224
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype         
---  ------   --------------  -----         
 0   zipcode  9039 non-null   int64         
 1   date     9039 non-null   datetime64[ns]
 2   rent     9039 non-null   float64       
dtypes: datetime64[ns](1), float64(1), int64(1)
memory usage: 282.5 KB


In [17]:
df_zillow_data.head()

Unnamed: 0,zipcode,date,rent
5,11226,2015-01-31,1944.61
7,10025,2015-01-31,3068.95
13,11206,2015-01-31,2482.83
14,11221,2015-01-31,2125.74
20,11235,2015-01-31,1687.79


## Part 2: Storing Data

In this phase, we transition from the four datasets that we cleaned from Part one to the PostgreSQL database, creating a foundation for seamless data querying and analysis. 

First, we need to create a database to store data!

In [None]:
def setup_new_postgis_database(username, db_name):
    """
    Set up a new PostgreSQL database with the PostGIS extension.

    Args:
    - username (str): The PostgreSQL username.
    - db_name (str): The name for the new database.

    """
    # Create a new database
    subprocess.run(['createdb', db_name])

    # Enable PostGIS extension
    subprocess.run(['psql', '-U', username, '--dbname', db_name, '-c', 'CREATE EXTENSION postgis;'])

In [None]:
setup_new_postgis_database(DB_USER, DB_NAME)

### Creating Tables

After creating the datebase, we can now create tables for the datasets we gained from Part one, which is pretty much like a virtual spreadsheet that organizes specific types of information.

For example, we have a 'zipcodes' table to store details about different zip codes, a 'nyc311s' table to store details about those 311 complaints, a 'trees' table for information about trees, and a 'zillow_datas' table for rental data. 


In [None]:
engine = db.create_engine(DB_URL)

#### Using SQLAlchemy

In [None]:
Base = declarative_base()

class ZipCode(Base):
    """
    Represents the ZipCode data with geometry as POLYGON.
    """
    __tablename__ = 'zipcodes'

    id = Column(Integer, primary_key=True)
    zipcode = Column(Integer, index=True)  # Added index for faster query performance
    geometry = Column(Geometry('POLYGON'))
    
class NYC311(Base):
    """
    Represents the NYC 311 service request data with geometry as POINT.
    """
    __tablename__ = 'nyc311s'

    id = Column(Integer, primary_key=True)
    created_date = Column(DateTime)
    complaint_type = Column(String)
    zipcode = Column(Integer, index=True)
    geometry = Column(Geometry(geometry_type='POINT', srid=4326))

    
class Tree(Base):
    """
    Represents the Tree data with geometry as POINT.
    """
    __tablename__ = "trees"

    id = Column(Integer, primary_key=True)
    created_at = Column(DateTime)
    tree_id = Column(String)
    health = Column(String)
    status = Column(String)
    spc_common = Column(String)
    zipcode = Column(Integer, index=True)
    geometry = Column(Geometry(geometry_type='POINT', srid=4326))

class ZillowData(Base):
    """
    Represents the Zillow rent data.
    """
    __tablename__ = 'zillow_datas'

    id = Column(Integer, primary_key=True)
    zipcode = Column(Integer, index=True)
    date = Column(DateTime)
    rent = Column(Float)

In [None]:
# Create the schema.sql file
with open(DB_SCHEMA_FILE, 'w') as file:
    for table in Base.metadata.tables.values():
        file.write(f"CREATE TABLE IF NOT EXISTS {table.name} (\n")
        for column in table.columns:
            file.write(f"   {column.name} {column.type},\n")
        file.write(");\n\n")

In [None]:
Base.metadata.create_all(engine)

### Add Data to Database

After setting up our tables, now we can add data to these tables in our database! Basically, we go through every dataset, including zip code data, 311 complaints data, tree data, and Zillow data that we organized in Part One, and store every piece of information in the corresponding tables!


In [None]:
#### Using SQLAlchemy

In [None]:
Session = db.orm.sessionmaker(bind=engine)
session = Session()

In [None]:
# Insert data into the database
zipcodes = []
for index, row in geodf_zipcode_data.iterrows():
    zipcode = ZipCode(
        zipcode=row['zipcode'],
        geometry=f'SRID=4326;{row["geometry"].wkt}'  # Directly use the WKT from the geometry column
    )
    zipcodes.append(zipcode)

try:
    session.add_all(zipcodes)  # Add all zipcode objects at once for efficiency
    session.commit()  # Commit the transaction
except Exception as e:
    session.rollback()  # Rollback in case of any error
    raise e


In [None]:
# Create a GeoDataFrame with an "id" column
geodf_311_data_geometry = gpd.GeoDataFrame(
    geodf_311_data,
    geometry='geometry',
    crs="EPSG:4326"  # Set the coordinate reference system if not already set
)

# Add an "id" column to the GeoDataFrame
geodf_311_data_geometry['id'] = range(1, len(geodf_311_data_geometry) + 1)

# Convert the GeoDataFrame to the specified table using to_postgis
geodf_311_data_geometry.to_postgis(
    "nyc311s",  # Specify the table name
    engine,
    if_exists="replace",  # Use 'replace' or 'fail' based on your requirements
    index=False,  # Set to True if you want to include the index in the database
    dtype={"geometry": Geometry("POINT", srid=4326)},  # Specify the data type for the geometry column
)

In [None]:
tree_records = []
for index, row in geodf_tree_data.iterrows():
    tree_record = Tree(
        created_at=row['created_at'],  
        tree_id=row['tree_id'],   
        health=row['health'],  
        status=row['status'],  
        spc_common=row['spc_common'],  
        zipcode=row['zipcode'],   
        geometry=f'SRID=4326;{row["geometry"].wkt}'  # Directly use the WKT from the geometry column
    )
    tree_records.append(tree_record)

try:
    session.add_all(tree_records)  # Add all Tree records at once for efficiency
    session.commit()  # Commit the transaction
except Exception as e:
    session.rollback()  # Rollback in case of any error
    raise e
