In [1]:
import pandas as pd
import requests
import geopandas as gpd
from shapely.geometry import Point

In [2]:
app_token = "Rl5BUiRawpr4H2LA9OQeKB47L"

# 1 Data Preprocessing

## 1.1 Download 311 data and 2015 tree cencus data

In [3]:
from datetime import datetime, timedelta

def generate_month_ranges(start_date, end_date):
    current = start_date
    temp = 0
    while current < end_date:
        temp += 1
        month_end = current.replace(day=1) + timedelta(days=32)
        month_end = month_end.replace(day=1) - timedelta(days=1) # 9 30
        yield (current, month_end)
        current = month_end + timedelta(days=1)
        if temp == 100:
            break

#Set start date and end ate
start_date = datetime(2018, 10, 1)
end_date = datetime(2023, 9, 30)

#Generate month's range
month_ranges = list(generate_month_ranges(start_date, end_date))

In [4]:
import os

for start, end in month_ranges:
    year = start.year
    csv_file = f"data/complaints_data_{year}.csv"
    print(f"Downloading data for the period: {start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}")

    query = f"created_date between '{start.strftime('%Y-%m-%dT%H:%M:%S')}' and '{end.strftime('%Y-%m-%dT%H:%M:%S')}'"
    response = requests.get(
        url="https://data.cityofnewyork.us/resource/erm2-nwe9.json",
        params={"$$app_token": app_token, "$where": query, "$limit": 999999,
                "$select": "unique_key, created_date, closed_date, agency,  complaint_type, descriptor, location_type, incident_zip, latitude, longitude, borough"
                }
    )
    data = response.json()
    batch_df = pd.DataFrame(data)

    # 根据年份写入或追加到对应的CSV文件
    mode = 'a' if os.path.exists(csv_file) else 'w'
    batch_df.to_csv(csv_file, mode=mode, index=False, header=not os.path.exists(csv_file))

print("Data download complete.")

Downloading data for the period: 2018-10-01 to 2018-10-31
Downloading data for the period: 2018-11-01 to 2018-11-30
Downloading data for the period: 2018-12-01 to 2018-12-31
Downloading data for the period: 2019-01-01 to 2019-01-31
Downloading data for the period: 2019-02-01 to 2019-02-28
Downloading data for the period: 2019-03-01 to 2019-03-31
Downloading data for the period: 2019-04-01 to 2019-04-30
Downloading data for the period: 2019-05-01 to 2019-05-31
Downloading data for the period: 2019-06-01 to 2019-06-30
Downloading data for the period: 2019-07-01 to 2019-07-31
Downloading data for the period: 2019-08-01 to 2019-08-31
Downloading data for the period: 2019-09-01 to 2019-09-30
Downloading data for the period: 2019-10-01 to 2019-10-31
Downloading data for the period: 2019-11-01 to 2019-11-30
Downloading data for the period: 2019-12-01 to 2019-12-31
Downloading data for the period: 2020-01-01 to 2020-01-31
Downloading data for the period: 2020-02-01 to 2020-02-29
Downloading da

In [5]:
tree_data = requests.get(url="https://data.cityofnewyork.us/resource/5rq2-4hqu.json",
                         params={"$$app_token": app_token, "$limit": 99999999999999999999}).json()
tree_df = pd.DataFrame(tree_data)
tree_df.to_csv("data/tree.csv")

## 1.2 Data Cleaning & Filtering

### 1.2.1 311 Data Cleaning & Filtering

In [6]:
def clean_311_data(datafile):
    import pandas as pd
    import numpy as np
    import datetime
    import geopandas as gpd
    from shapely.geometry import Point
    
    #Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0])
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    #Read the file
    df = pd.read_csv(datafile, low_memory=False)

    #fix the zip
    df['incident_zip'] = df['incident_zip'].apply(fix_zip)

    df = df.dropna(how='any')

    #get rid of unspecified boroughs
    df = df[df['borough'] != 'Unspecified']

    df['latitude'] = df['latitude'].astype('float64')
    df['longitude'] = df['longitude'].astype('float64')

    #Converts the 'closed_date','created_date' column into a datetime object
    df['created_date'] = pd.to_datetime(df['created_date'])
    df['closed_date'] = pd.to_datetime(df['closed_date'])
    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)

    return gdf

In [7]:
complaints_2018 = clean_311_data('data/complaints_data_2018.csv')
complaints_2018.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 1513084 entries, 0 to 2022503
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype         
---  ------          --------------    -----         
 0   unique_key      1513084 non-null  int64         
 1   created_date    1513084 non-null  datetime64[ns]
 2   closed_date     1513084 non-null  datetime64[ns]
 3   agency          1513084 non-null  object        
 4   complaint_type  1513084 non-null  object        
 5   descriptor      1513084 non-null  object        
 6   location_type   1513084 non-null  object        
 7   incident_zip    1513084 non-null  object        
 8   latitude        1513084 non-null  float64       
 9   longitude       1513084 non-null  float64       
 10  borough         1513084 non-null  object        
 11  geometry        1513084 non-null  geometry      
dtypes: datetime64[ns](2), float64(2), geometry(1), int64(1), object(6)
memory usage: 150.1+ MB


### 1.2.2 2015 Tree census Data Cleaning & Filtering

In [8]:
tree = pd.read_csv('data/tree.csv', usecols=['tree_id', 'the_geom',   'spc_common', 'status', 'health', 'zipcode', 'boroname', 'latitude', 'longitude'])
tree.to_csv('data/tree_data.csv',index=False)

In [9]:
def clean_tree_data(datafile):
    import pandas as pd
    import numpy as np
    import geopandas as gpd
    from shapely.geometry import Point

    # Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0]) 
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    # Read the file
    df = pd.read_csv(datafile)

    # Fix the zip
    df['zipcode'] = df['zipcode'].apply(fix_zip)

    df = df.dropna(how='any')

    # Make some columns name readable
    df.rename(columns={'the_geom': 'geometry', 'spc_common': 'species', 'boroname': 'borough'}, inplace=True)

    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)

    return gdf

In [10]:
tree = clean_tree_data('data/tree_data.csv')
tree.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 651235 entries, 0 to 683787
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   tree_id    651235 non-null  int64   
 1   geometry   651235 non-null  geometry
 2   status     651235 non-null  object  
 3   health     651235 non-null  object  
 4   species    651235 non-null  object  
 5   zipcode    651235 non-null  object  
 6   borough    651235 non-null  object  
 7   latitude   651235 non-null  float64 
 8   longitude  651235 non-null  float64 
dtypes: float64(2), geometry(1), int64(1), object(5)
memory usage: 49.7+ MB


### 1.2.3 Zillow Data Cleaning & Filtering

In [11]:
def clean_zillow_data(datafile):
    import pandas as pd

    df = pd.read_csv(datafile, low_memory=False)

    df = df[df['City'] == 'New York']

    # Select the 'RegionName' and 'CountyName' columns and store them in df1
    df1 = df[['RegionName', 'CountyName']]
    # Select all columns from '2018-09-30' onwards and store them in df2
    df2 = df.loc[:, '2018-09-30':]

    # Concatenate df1 and df2 along the columns (axis=1)
    df = pd.concat([df1, df2], axis=1)

    # Make columns' name more readable
    df.rename(columns={'RegionName': 'zipcode', 'CountyName': 'county'}, inplace=True)

    # Convert the 'zipcode' column to a string data type
    df['zipcode'] = df['zipcode'].astype(str)

    return df

In [12]:
zillow = clean_zillow_data('data/zillow_rent_data.csv')
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 145 entries, 4 to 6721
Data columns (total 63 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   zipcode     145 non-null    object 
 1   county      145 non-null    object 
 2   2018-09-30  76 non-null     float64
 3   2018-10-31  76 non-null     float64
 4   2018-11-30  76 non-null     float64
 5   2018-12-31  75 non-null     float64
 6   2019-01-31  77 non-null     float64
 7   2019-02-28  77 non-null     float64
 8   2019-03-31  79 non-null     float64
 9   2019-04-30  79 non-null     float64
 10  2019-05-31  79 non-null     float64
 11  2019-06-30  79 non-null     float64
 12  2019-07-31  79 non-null     float64
 13  2019-08-31  79 non-null     float64
 14  2019-09-30  79 non-null     float64
 15  2019-10-31  79 non-null     float64
 16  2019-11-30  79 non-null     float64
 17  2019-12-31  79 non-null     float64
 18  2020-01-31  81 non-null     float64
 19  2020-02-29  81 non-null     

### 1.2.4 Zipcode data Cleaning & Filtering

In [13]:
def clean_zipcode_data(datafile):
    import geopandas as gpd

    gdf = gpd.read_file(datafile)
    
    gdf.crs = 'EPSG:4326'
    
    gdf = gdf.to_crs('EPSG:4326')
    #Select specific columns
    gdf = gdf[['ZIPCODE', 'POPULATION', 'geometry' ]]

    gdf.rename(columns={'ZIPCODE':'zipcode', 'POPULATION':'population'}, inplace=True)

    return gdf

In [102]:
zipcode = clean_zipcode_data('data/nyc_zipcodes.shp')
zipcode.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   zipcode     263 non-null    object  
 1   population  263 non-null    float64 
 2   geometry    263 non-null    geometry
dtypes: float64(1), geometry(1), object(1)
memory usage: 6.3+ KB


Unnamed: 0,zipcode,population,geometry
0,11436,18681.0,"POLYGON ((1038098.25187 188138.38001, 1038141...."
1,11213,62426.0,"POLYGON ((1001613.71296 186926.43952, 1002314...."
2,11212,83866.0,"POLYGON ((1011174.27554 183696.33771, 1011373...."
3,11225,56527.0,"POLYGON ((995908.36545 183617.61280, 996522.84..."
4,11218,72280.0,"POLYGON ((991997.11343 176307.49586, 992042.79..."
...,...,...,...
258,10310,25003.0,"POLYGON ((950767.50659 172848.96866, 950787.51..."
259,11693,11052.0,"POLYGON ((1028453.99491 167153.40984, 1027813...."
260,11249,28481.0,"POLYGON ((995877.31827 203206.07494, 995968.51..."
261,10162,0.0,"POLYGON ((997731.76075 219560.92215, 997641.94..."


# Storing Data

In [15]:
!createdb FINAL_PROJECT
!psql --dbname FINAL_PROJECT -c 'CREATE EXTENSION postgis;'
!psql --dbname FINAL_PROJECT -f schema.sql

createdb: error: database creation failed: ERROR:  database "FINAL_PROJECT" already exists
ERROR:  extension "postgis" already exists
psql:schema.sql:7: ERROR:  type "string" does not exist
LINE 3:                 neighborhood String,
                                     ^
psql:schema.sql:18: ERROR:  type "string" does not exist
LINE 4:                 complaint_type String,
                                       ^
psql:schema.sql:29: ERROR:  type "string" does not exist
LINE 3:                 species String,
                                ^
psql:schema.sql:35: ERROR:  relation "average_rents" already exists


In [81]:
import psycopg2

In [90]:
conn = psycopg2.connect("dbname=FINAL_PROJECT user=postgres password=123456")
cur = conn.cursor()

In [103]:
create_nyc_zipcodes_table = """
    CREATE TABLE IF NOT EXISTS nyc_zip_codes (
        id INTEGER PRIMARY KEY,
        zipcode INTEGER NOT NULL,
        population INTEGER,
        geometry POINT
    );
"""

In [104]:
create_311_complaints_table = """
    CREATE TABLE IF NOT EXISTS complaints311 (
        id INTEGER PRIMARY KEY,
        created_date DATE,
        closed_date DATE,
        agency VARCHAR,
        complaint_type VARCHAR,
        descriptor VARCHAR,
        location_type VARCHAR,
        incident_zip INTEGER,
        latitude FLOAT,
        longitude FLOAT,
        borough VARCHAR,
        geometry POINT
    );
"""

In [117]:
create_trees_table = """
    CREATE TABLE IF NOT EXISTS tree_census_2015 (
        id INTEGER PRIMARY KEY,
        geometry geometry(Geometry, 4326),
        status VARCHAR,
        health VARCHAR,
        species VARCHAR,
        zipcode INTEGER,
        borough VARCHAR,
        latitude FLOAT,
        longitude FLOAT
    );
"""

In [120]:
create_zillow_table = """
    CREATE TABLE IF NOT EXISTS zillow_rents(
        id INTEGER PRIMARY KEY,
        zipcode INTEGER,
        county VARCHAR,
        other_columns FLOAT
    );
"""

In [118]:
import psycopg2
from psycopg2 import extras
from shapely import wkt  # Import the Well-Known Text (WKT) module from Shapely

# Assuming you have a function clean_tree_data that reads and processes the CSV
gdf = clean_tree_data('data/tree_data.csv')

# Connect to PostgreSQL
conn = psycopg2.connect("dbname=FINAL_PROJECT user=postgres password=123456")
cur = conn.cursor()

# Define the table schema
create_trees_table = """
    CREATE TABLE IF NOT EXISTS tree_census_2015 (
        id INTEGER PRIMARY KEY,
        geometry geometry(Point, 4326),
        status VARCHAR,
        health VARCHAR,
        species VARCHAR,
        zipcode INTEGER,
        borough VARCHAR,
        latitude FLOAT,
        longitude FLOAT
    );
"""
cur.execute(create_trees_table)
conn.commit()
gdf=clean_tree_data('data/tree_data.csv')
# Prepare data for insertion
data_to_insert = [(wkt.dumps(row.geometry), row.status, row.health, row.species, row.zipcode, row.borough, row.latitude, row.longitude)
                  for row in gdf.itertuples(index=False)]

# Insert data into the table
insert_query = """
    INSERT INTO tree_census_2015 (geometry, status, health, species, zipcode, borough, latitude, longitude)
    VALUES (ST_GeomFromText(%s, 4326), %s, %s, %s, %s,%s, %s, %s);
"""
extras.execute_values(cur, insert_query, data_to_insert, template=None, page_size=100)

# Commit the changes
conn.commit()

# Close the connection
conn.close()

ValueError: the query contains more than one '%s' placeholder

In [119]:
import psycopg2
import pandas as pd
import psycopg2.extras as extras
import numpy as np


def insert_data_into_postgres(conn, df, table):
    tuples = [tuple(x) for x in df.to_numpy()]
    cols = ','.join(list(df.columns))
    placeholders = ','.join(['%s'] * len(df.columns))

    query = f"INSERT INTO {table} ({cols}) VALUES ({placeholders})"
    cursor = conn.cursor()

    try:
        extras.execute_values(cursor, query, tuples)
        conn.commit()
        print("The DataFrame is inserted.")
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error:", error)
        conn.rollback()
    finally:
        cursor.close()


conn = psycopg2.connect("dbname=FINAL_PROJECT user=postgres password=123456")

df = clean_tree_data('data/tree_data.csv')

insert_data_into_postgres(conn, df, 'tree_census_2015')

# Close the connection
conn.close()

Error: the query contains more than one '%s' placeholder


In [101]:
import psycopg2
from psycopg2 import extras
conn = psycopg2.connect("dbname=FINAL_PROJECT user=postgres password=123456")
cur = conn.cursor()
create_zillow_table = """
    CREATE TABLE IF NOT EXISTS zillow_rents(
        id INTEGER PRIMARY KEY,
        zipcode INTEGER,
        county VARCHAR,
        other_columns FLOAT
    );
"""



SyntaxError: syntax error at or near "\"
LINE 1: \COPY zillow_rents(zipcode, county, other_columns) FROM 'dat...
        ^


In [107]:
cur.execute(create_nyc_zipcodes_table)
cur.execute(create_311_complaints_table)
cur.execute(create_trees_table)
cur.execute(create_zillow_table)
conn.commit()

# Understanding Data

# Visualizing Data