In [1]:
import pandas as pd
import requests
import geopandas as gpd
from shapely.geometry import Point

In [2]:
app_token = "Rl5BUiRawpr4H2LA9OQeKB47L"

# 1 Data Preprocessing

## 1.1 Download 311 data and 2015 tree cencus data

In [3]:
from datetime import datetime, timedelta

def generate_month_ranges(start_date, end_date):
    current = start_date
    temp = 0
    while current < end_date:
        temp += 1
        month_end = current.replace(day=1) + timedelta(days=32)
        month_end = month_end.replace(day=1) - timedelta(days=1) # 9 30
        yield (current, month_end)
        current = month_end + timedelta(days=1)
        if temp == 100:
            break

#Set start date and end ate
start_date = datetime(2018, 10, 1)
end_date = datetime(2023, 9, 30)

#Generate month's range
month_ranges = list(generate_month_ranges(start_date, end_date))

In [4]:
import os

for start, end in month_ranges:
    year = start.year
    csv_file = f"data/complaints_data_{year}.csv"
    print(f"Downloading data for the period: {start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}")

    query = f"created_date between '{start.strftime('%Y-%m-%dT%H:%M:%S')}' and '{end.strftime('%Y-%m-%dT%H:%M:%S')}'"
    response = requests.get(
        url="https://data.cityofnewyork.us/resource/erm2-nwe9.json",
        params={"$$app_token": app_token, "$where": query, "$limit": 999999,
                "$select": "unique_key, created_date, closed_date, agency,  complaint_type, descriptor, location_type, incident_zip, latitude, longitude, borough"
                }
    )
    data = response.json()
    batch_df = pd.DataFrame(data)

    # 根据年份写入或追加到对应的CSV文件
    mode = 'a' if os.path.exists(csv_file) else 'w'
    batch_df.to_csv(csv_file, mode=mode, index=False, header=not os.path.exists(csv_file))

print("Data download complete.")

Downloading data for the period: 2018-10-01 to 2018-10-31
Downloading data for the period: 2018-11-01 to 2018-11-30
Downloading data for the period: 2018-12-01 to 2018-12-31
Downloading data for the period: 2019-01-01 to 2019-01-31
Downloading data for the period: 2019-02-01 to 2019-02-28
Downloading data for the period: 2019-03-01 to 2019-03-31
Downloading data for the period: 2019-04-01 to 2019-04-30
Downloading data for the period: 2019-05-01 to 2019-05-31
Downloading data for the period: 2019-06-01 to 2019-06-30
Downloading data for the period: 2019-07-01 to 2019-07-31
Downloading data for the period: 2019-08-01 to 2019-08-31
Downloading data for the period: 2019-09-01 to 2019-09-30
Downloading data for the period: 2019-10-01 to 2019-10-31
Downloading data for the period: 2019-11-01 to 2019-11-30
Downloading data for the period: 2019-12-01 to 2019-12-31
Downloading data for the period: 2020-01-01 to 2020-01-31
Downloading data for the period: 2020-02-01 to 2020-02-29
Downloading da

In [5]:
tree_data = requests.get(url="https://data.cityofnewyork.us/resource/5rq2-4hqu.json",
                         params={"$$app_token": app_token, "$limit": 99999999999999999999}).json()
tree_df = pd.DataFrame(tree_data)
tree_df.to_csv("data/tree.csv")

## 1.2 Data Cleaning & Filtering

### 1.2.1 311 Data Cleaning & Filtering

In [32]:
def clean_311_data(datafile):
    import pandas as pd
    import numpy as np
    import datetime
    import geopandas as gpd
    from shapely.geometry import Point
    
    #Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0])
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    #Read the file
    df = pd.read_csv(datafile, low_memory=False)

    #fix the zip
    df['incident_zip'] = df['incident_zip'].apply(fix_zip)

    df = df.dropna(how='any')

    #get rid of unspecified boroughs
    df = df[df['borough'] != 'Unspecified']

    df['latitude'] = df['latitude'].astype('float64')
    df['longitude'] = df['longitude'].astype('float64')

    #Converts the 'closed_date','created_date' column into a datetime object
    df['created_date'] = pd.to_datetime(df['created_date'])
    df['closed_date'] = pd.to_datetime(df['closed_date'])
    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)
    gdf = gdf.reset_index(drop=True)
    
    return gdf

### 1.2.2 2015 Tree census Data Cleaning & Filtering

In [4]:
tree = pd.read_csv('data/tree.csv', usecols=['tree_id', 'the_geom',   'spc_common', 'status', 'health', 'zipcode', 'boroname', 'latitude', 'longitude'])
tree.to_csv('data/tree_data.csv',index=False)

In [5]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point


def clean_tree_data(datafile):

    # Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0]) 
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    # Read the file
    df = pd.read_csv(datafile)

    # Fix the zip
    df['zipcode'] = df['zipcode'].apply(fix_zip)

    df = df.dropna(how='any')

    # Make some columns name readable
    df.rename(columns={'the_geom': 'geometry', 'spc_common': 'species', 'boroname': 'borough'}, inplace=True)

    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)
    gdf = gdf.reset_index(drop=True)


    return gdf

In [6]:
tree = clean_tree_data('data/tree_data.csv')
tree.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 651235 entries, 0 to 683787
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   tree_id    651235 non-null  int64   
 1   geometry   651235 non-null  geometry
 2   status     651235 non-null  object  
 3   health     651235 non-null  object  
 4   species    651235 non-null  object  
 5   zipcode    651235 non-null  object  
 6   borough    651235 non-null  object  
 7   latitude   651235 non-null  float64 
 8   longitude  651235 non-null  float64 
dtypes: float64(2), geometry(1), int64(1), object(5)
memory usage: 49.7+ MB


### 1.2.3 Zillow Data Cleaning & Filtering

In [44]:
def clean_zillow_data(datafile):
    import pandas as pd

    df = pd.read_csv(datafile, low_memory=False)

    df = df[df['City'] == 'New York']

    # Select the 'RegionName' and 'CountyName' columns and store them in df1
    df1 = df[['RegionName', 'CountyName']]
    # Select all columns from '2018-09-30' onwards and store them in df2
    df2 = df.loc[:, '2018-09-30':]

    # Concatenate df1 and df2 along the columns (axis=1)
    df = pd.concat([df1, df2], axis=1)

    # Make columns' name more readable
    df.rename(columns={'RegionName': 'zipcode', 'CountyName': 'county'}, inplace=True)

    # Convert the 'zipcode' column to a string data type
    df['zipcode'] = df['zipcode'].astype(str)

    df = df.melt(id_vars=["zipcode", "county"], var_name="date")
    
    df = df.reset_index(drop=True)

    return df

In [45]:
zillow = clean_zillow_data('data/zillow_rent_data.csv')
zillow.head()

Unnamed: 0,zipcode,county,date,value
0,11385,Queens County,2018-09-30,2401.525193
1,11208,Kings County,2018-09-30,
2,11236,Kings County,2018-09-30,
3,10467,Bronx County,2018-09-30,1773.839053
4,11373,Queens County,2018-09-30,


### 1.2.4 Zipcode data Cleaning & Filtering

In [52]:
def clean_zipcode_data(datafile):
    import geopandas as gpd

    gdf = gpd.read_file(datafile)
    
    gdf.crs = 'EPSG:4326'
    
    gdf = gdf.to_crs('EPSG:4326')
    #Select specific columns
    gdf = gdf[['ZIPCODE', 'POPULATION', 'geometry' ]]

    gdf.rename(columns={'ZIPCODE':'zipcode', 'POPULATION':'population'}, inplace=True)
    gdf = gdf.reset_index(drop=True)
    

    return gdf

In [53]:
zipcode = clean_zipcode_data('data/nyc_zipcodes.shp')
zipcode.info()


<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   zipcode     263 non-null    object  
 1   population  263 non-null    float64 
 2   geometry    263 non-null    geometry
dtypes: float64(1), geometry(1), object(1)
memory usage: 6.3+ KB


# 2 Storing Data

## 2.1 Creating database

In [11]:
!createdb FINAL_PROJECT
!psql --dbname FINAL_PROJECT -c 'CREATE EXTENSION postgis;'
!psql --dbname FINAL_PROJECT -f schema.sql

'createdb' is not recognized as an internal or external command,
operable program or batch file.
'psql' is not recognized as an internal or external command,
operable program or batch file.
'psql' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
import psycopg2

In [4]:
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

## 2.2 Creating tables

In [62]:
create_nyc_zipcodes_table = """
    CREATE TABLE IF NOT EXISTS nyc_zip_codes (
        id INTEGER PRIMARY KEY,
        zipcode INTEGER NOT NULL,
        population INTEGER,
        geometry geometry(Geometry, 4326)
    );
"""

In [7]:
create_trees_table = """
    CREATE TABLE IF NOT EXISTS tree_census_2015 (
        id INTEGER PRIMARY KEY,
        geometry geometry(Geometry, 4326),
        status VARCHAR,
        health VARCHAR,
        species VARCHAR,
        zipcode INTEGER,
        borough VARCHAR,
        latitude FLOAT,
        longitude FLOAT
    );
"""

In [46]:
create_zillow_table = """
    CREATE TABLE IF NOT EXISTS zillow_rents(
        id INTEGER PRIMARY KEY,
        zipcode INTEGER,
        county VARCHAR,
        date DATE,
        value FLOAT
    );
"""

In [65]:
import psycopg2

conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

In [66]:
cur.execute(create_nyc_zipcodes_table)
cur.execute(create_trees_table)
cur.execute(create_zillow_table)
conn.commit()

In [40]:
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

#Creating 311 tables by years
for year in range(2018, 2024):
    table_name = f"complaints311_{year}"
    create_table_query = f"""
        CREATE TABLE IF NOT EXISTS {table_name} (
            id INTEGER PRIMARY KEY,
            created_date DATE,
            closed_date DATE,
            agency VARCHAR,
            complaint_type VARCHAR,
            descriptor VARCHAR,
            location_type VARCHAR,
            incident_zip INTEGER,
            latitude FLOAT,
            longitude FLOAT,
            borough VARCHAR,
            geometry geometry(Geometry, 4326)
        );
    """
    
    cur.execute(create_table_query)


conn.commit()

## 2.3 Inserting data

### 2.3.1 Inserting tree data

In [36]:
import psycopg2
from shapely import wkt  # Import the Well-Known Text (WKT) module from Shapely

# Assuming you have a function clean_tree_data that reads and processes the CSV
gdf = clean_tree_data('data/tree_data.csv')

# Connect to PostgreSQL
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

# Define the table schema
create_trees_table = """
    CREATE TABLE IF NOT EXISTS tree_census_2015 (
        id INTEGER PRIMARY KEY,
        geometry geometry(Point, 4326),
        status VARCHAR,
        health VARCHAR,
        species VARCHAR,
        zipcode INTEGER,
        borough VARCHAR,
        latitude FLOAT,
        longitude FLOAT
    );
"""
cur.execute(create_trees_table)
conn.commit()
gdf=clean_tree_data('data/tree_data.csv')
# Prepare data for insertion
data_to_insert = [
    (row.Index, wkt.dumps(row.geometry), row.status, row.health, row.species, row.zipcode, row.borough, row.latitude, row.longitude)
                  for row in gdf.itertuples()]

# Insert data into the table
insert_query = """
    INSERT INTO tree_census_2015 (id, geometry, status, health, species, zipcode, borough, latitude, longitude)
    VALUES (%s, ST_GeomFromText(%s, 4326), %s, %s, %s, %s,%s, %s, %s);
"""
cur.executemany(insert_query, data_to_insert)

# Commit the changes
conn.commit()

# Close the connection
conn.close()

UniqueViolation: 错误:  重复键违反唯一约束"tree_census_2015_pkey"
DETAIL:  键值"(id)=(0)" 已经存在


### 2.3.2 Inserting 311 data

In [41]:
import psycopg2
from shapely import wkt

# Connect to the PostgreSQL database
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

for year in range(2018, 2024):
    # Clean the data
    complaints = clean_311_data(f"data/complaints_data_{year}.csv")


    print(year)

    # Prepare the SQL statement for data insertion, dynamically updating the table name
    insert_query = f"""
        INSERT INTO complaints311_{year} (id, created_date, closed_date, agency, complaint_type, descriptor, location_type, incident_zip, latitude, longitude, borough, geometry)
        VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, ST_GeomFromText(%s, 4326));
    """

    # Prepare the data to be inserted
    data_to_insert = [
        (row.Index, row.created_date, row.closed_date, row.agency, row.complaint_type, row.descriptor, row.location_type, row.incident_zip, row.latitude, row.longitude, row.borough, wkt.dumps(row.geometry))
        for row in complaints.itertuples()]

    # Execute the insertion
    cur.executemany(insert_query, data_to_insert)

    # Commit the changes
    conn.commit()

# Close the connection
conn.close()


2018
2019
2020
2021
2022
2023


### 2.3.3 Inserting zillow data

In [51]:
# Connect to PostgreSQL
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

# Prepare data for insertion
data_to_insert = [
    (row.Index,  row.zipcode, row.county, row.date, row.value)
    for row in zillow.itertuples()]

# Insert data into the table
insert_query = """
    INSERT INTO zillow_rents (id,  zipcode, county, date, value)
    VALUES ( %s, %s,%s, %s, %s);
"""
cur.executemany(insert_query, data_to_insert)

# Commit the changes
conn.commit()

# Close the connection
conn.close()

### 2.3.4 Inserting zipcodes data

In [67]:
# Connect to PostgreSQL
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

# Prepare data for insertion
data_to_insert = [
    (row.Index,  row.zipcode, row.population, wkt.dumps(row.geometry) )
    for row in zipcode.itertuples()]

# Insert data into the table
insert_query = """
    INSERT INTO nyc_zip_codes (id,  zipcode, population, geometry)
    VALUES ( %s, %s,%s, ST_GeomFromText(%s, 4326));
"""
cur.executemany(insert_query, data_to_insert)

# Commit the changes
conn.commit()

# Close the conn
conn.close()

# Understanding Data

# Visualizing Data