In [1]:
import pandas as pd
import requests
import geopandas as gpd
from shapely.geometry import Point

In [2]:
app_token = "Rl5BUiRawpr4H2LA9OQeKB47L"

# 1 Data Preprocessing

## 1.1 Download 311 data and 2015 tree cencus data

In [24]:
from datetime import datetime, timedelta

def generate_month_ranges(start_date, end_date):
    current = start_date
    temp = 0
    while current < end_date:
        temp += 1
        month_end = current.replace(day=1) + timedelta(days=32)
        month_end = month_end.replace(day=1) - timedelta(days=1) # 9 30
        yield (current, month_end)
        current = month_end + timedelta(days=1)
        if temp == 100:
            break

#Set start date and end ate
start_date = datetime(2018, 10, 1)
end_date = datetime(2023, 9, 30)

#Generate month's range
month_ranges = list(generate_month_ranges(start_date, end_date))

initial: 2018-10-01 00:00:00


In [25]:
import os

for start, end in month_ranges:
    year = start.year
    csv_file = f"data/complaints_data_{year}.csv"
    print(f"Downloading data for the period: {start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}")

    query = f"created_date between '{start.strftime('%Y-%m-%dT%H:%M:%S')}' and '{end.strftime('%Y-%m-%dT%H:%M:%S')}'"
    response = requests.get(
        url="https://data.cityofnewyork.us/resource/erm2-nwe9.json",
        params={"$$app_token": app_token, "$where": query, "$limit": 999999,
                "$select": "unique_key, created_date, closed_date, agency,  complaint_type, descriptor, location_type, incident_zip, latitude, longitude, borough"
                }
    )
    data = response.json()
    batch_df = pd.DataFrame(data)

    # 根据年份写入或追加到对应的CSV文件
    mode = 'a' if os.path.exists(csv_file) else 'w'
    batch_df.to_csv(csv_file, mode=mode, index=False, header=not os.path.exists(csv_file))

print("Data download complete.")

Downloading data for the period: 2018-10-01 to 2018-10-31
Downloading data for the period: 2018-11-01 to 2018-11-30
Downloading data for the period: 2018-12-01 to 2018-12-31
Downloading data for the period: 2019-01-01 to 2019-01-31
Downloading data for the period: 2019-02-01 to 2019-02-28
Downloading data for the period: 2019-03-01 to 2019-03-31
Downloading data for the period: 2019-04-01 to 2019-04-30
Downloading data for the period: 2019-05-01 to 2019-05-31
Downloading data for the period: 2019-06-01 to 2019-06-30
Downloading data for the period: 2019-07-01 to 2019-07-31
Downloading data for the period: 2019-08-01 to 2019-08-31
Downloading data for the period: 2019-09-01 to 2019-09-30
Downloading data for the period: 2019-10-01 to 2019-10-31
Downloading data for the period: 2019-11-01 to 2019-11-30
Downloading data for the period: 2019-12-01 to 2019-12-31
Downloading data for the period: 2020-01-01 to 2020-01-31
Downloading data for the period: 2020-02-01 to 2020-02-29
Downloading da

In [13]:
tree_data = requests.get(url="https://data.cityofnewyork.us/resource/5rq2-4hqu.json",
                         params={"$$app_token": app_token, "$limit": 99999999999999999999}).json()
tree_df = pd.DataFrame(tree_data)
tree_df.to_csv("data/tree.csv")

## 1.2 Data Cleaning & Filtering

### 1.2.1 311 Data Cleaning & Filtering

In [3]:
def clean_311_data(datafile):
    import pandas as pd
    import numpy as np
    import datetime
    import geopandas as gpd
    from shapely.geometry import Point
    
    #Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0])
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    #Read the file
    df = pd.read_csv(datafile, low_memory=False)

    #fix the zip
    df['incident_zip'] = df['incident_zip'].apply(fix_zip)

    df = df.dropna(how='any')

    #get rid of unspecified boroughs
    df = df[df['borough'] != 'Unspecified']

    df['latitude'] = df['latitude'].astype('float64')
    df['longitude'] = df['longitude'].astype('float64')

    #Converts the 'closed_date','created_date' column into a datetime object
    df['created_date'] = pd.to_datetime(df['created_date'])
    df['closed_date'] = pd.to_datetime(df['closed_date'])
    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)

    return gdf

In [4]:
complaints_2018 = clean_311_data('data/complaints_data_2018.csv')
complaints_2018.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 338343 entries, 0 to 454962
Data columns (total 12 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   unique_key      338343 non-null  int64         
 1   created_date    338343 non-null  datetime64[ns]
 2   closed_date     338343 non-null  datetime64[ns]
 3   agency          338343 non-null  object        
 4   complaint_type  338343 non-null  object        
 5   descriptor      338343 non-null  object        
 6   location_type   338343 non-null  object        
 7   incident_zip    338343 non-null  object        
 8   latitude        338343 non-null  float64       
 9   longitude       338343 non-null  float64       
 10  borough         338343 non-null  object        
 11  geometry        338343 non-null  geometry      
dtypes: datetime64[ns](2), float64(2), geometry(1), int64(1), object(6)
memory usage: 33.6+ MB


### 1.2.2 2015 Tree census Data Cleaning & Filtering

In [6]:
tree = pd.read_csv('data/tree.csv', usecols=['tree_id', 'the_geom',   'spc_common', 'status', 'health', 'zipcode', 'boroname', 'latitude', 'longitude'])
tree.to_csv('data/tree_data.csv',index=False)

In [7]:
def clean_tree_data(datafile):
    import pandas as pd
    import numpy as np
    import geopandas as gpd
    from shapely.geometry import Point

    # Add fix_zip function
    def fix_zip(input_zip):
        try:
            input_zip = int(float(input_zip))
        except:
            try:
                input_zip = int(input_zip.split('-')[0]) 
            except:
                return np.NaN
        if input_zip < 10000 or input_zip > 12000:
            return np.NaN
        return str(input_zip)

    # Read the file
    df = pd.read_csv(datafile)

    # Fix the zip
    df['zipcode'] = df['zipcode'].apply(fix_zip)

    df = df.dropna(how='any')

    # Make some columns name readable
    df.rename(columns={'the_geom': 'geometry', 'spc_common': 'species', 'boroname': 'borough'}, inplace=True)

    # Convert longitude and latitude to a 'geometry' column for geopandas
    df['geometry'] = df.apply(lambda row: Point(row['longitude'], row['latitude']), axis=1)

    # Create a GeoDataFrame and set the coordinate reference system (CRS) to WGS84 (EPSG:4326)
    gdf = gpd.GeoDataFrame(df, geometry='geometry')
    gdf.set_crs(epsg=4326, inplace=True)

    return gdf

In [8]:
tree = clean_tree_data('data/tree_data.csv')
tree.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Index: 651235 entries, 0 to 683787
Data columns (total 9 columns):
 #   Column     Non-Null Count   Dtype   
---  ------     --------------   -----   
 0   tree_id    651235 non-null  int64   
 1   geometry   651235 non-null  geometry
 2   status     651235 non-null  object  
 3   health     651235 non-null  object  
 4   species    651235 non-null  object  
 5   zipcode    651235 non-null  object  
 6   borough    651235 non-null  object  
 7   latitude   651235 non-null  float64 
 8   longitude  651235 non-null  float64 
dtypes: float64(2), geometry(1), int64(1), object(5)
memory usage: 49.7+ MB


### 1.2.3 Zillow Data Cleaning & Filtering

In [9]:
def clean_zillow_data(datafile):
    import pandas as pd

    df = pd.read_csv(datafile, low_memory=False)

    df = df[df['City'] == 'New York']

    # Select the 'RegionName' and 'CountyName' columns and store them in df1
    df1 = df[['RegionName', 'CountyName']]
    # Select all columns from '2018-09-30' onwards and store them in df2
    df2 = df.loc[:, '2018-09-30':]

    # Concatenate df1 and df2 along the columns (axis=1)
    df = pd.concat([df1, df2], axis=1)

    # Make columns' name more readable
    df.rename(columns={'RegionName': 'zipcode', 'CountyName': 'county'}, inplace=True)

    # Convert the 'zipcode' column to a string data type
    df['zipcode'] = df['zipcode'].astype(str)

    return df

In [10]:
zillow = clean_zillow_data('data/zillow_rent_data.csv')
zillow.info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 4 to 6721
Data columns (total 63 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   zipcode     145 non-null    object 
 1   county      145 non-null    object 
 2   2018-09-30  76 non-null     float64
 3   2018-10-31  76 non-null     float64
 4   2018-11-30  76 non-null     float64
 5   2018-12-31  75 non-null     float64
 6   2019-01-31  77 non-null     float64
 7   2019-02-28  77 non-null     float64
 8   2019-03-31  79 non-null     float64
 9   2019-04-30  79 non-null     float64
 10  2019-05-31  79 non-null     float64
 11  2019-06-30  79 non-null     float64
 12  2019-07-31  79 non-null     float64
 13  2019-08-31  79 non-null     float64
 14  2019-09-30  79 non-null     float64
 15  2019-10-31  79 non-null     float64
 16  2019-11-30  79 non-null     float64
 17  2019-12-31  79 non-null     float64
 18  2020-01-31  81 non-null     float64
 19  2020-02-29  81 non-null     float

### 1.2.4 Zipcode data Cleaning & Filtering

In [6]:
def clean_zipcode_data(datafile):
    import geopandas as gpd

    gdf = gpd.read_file(datafile)
    
    gdf.crs = 'EPSG:4326'
    
    gdf = gdf.to_crs('EPSG:4326')
    #Select specific columns
    gdf = gdf[['ZIPCODE', 'POPULATION', 'geometry' ]]

    gdf.rename(columns={'ZIPCODE':'zipcode', 'POPULATION':'population'}, inplace=True)

    return gdf

In [7]:
zipcode = clean_zipcode_data('data/nyc_zipcodes.shp')
zipcode.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 263 entries, 0 to 262
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   zipcode     263 non-null    object  
 1   population  263 non-null    float64 
 2   geometry    263 non-null    geometry
dtypes: float64(1), geometry(1), object(1)
memory usage: 6.3+ KB


# Storing Data

In [3]:
!createdb FINAL_PROJECT
!psql --dbname FINAL_PROJECT -c 'CREATE EXTENSION postgis;'

'createdb' is not recognized as an internal or external command,
operable program or batch file.
'psql' is not recognized as an internal or external command,
operable program or batch file.


In [21]:
import psycopg2

In [22]:
conn = psycopg2.connect("dbname=final_project user=postgres password=123456")
cur = conn.cursor()

In [23]:
with open('schema.sql', 'w') as f:
    command = """
        CREATE TABLE zip_codes (
            zip_code Integer PRIMARY KEY,
            neighborhood TEXT,
            borough TEXT,
            geometry GEOMETRY(Point, 4326)
        );

        CREATE TABLE complaints (
            complaint_id Integer PRIMARY KEY,
            created_date TIMESTAMP,
            complaint_type TEXT,
            descriptor TEXT,
            zip_code Integer REFERENCES zip_codes(zip_code),
            latitude FLOAT,
            longitude FLOAT,
            geometry GEOMETRY(Point, 4326)
        );

        CREATE TABLE trees (
            tree_id Integer PRIMARY KEY,
            species TEXT,
            diameter_inches FLOAT,
            health TEXT,
            zip_code Integer REFERENCES zip_codes(zip_code),
            latitude FLOAT,
            longitude FLOAT,
            geometry GEOMETRY(Point, 4326)
        );

        CREATE TABLE average_rents (
            zip_code Integer PRIMARY KEY,
            year INT,
            avg_rent DECIMAL
        );
    """
    f.write(command)

In [24]:
cur.execute(command)
conn.commit()

# Understanding Data

# Visualizing Data