In [9]:
#Helper Functions for Data processing:

import pandas as pd
import numpy as np
import geopandas as gpd
import dask 
import numba
import libpysal as lp
from rasterstats import zonal_stats
import os
import rasterio
from pathlib import Path
import json
from rasterio.mask import mask





In [8]:
def read_spatial(path, espg_code):
    '''
    Function to read spatial data and converts to ESPG 3435
    Input: path to the data file
            epgs_code (string): epsg code as a string
    Output: a gpd object 
    '''
    epsg = "EPSG:" + espg_code
    file=gpd.read_file(path)
    file=file.to_crs(epsg)
    print(path, file.crs)
    return file

In [290]:
#GET TOY TEST DATA:
#com_areas = read_spatial("data/com_areas_chi", "32616")
#bike_rack_points = pd.read_csv('data/Bike_Racks.csv')
#bike_rack_points = bike_rack_points.rename(columns = {"LOCATION":"geometry"})
#bike_racks = gpd.GeoDataFrame(
#    bike_rack_points, geometry=gpd.points_from_xy(bike_rack_points.Longitude,
#                                                  bike_rack_points.Latitude), 
#    crs="EPSG:4326")
#bike_racks = bike_racks.to_crs("EPSG:32616")
#bike_racks = bike_racks[["RackID", "Address", "F12", "F13", "geometry"]]
#bike_racks["F12"] = bike_racks["F12"].round(1)
#bike_racks["F13"] = bike_racks["F13"].round(1)
#base = com_areas.plot(color='white', edgecolor='black')
#bike_racks.plot(ax=base, marker='o', color='red', markersize=5);

In [5]:
def conduct_point_to_polygon(polygon_data, poly_unique_id, other_file):
    '''
    A function that creates a new dataset with spatial join of pyolgon data to points
    Inputs:
        polygon_data (gpd): polygon data (i.e. Community Areas)
        poly_unique_id (list of strings): the unique identifiers of the polygon spatial data
        other_file (gpd): another point-based gpd. Data from this will be aggregated 
                          to the polygon scale
        agg_dict (dict keys: strings - col names in point data
                       values: strings or list of strings - how to aggregate)
                       
    Output:
        the polygon_data dataframe updated with the new column
    Note: .size suggestion from here: https://stackoverflow.com/questions/19384532/get-statistics-for-each-group-such-as-count-mean-etc-using-pandas-groupby
    '''
    spatial_join = gpd.sjoin(other_file, 
                             polygon_data[poly_unique_id + ["geometry"]], 
                             how="inner", 
                             op='intersects')
    
    return spatial_join
#USE CASE: 
#INPUT com areas and point data with all of the ND.. features
#We get the com area the point data is in


In [291]:
#Toy Test:
#bike_ag_dict = {'RackID': ['min', 'max'], 'Address': 'size'}

#com_area_w_bike = conduct_point_to_polygon(com_areas, 
#                                           ["community", "area_numbe"], 
#                                           bike_racks)
#com_area_w_bike

In [9]:
def agg_cell_data(df, cols_to_group_by, ag_dict):
    '''
    Function to aggregate point weather data the cols_to_group_by and perform aggregations
    as specificed in the aggregation_dictionary
    Inputs:
        df the dataframe
        cols_to_group_by (list of strings): columns in df
        ag_dict (dictionary of strings to list of strings): maps column names to 
                                    aggregation operations
    Outputs:
        aggregated df
    '''
    df_new = df.groupby(cols_to_group_by).agg(ag_dict)
    
    return df_new

#USE CASE: ONCE WE HAVE PREVIOUS DATA, WE GROUP BY COM AREA, PERIOD, YEAR
# THEN WE AGGREGATE: MEAN, MAX OF DIFFERENT ND.. COLS


In [292]:
#cols = ["area_numbe", "F12"]
#ag_dict1 = {"RackID":"mean", "F13": ["min", "max"]}
#bike_rack_agg = agg_cell_data(com_area_w_bike, cols_to_group_by = cols, 
#                             ag_dict = ag_dict1)
#bike_rack_agg


In [293]:
#Get raster data:

#SHARED_DATA_FOLDER = Path('tif_data_2021')

#construct SCENE NAME
# path_row = []  
# year = []
# month = []
# day = []

#SCENE = 'LC08_L2SP_022031_20210514_20210525_02_T1'

#check if scene_path is a file

#scene_path = SHARED_DATA_FOLDER/SCENE
#b1_path =scene_path/"{}_SR_B2.TIF".format(SCENE)
#b1 = rasterio.open(b1_path)
#chi_b = gpd.read_file("chi_b")
#chi_b = chi_b.to_crs("EPSG:32616")




In [82]:
#b1 = rasterio.open(b1_path)

In [99]:
#b1.crs

CRS.from_epsg(32616)

In [294]:
def clip_raster(raster, vector_poly):
    '''
    Clip raster polygon with the vecotr polygon boundary
    Input: 
        raster - rasterio object (i.e. read from rasterio.open)
        vector_poly - geopandas dataframe
    Output:
        raster object that is clipped
    '''
    vector_poly_good_crs = vector_poly.to_crs(raster.crs)
    vector_as_json = [json.loads(vector_poly.to_json())['features'][0]['geometry']]
    
    out_img, out_transform = mask(dataset=raster, 
                                  shapes=vector_as_json, 
                                  crop=True)
    return out_img, out_transform
    
    

In [199]:
new_raster, new_raster_transform = clip_raster(b1, chi_b)

In [18]:
def compute_max(list_of_arrays):
    '''
    Function that computes the elementwise max of the arrays in the list of arrays
    Inputs:
        list_of_arrays (list of 1D np arrays), note the arrays should be the same size
    Output:
        max_array (1D np array): the elementwise max from each array
    '''
    n, p = list_of_arrays[0].shape
    mx = np.zeros((n, p))  
    for a in list_of_arrays:
        mx = np.maximum(a, mx)
    
    return mx

#a = np.array([[1, 2], [4, -1]])    
#b = np.array([[-1, 5], [0, 1]])
#c = np.array([[1, 1], [1, 1]])
#d = np.array([[-1, -1], [-1, -1]])
#l = [a, b, c, d, a, b, c, d, a]
#compute_max(l)

In [14]:
compute_max([a, b, c, d])

array([[1., 5.],
       [4., 1.]])

In [10]:
#Read and set community areas up: 
com_areas = read_spatial("data/com_areas_chi", "32616")
com_areas2 = com_areas[["area_numbe", "community", "geometry"]]
#com_areas3 = com_areas2[com_areas2[:, 0].argsort()]
#com_areas2
com_areas2["number"] = pd.to_numeric(com_areas2["area_numbe"])
com_areas3 = com_areas2.sort_values(by=["number"])
com_areas3 #This is the com areas to use

com_areas_no_spatial = com_areas3[["area_numbe", "community", "number"]]

data/com_areas_chi EPSG:32616


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  super(GeoDataFrame, self).__setitem__(key, value)


In [298]:
#test = np.array(test)
#test

In [299]:
def get_W(com_areas):
    '''
    Function that takes community areas shapefile (should be sorted by area number)
    Input:
        com_areas (gpd): a community areas shapefile sorted by area number
    Output:
        W (2D np.array): the spatial weights matrix
    
    '''
    #Get pysal weights matrix:
    weights_matrix = lp.weights.Queen.from_dataframe(com_areas, idVariable='area_numbe')

    #Access their dictionary
    w_dict = weights_matrix.neighbors
    W = np.zeros((77, 77))

    for com_area, list_of_neighbors in w_dict.items():
        for neighbor in list_of_neighbors:
            W[int(com_area) -1, int(neighbor) -1] = 1

    #Standardize:
    sum_of_rows = W.sum(axis=1)
    W = W / sum_of_rows[:, np.newaxis]
    
    return W

#W = get_W(com_areas3)

In [300]:
def compute_spatial_lag(data_array, W):
    '''
    Computes the first order queen contiguity spatial lag for data in the data_array given weights matrix W
    Inputs:
        data_array (2D np.array): data matrix with rows indicating community areas (ordered by com area number)
        W (2D np.array): a spatial weights matrix of size 77 x 77
    Outputs:
        data_array (2D np.array): the data array, but with columns appended for each spatial lag
    '''
    n, p = data_array.shape
    new_data_array = data_array
    
    for i in range(p):
        col = data_array[:,i]
        lag = W @ col
        lag = lag.reshape(-1, 1)
        new_data_array = np.hstack((new_data_array, lag))
    return new_data_array 

out = compute_spatial_lag(test, W)



In [5]:
!ls 


Dask_building_data.ipynb
[34mLC08_L2SP_023031_20130904_20200912_02_T1[m[m
LC08_L2SP_023031_20130904_20200912_02_T1.tar
Land_surface_temp_calulator.py
Project_gdm.ipynb
Project_gdm1.ipynb
README.md
Untitled.ipynb
[34mchi_b[m[m
chi_b.cpg
chi_b.dbf
chi_b.prj
chi_b.qpj
chi_b.shp
chi_b.shx
[31mchi_b.zip[m[m
[34mdata[m[m
get_landsat_data.py
get_landsat_data.sbatch
time_series_split.py
util.py
utils.ipynb


In [31]:
def compute_zonal_stats(path_to_raster, vector, band_name):
    '''
    This function first converts raw bands into their acceptable ranges (see page 12 of
    the link below for more details) and then computes the zonal stats of the for the 
    inputted band for community areas. s
    
    Inputs: 
        path_to_raster (string): path to raster data
        vector (geopandas df): 
        band_name (string): name of band
    Outputs:
        nparray (1D np array): mean values of the raster data 
                             ordered by com area order
    '''
    col_name = "mean_" + band_name
    
    raster = rasterio.open(path_to_raster)
    affine = raster.transform
    raster = raster.read()[0]
    
    print(raster.shape)
    if band_name in ["b1", "b2", "b3", "b4", "b5", "b7"]:
        raster = np.where((raster > 7273) & (raster < 43537), raster, 0)
        raster = (raster * 0.0000275) -0.2
    elif band_name == "b6":
        raster = (raster * 0.0000275) -0.2
    else: #band_name = "b10"
        raster = (raster * 0.00341802) + 149
    
    
    sum_stats = zonal_stats(vector, raster, 
                            nodata = -999,
                            affine = affine,
                            stats=["mean"])
    df = pd.DataFrame(sum_stats)
    df = df.rename(columns = {"mean": col_name})
    nparray = np.array(df)
    
    return nparray

path = "LC08_L2SP_023031_20130904_20200912_02_T1/LC08_L2SP_023031_20130904_20200912_02_T1_SR_B6.TIF"
a = compute_zonal_stats(path, com_areas3, "b6")
#file = "../data_2013_2015/LC08_L2SP_022031_20130524_20200913_02_T1/LC08_L2SP_022031_20130524_20200913_02_T1_SR_B1.TIF"
#print(a.shape, a)
a

(7951, 7841)


array([[0.20772682],
       [0.19564331],
       [0.2089464 ],
       [0.1861174 ],
       [0.17212766],
       [0.18262329],
       [0.17880583],
       [0.17063834],
       [0.1775142 ],
       [0.18189693],
       [0.18034152],
       [0.16393084],
       [0.18782113],
       [0.18138435],
       [0.17451679],
       [0.17785677],
       [0.18202376],
       [0.17721729],
       [0.18172259],
       [0.1793875 ],
       [0.19032342],
       [0.18364333],
       [0.19223613],
       [0.19295021],
       [0.18443179],
       [0.19599144],
       [0.20545481],
       [0.20723008],
       [0.20627781],
       [0.19790727],
       [0.20395322],
       [0.15756092],
       [0.22431137],
       [0.20121338],
       [0.20923171],
       [0.21381369],
       [0.20723384],
       [0.2093681 ],
       [0.18655243],
       [0.20653025],
       [0.18892075],
       [0.19486871],
       [0.1875129 ],
       [0.17612655],
       [0.16781336],
       [0.17223131],
       [0.18019996],
       [0.168