In [2]:
from pathlib import Path
from src.processing import degradation_overlap
from src.data import spark_postgis
from src.data import jrc_parser
from src import constants

TypeError: expected str, bytes or os.PathLike object, not NoneType

Constants

In [None]:
# Root of the GEDI coincident-shot data on S3. All inputs below are derived
# from it so the location only has to be changed in one place.
DATA_ROOT = 's3://gfw2-data/climate/European_height_carbon_model/gedi_coincident_shot/data/'
VECTOR_ROOT = DATA_ROOT + 'vector/'

# Shots input: currently the 'dummy/' folder; switch to the commented-out
# 'shots/' folder to use the other shots location.
#shots_dir = DATA_ROOT + 'shots/'
shots_dir = DATA_ROOT + 'dummy/'

# Vector layers that are overlaid on the shots in the steps below.
ecozone_dir = VECTOR_ROOT + 'ecozone/'
gadm_dir = VECTOR_ROOT + 'country/'
sdpt_dir = VECTOR_ROOT + 'sdpt/'

ANALYSIS

Step 1: Get a Spark Session 

In [None]:
# Create the Spark session; later cells read parquet data and run SQL through `spark`.
spark = spark_postgis.get_spark()

Step 2: Create Shots Dataframe

In [None]:
# Build the shots dataframe via the project helper, passing it the shots location.
# TODO: lat and long are flipped; also make sure the geometry column naming is
# consistent (geom vs geometry).
shots_df = degradation_overlap.get_shots_df(spark, shots_dir)

In [None]:
# Preview two rows of the shots frame: first every column, then only the
# t1/t2 geometry columns.
shots_df.show(2)
shots_df.select('t1_geom', 't2_geom').show(2)

Step 3: Convert Shot Dates

In [None]:
# Replace shots_df with the output of convert_shot_dates and register it as the
# "gedi_shots" view that the SQL joins below select from.
# NOTE(review): shots_df is reassigned from itself, so re-running this cell feeds
# the already-converted frame back in — confirm convert_shot_dates tolerates that.
shots_df = jrc_parser.convert_shot_dates(shots_df)
shots_df.createOrReplaceTempView("gedi_shots")

In [None]:
# Preview the year columns for the first two shots.
shots_df.select('t1_year', 't2_year').show(2)

Step 4: Overlay Ecozone

In [None]:
#Create ecozone dataframe
def get_ecozone_df(spark, ecozone_dir):
    """Load the ecozone parquet data and expose it as the "ecozones" SQL view.

    Parameters
    ----------
    spark
        Spark session; its ``read.parquet`` and ``sql`` methods are used.
    ecozone_dir : str or path-like
        Location of the parquet data. Strings (such as the ``s3://`` URIs used
        in this notebook) are passed through unchanged; any other object is
        converted with its ``as_posix()`` method (e.g. ``pathlib.Path``).

    Returns
    -------
    The ecozone dataframe with a ``geom`` column produced by
    ``ST_GeomFromWKB(geometry)`` and the original ``geometry`` column dropped.
    The returned frame is also what the "ecozones" temp view points to.
    """
    # A plain str has no .as_posix(), and pushing an s3:// URI through pathlib
    # would collapse the double slash, so strings are used as-is.
    if isinstance(ecozone_dir, str):
        parquet_location = ecozone_dir
    else:
        parquet_location = ecozone_dir.as_posix()

    # Register the raw frame first so the SQL below can reference it by name.
    raw_df = spark.read.parquet(parquet_location)
    raw_df.createOrReplaceTempView("ecozones")

    ecozone_df = spark.sql(
        "SELECT *, ST_GeomFromWKB(geometry) AS geom FROM ecozones"
    ).drop("geometry")

    # Re-point the view at the parsed frame so later queries see `geom`.
    ecozone_df.createOrReplaceTempView("ecozones")
    return ecozone_df

# Load the ecozone polygons; get_ecozone_df also registers them as the "ecozones" SQL view.
ecozone_df = get_ecozone_df(spark, ecozone_dir)

In [None]:
# Preview the first ten ecozone rows.
ecozone_df.show(10)

In [None]:
# Attach the ecozone attributes to each shot: a shot is matched to the ecozone
# polygon that contains its t2 geometry. The INNER JOIN keeps only matched shots.
ecozone_join_query = """
    SELECT s.*, f.emisEcozon AS emisEcozon, f.gainEcozon AS gainEcozon, f.GEZ_TERM AS GEZ_TERM
    FROM gedi_shots as s INNER JOIN ecozones as f
    ON ST_Contains(f.geom, s.t2_geom)
"""

# Swap in the joined frame and refresh the "gedi_shots" view so the following
# queries see the new columns.
shots_df = spark.sql(ecozone_join_query)
shots_df.createOrReplaceTempView("gedi_shots")

In [None]:
# Preview the ecozone columns added by the join for the first two shots.
shots_df.select('emisEcozon', 'gainEcozon', 'GEZ_TERM').show(2)

Step 5: Overlay Country

In [None]:
#TODO: Update with new GADM
#Create GADM dataframe
def get_gadm_df(spark, gadm_dir):
    """Load the GADM (country) parquet data and expose it as the "gadm" SQL view.

    Parameters
    ----------
    spark
        Spark session; its ``read.parquet`` and ``sql`` methods are used.
    gadm_dir : str or path-like
        Location of the parquet data. Strings (such as the ``s3://`` URIs used
        in this notebook) are passed through unchanged; any other object is
        converted with its ``as_posix()`` method (e.g. ``pathlib.Path``).

    Returns
    -------
    The GADM dataframe with a ``geom`` column produced by
    ``ST_GeomFromWKB(geometry)`` and the original ``geometry`` column dropped.
    The returned frame is also what the "gadm" temp view points to.
    """
    # A plain str has no .as_posix(), and pushing an s3:// URI through pathlib
    # would collapse the double slash, so strings are used as-is.
    if isinstance(gadm_dir, str):
        parquet_location = gadm_dir
    else:
        parquet_location = gadm_dir.as_posix()

    # Register the raw frame first so the SQL below can reference it by name.
    raw_df = spark.read.parquet(parquet_location)
    raw_df.createOrReplaceTempView("gadm")

    gadm_df = spark.sql(
        "SELECT *, ST_GeomFromWKB(geometry) AS geom FROM gadm"
    ).drop("geometry")

    # Re-point the view at the parsed frame so later queries see `geom`.
    gadm_df.createOrReplaceTempView("gadm")
    return gadm_df

# Load the country polygons; get_gadm_df also registers them as the "gadm" SQL view.
gadm_df = get_gadm_df(spark, gadm_dir)



# NOTE(review): this previews shots_df, not gadm_df — it looks like a leftover
# check of the shots frame; confirm it is still wanted in this cell.
shots_df.show(n=10)

In [None]:
# Preview the first ten GADM rows.
gadm_df.show(10)

In [None]:
# Attach the country code to each shot: a shot is matched to the GADM polygon
# that contains its t2 geometry. The INNER JOIN keeps only matched shots.
gadm_join_query = """
    SELECT s.*, f.iso AS country
    FROM gedi_shots as s INNER JOIN gadm as f
    ON ST_Contains(f.geom, s.t2_geom)
"""

# Swap in the joined frame and refresh the "gedi_shots" view so the following
# queries see the new column.
shots_df = spark.sql(gadm_join_query)
shots_df.createOrReplaceTempView("gedi_shots")

In [None]:
# Preview the country column added by the join for the first two shots.
shots_df.select('country').show(2)