In [None]:
import pandas as pd
import dlt
from pyspark.sql.functions import *
import pyspark.pandas as ps
#from trase.tools.aws.aws_helpers_cached import get_pandas_df_once
#from trase.tools.aws.metadata import write_csv_for_upload
# Session-level Spark settings, applied once before any table is defined.
# NOTE(review): judging by the key names, these relax Delta schema enforcement
# (automatic schema merge / schema overwrite) — confirm that both keys are
# honoured by the runtime this pipeline targets.
spark.conf.set("spark.databricks.delta.schema.autoMerge.enabled","true")
spark.conf.set("spark.databricks.delta.schema.overwriteSchema.enabled","true")
#spark.conf.set("spark.sql.execution.arrow.pyspark.fallback.enabled","false")

In [None]:
def get_pandas_df_once(
    key,
    bucket="s3a://uutrase/data/trase-uppsala/",
    version_id=None,
    client=None,
    track=True,
    sep=";",
    encoding="utf8",
    xlsx=False,
    **kwargs,
):
    """Return the pipeline table that corresponds to ``key`` as a pandas DataFrame.

    The table is looked up by the base file name of ``key`` with its extension
    removed, e.g. ``"a/b/CD_ECUADOR_2013.csv"`` -> table ``"CD_ECUADOR_2013"``,
    and read with ``dlt.read``.

    Args:
        key: Object key / path of the dataset; only its base file name is used.
        bucket, version_id, client, track, sep, encoding, xlsx, **kwargs:
            Accepted so existing call sites keep working; none of them is used
            by this implementation.

    Returns:
        pandas.DataFrame with the full contents of the table (collected to the
        driver via pandas-on-Spark ``to_pandas``).
    """
    # Take the base name *before* stripping the extension: splitting on "."
    # first would truncate the key at a dot inside a directory name
    # (e.g. "v1.2/ponds.csv" would resolve to "v1" instead of "ponds").
    table_name = key.rsplit("/", 1)[-1].split(".")[0]
    return ps.DataFrame(dlt.read(table_name)).to_pandas()

def get_df_once(
    key,
    bucket="s3a://uutrase/data/trase-uppsala/",
    version_id=None,
    client=None,
    track=True,
    sep=";",
    encoding="utf8",
    xlsx=False,
    **kwargs,
):
    """Return the pipeline table that corresponds to ``key`` as a pandas DataFrame.

    Despite the shorter name this returns a *pandas* DataFrame, exactly like
    ``get_pandas_df_once``: the table named after the base file name of ``key``
    (extension removed) is read with ``dlt.read`` and collected to the driver.

    Args:
        key: Object key / path of the dataset; only its base file name is used.
        bucket, version_id, client, track, sep, encoding, xlsx, **kwargs:
            Accepted so existing call sites keep working; none of them is used
            by this implementation.

    Returns:
        pandas.DataFrame with the full contents of the table.
    """
    # Take the base name *before* stripping the extension: splitting on "."
    # first would truncate the key at a dot inside a directory name
    # (e.g. "v1.2/ponds.csv" would resolve to "v1" instead of "ponds").
    table_name = key.rsplit("/", 1)[-1].split(".")[0]
    return ps.DataFrame(dlt.read(table_name)).to_pandas()
    

def get_df(
    key,
    bucket="s3a://uutrase/data/trase-uppsala/",
    version_id=None,
    client=None,
    track=True,
    sep=";",
    encoding="utf8",
    xlsx=False,
    **kwargs,
):
    """Load a delimited text file or an Excel workbook as a Spark DataFrame.

    The file is read from ``bucket + key`` with a header row and multi-line
    values enabled.

    Args:
        key: Path of the file relative to ``bucket``.
        bucket: Location prefix that ``key`` is appended to.
        sep: Field delimiter passed to the reader.
        encoding: Character encoding passed to the reader.
        xlsx: When true, read through the ``com.crealytics.spark.excel`` data
            source; otherwise read as CSV.
        version_id, client, track, **kwargs: Accepted for signature
            compatibility; not used by this implementation.

    Returns:
        The Spark DataFrame produced by the reader (not a pandas DataFrame).
    """
    source_path = bucket + key

    # Both formats are configured with the same reader options.
    reader = (
        spark.read
        .option("delimiter", sep)
        .option("encoding", encoding)
        .option("header", "true")
        .option("multiline", "true")
    )

    if xlsx:
        return reader.format("com.crealytics.spark.excel").load(source_path)
    return reader.csv(source_path)
    
def write_csv_for_upload(
    df: "pd.DataFrame",
    key: str,
    script_path: str = None,
    path: str = None,
    metadata_path: str = None,
    metadata_key: str = None,
    bucket: str = "trase-storage",
    do_upload: bool = None,
    upstream=None,
    **pd_to_csv_kwargs,
):
    """Placeholder that accepts an upload request and does nothing.

    Every argument is ignored: nothing is written or uploaded. The function
    exists so that scripts calling ``write_csv_for_upload`` keep running in
    this notebook without modification.

    Returns:
        None.
    """
    return None
    
    
    

In [None]:
"""
Method 2.
Industry (SeafoodTIP) and government sources tell us that exports are 95% of production.
Therefore we divide by 0.95.
This spreads the estimate of the total production (rather than just the exports) across the ponds, proportional to their area.
<< Eqn: production = ((pond_area / total_area) * exports_for_year) / 0.95  >>

This is given in columns: AVG_PROD_2013 ... to 2019
"""

def run_script_combined():
    """Estimate shrimp production per parish and year from pond area and exports.

    For every year from 2013 to 2019 the export table ``CD_ECUADOR_<year>`` is
    filtered to the shrimp HS codes, its net weight is converted from kg to
    tonnes and summed. The yearly total is divided by 0.95 (exports are taken
    to be 95% of production) and spread across ponds in proportion to
    ``AREA_HA``. Pond-level estimates are then summed per parish.

    Returns:
        pandas-on-Spark DataFrame with columns ``PARISH_TRASE_ID``, ``YEAR``
        (as a string), ``avg_produced_tons`` and ``AREA_HA``, one row per
        parish and year.

    Raises:
        KeyError: If an expected column is missing from an input table.
        ValueError: If the total pond area is not a positive number, in which
            case the allocation would divide by zero.
        AssertionError: If the production allocated to parishes in a year is
            lower than that year's exports.
    """
    HS_CODES = ["030616", "030617", "030635", "030636", "030695", "160521", "160529"]
    YEARS = list(range(2013, 2020))
    EXPORT_SHARE_OF_PRODUCTION = 0.95
    OUTPUT_KEY = (
        "ecuador/production/crop_maps/production/shrimp_pond_maps/out/"
        "ec_production_per_parish.csv"
    )

    pond_df = get_pandas_df_once(
        "ecuador/production/crop_maps/production/shrimp_pond_maps/out/ec_shrimp_ponds.csv"
    )
    # Keep only the columns the estimate needs and make the area numeric.
    # A new frame is built so the loaded table is never modified in place.
    pond_df = pond_df[["POND_TRASE_ID", "PARISH_TRASE_ID", "AREA_HA"]].assign(
        AREA_HA=pond_df["AREA_HA"].astype(float)
    )

    # The pond table does not change between years, so sum the area once.
    total_area = pond_df["AREA_HA"].sum()
    if not total_area > 0:
        raise ValueError(
            f"Total pond area must be positive to allocate production, got {total_area!r}"
        )

    combined = []
    for year in YEARS:
        flows_df = get_pandas_df_once(
            f"ecuador/trade/cd/export/{year}/CD_ECUADOR_{year}.csv",
            dtype=str,
            keep_default_na=False,
        )
        flows_df = flows_df.rename(
            columns={
                "TOTAL.Net.Weight..Kg.": "volume_raw",
                "Harmonized.Code.Product.Spanish": "hs6",
            },
            errors="raise",
        )

        # Select the shrimp rows and convert in one expression rather than
        # assigning a column on a filtered slice (SettingWithCopy hazard).
        is_shrimp = flows_df["hs6"].isin(HS_CODES)
        exports_tons = (flows_df.loc[is_shrimp, "volume_raw"].astype(float) / 1_000).sum()

        # Production per hectare for this year; multiplying the whole column by
        # the scalar replaces the row-by-row apply with identical arithmetic.
        tons_per_ha = (exports_tons / EXPORT_SHARE_OF_PRODUCTION) / total_area
        ponds_for_year = pond_df.assign(
            avg_produced_tons=pond_df["AREA_HA"] * tons_per_ha,
            YEAR=str(year),
        )

        # Aggregate pond estimates to parish level.
        parish_df = (
            ponds_for_year.groupby(["PARISH_TRASE_ID", "YEAR"])[
                ["avg_produced_tons", "AREA_HA"]
            ]
            .sum()
            .reset_index()
        )
        allocated_tons = parish_df["avg_produced_tons"].sum()
        assert allocated_tons >= exports_tons, (
            f"{year}: allocated production ({allocated_tons}) is below exports ({exports_tons})"
        )
        combined.append(parish_df)

    # ignore_index avoids repeating the 0..n-1 labels of every yearly frame.
    combined_df = pd.concat(combined, ignore_index=True)

    write_csv_for_upload(combined_df, OUTPUT_KEY)
    return ps.DataFrame(combined_df)
def run_script_area():
    """Return the pond area per parish, taken from the 2019 production rows.

    Reads the ``ec_production_per_parish`` table, keeps the rows whose ``YEAR``
    equals ``"2019"`` and projects them to ``PARISH_TRASE_ID`` and ``AREA_HA``.

    Returns:
        pandas-on-Spark DataFrame with columns ``PARISH_TRASE_ID`` and
        ``AREA_HA``.
    """
    # YEAR is compared as text: the production table stores it as a string.
    REFERENCE_YEAR = "2019"
    OUTPUT_KEY = (
        "ecuador/production/crop_maps/production/shrimp_pond_maps/out/"
        "ec_shrimp_area_per_parish.csv"
    )

    production_df = get_df_once(
        "ecuador/production/crop_maps/production/shrimp_pond_maps/out/ec_production_per_parish.csv"
    )
    # Filter rows and pick columns in a single .loc selection.
    area_df = production_df.loc[
        production_df["YEAR"] == REFERENCE_YEAR, ["PARISH_TRASE_ID", "AREA_HA"]
    ]

    write_csv_for_upload(area_df, OUTPUT_KEY)
    return ps.DataFrame(area_df)

# Source files that are exposed as pipeline tables: one export file per year,
# followed by the shrimp pond map.
YEARS = list(range(2013, 2020))
table_names = [
    f"ecuador/trade/cd/export/{year}/CD_ECUADOR_{year}.csv" for year in YEARS
]
table_names.append(
    "ecuador/production/crop_maps/production/shrimp_pond_maps/out/ec_shrimp_ponds.csv"
)
#for name in table_names:
#    @dlt.table(name = name.split(".")[0].split("/")[-1])
#    def create_table():
#        return get_df(name)
def get_table_name(longname):
    """Derive a table name from a file path: base file name without extension.

    Example: ``"a/b/CD_ECUADOR_2013.csv"`` -> ``"CD_ECUADOR_2013"``.

    Args:
        longname: Slash-separated path or object key of a file.

    Returns:
        The part of the last path component that precedes its first dot.
    """
    # Isolate the file name before stripping the extension; splitting on "."
    # first would cut the path at a dot inside a directory name
    # (e.g. "v1.2/ponds.csv" would yield "v1" instead of "ponds").
    base_name = longname.rsplit("/", 1)[-1]
    return base_name.split(".")[0]

In [None]:
from pyspark.sql.types import *

def _register_source_table(source_key):
    # Declare one pipeline table that loads `source_key` with get_df and is
    # named after the file's base name.
    # The key is passed as a function argument so that every table keeps its
    # own value; defining the decorated function directly inside a loop would
    # make all of them read the loop variable's final value.
    @dlt.table(name=get_table_name(source_key))
    def load_source_table():
        return get_df(source_key)

    return load_source_table


# One table per raw input file listed in `table_names`.
for _source_key in table_names:
    _register_source_table(_source_key)
    
#,     
# Explicit column types for the parish-level production table.
_PARISH_PRODUCTION_SCHEMA = (
    StructType()
    .add("PARISH_TRASE_ID", StringType())
    .add("YEAR", StringType())
    .add("avg_produced_tons", DoubleType())
    .add("AREA_HA", DoubleType())
)


@dlt.table(name="ec_production_per_parish", schema=_PARISH_PRODUCTION_SCHEMA)
def eppp():
    # Table body: the estimated production per parish and year, as computed by
    # run_script_combined.
    parish_production = run_script_combined()
    return parish_production

@dlt.table(name="ec_shrimp_area_per_parish")
def sapp():
    # Table body: pond area per parish, which run_script_area derives from the
    # ec_production_per_parish table.
    parish_area = run_script_area()
    return parish_area