In [None]:
# %%cleanup -f

In [None]:
# spark

In [None]:
# %stop_session

##### Importamos todas las funciones de pyspark que vamos a necesitar 

In [3]:
from pyspark.sql.window import Window
from pyspark.sql.functions import col, lag, datediff, last, when, date_add, udf, lit, desc, row_number, mean, sum
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import OneHotEncoder, StringIndexer

BUCKET_NAME = "marcial-tmf-oct22-msf-data"




#### Contacts

##### Cargamos en un dataframe de Spark el fichero de Contactos de MSF

In [3]:
df_con = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/contact/MSF_Contact.parquet")
)




##### Renombramos la columna id a con_id para que no haya ambiguedades en los merges

In [4]:
df_con = df_con.withColumnRenamed("id","con_id")




#### Tasks

##### Cargamos en un dataframe de Spark el fichero de Tareas de MSF

In [5]:
df_tasks = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/task/MSF_Task.parquet")
)




##### Renombramos la columna id a task_id para que no haya ambiguedades en los merges

In [6]:
df_tasks = df_tasks.withColumnRenamed("id","task_id")




In [9]:
df_tasks.printSchema()

root
 |-- task_id: string (nullable = true)
 |-- activitydate: date (nullable = true)
 |-- isarchived: boolean (nullable = true)
 |-- isdeleted: boolean (nullable = true)
 |-- lastmodifieddate: timestamp (nullable = true)
 |-- msf_attributefive__c: string (nullable = true)
 |-- msf_attributefour__c: string (nullable = true)
 |-- msf_attributeone__c: string (nullable = true)
 |-- msf_attributesix__c: string (nullable = true)
 |-- msf_attributethree__c: string (nullable = true)
 |-- msf_attributetwo__c: string (nullable = true)
 |-- msf_campaign__c: string (nullable = true)
 |-- msf_channel__c: string (nullable = true)
 |-- msf_closedescription__c: string (nullable = true)
 |-- msf_closetype__c: string (nullable = true)
 |-- msf_inboundoutbound__c: string (nullable = true)
 |-- msf_leadsource__c: string (nullable = true)
 |-- msf_objective__c: string (nullable = true)
 |-- msf_owner__c: string (nullable = true)
 |-- msf_proactivereactive__c: string (nullable = true)
 |-- msf_productprogr

#### MERGE CON CONTACTS

##### Mergeamos las tablas de contactos y tareas por el id de contacto para tener en un mismos dataframe los datos de ambos ficheros. Seleccionamos las columnas que consideramos apropiadas para llevar a cabo con ellas un posterior análisis

In [38]:
df_tasks_cons = (
        df_con
        .join(df_tasks, df_con.con_id == df_tasks.whoid, 'inner')
    .select(
        df_con.msf_seniority__c, df_con.msf_birthyear__c, df_con.msf_entrycampaign__c, df_con.npo02__best_gift_year__c, df_con.npo02__averageamount__c,
        df_con.msf_begindatemsf__c, df_con.msf_datefirstrecurringdonorquota__c, df_con.gender__c, df_con.msf_ltvcont__c,
        df_con.msf_ltvdesc__c, df_con.msf_recencyrecurringdonorcont__c, df_con.msf_recencytotalcont__c, df_con.msf_recencytotalscore__c, df_con.msf_rfvrecurringdonor__c, 
        df_con.npo02__totaloppamount__c, df_con.msf_valuetotalcont__c, df_con.msf_valuetotaldesc__c, df_con.msf_lifetime__c, df_con.msf_pressurecomplaint__c, 
        df_con.msf_scoringrfvrecurringdonor__c, df_con.msf_scoringrvtotal__c, df_con.msf_averagedonorvalue__c, df_con.msf_percomssummary__c,
        df_con.msf_totalfiscaloppamount__c, df_con.msf_lastannualizedquota__c, df_con.msf_maximumdonorvalue__c,
        df_tasks.whoid, df_tasks.activitydate, df_tasks.msf_channel__c, df_tasks.msf_inboundoutbound__c, df_tasks.msf_closetype__c
    ).where((df_con.con_id == df_tasks.whoid) &
            (df_con.msf_isactiverecurringdonor__c == 'Socio') &
            (df_con.msf_percomssummary__c.isin(['Todo','Varios'])) &
            (df_tasks.subject == "MSF Aumento de Cuota") &
            (df_tasks.isdeleted == False))
)




##### Creamos una columna nueva en la que computamos los días que han pasado respecto a la tarea anterior basándonos en la columna activitydate de la tabla de tareas. Lo hacemos así porque pensamos que es interesante disponer de una columna que nos indique los días transcurridos desde la última interacción con el socio.

In [39]:
windowSpec = Window.partitionBy("whoid").orderBy("activitydate")
  
df_tasks_cons = df_tasks_cons.withColumn("previous_modification_date", lag("activitydate").over(windowSpec)) \
    .withColumn("days_elapsed_since_previous_modification", datediff(col("activitydate"), col("previous_modification_date")))




-----------------------------------------------------------------------------------

#### Recurring Donations

##### Cargamos en un dataframe de Spark la tabla de donaciones periódicas

In [14]:
df_rec = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/recurringdonation/MSF_RecurringDonation.parquet")
)




##### Renombramos el id como en los casos anteriores

In [15]:
df_rec = df_rec.withColumnRenamed("id","rec_id")




##### Cargamos en un dataframe de Spark la tabla de modificaciones de cuota por parte de los socios

In [16]:
df_quo = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/quotamodification/MSF_QuotaModification.parquet")
)




##### Renombramos el id como en los casos anteriores

In [17]:
df_quo = df_quo.withColumnRenamed("id","quo_id")




##### Hacemos el merge entre las tablas de donaciones periódicas, la de modificaciones de cuotas y la de contactos (para asociar el id del socio) Seleccionamos las columnas que consideramos apropiadas para el posterior análisis

In [18]:
df_rec_quo = (
        df_con
        .join(df_rec, df_con.con_id == df_rec.npe03__contact__c, 'inner')
        .join(df_quo, df_con.con_id == df_quo.msf_contactid__c, 'inner')
    .select(
        df_con.con_id, 
        df_rec.msf_annualizedquota__c, df_rec.npe03__total_paid_installments__c, df_rec.npe03__installment_period__c,
        df_quo.msf_changedate__c, df_quo.msf_changetype__c, df_quo.msf_leadsource3__c, 
        df_quo.msf_newamount__c, df_quo.msf_newannualizedquota__c, df_quo.msf_newrecurringperiod__c
    ).where((df_con.con_id == df_rec.npe03__contact__c) & 
            (df_con.con_id == df_quo.msf_contactid__c) &
            (df_rec.rec_id == df_quo.msf_recurringdonation__c) &
            (df_con.msf_percomssummary__c.isin(['Todo','Varios'])) &      
            (df_rec.npe03__open_ended_status__c == 'Open') &
            (df_rec.isdeleted == False) &
            (df_quo.isdeleted == False))
)




##### A continuación utilizamos las funciones lag y datediff para meter en cada registro una columna con la fecha del cambio de cuota anterior y los datos con la cuota del anterior cambio. La justificación es que parece interesante tener esta información en todos los registros para luego rellenar información en campos que se hayan quedado en blanco al hacer el siguiente merge que queremos hacer.

In [21]:
windowSpec = Window.partitionBy("con_id").orderBy(col("msf_changedate__c"))

df_rec_quo = df_rec_quo.withColumn("previous_change_date", lag("msf_changedate__c").over(windowSpec)) \
    .withColumn("days_elapsed_since_previous_change", datediff(col("msf_changedate__c"), col("previous_change_date"))) \
    .withColumn("msf_oldamount__c", lag(col("msf_newamount__c")).over(windowSpec))  \
    .withColumn("msf_oldannualizedquota__c", lag(col("msf_newannualizedquota__c")).over(windowSpec))  \
    .withColumn("msf_oldrecurringperiod__c", lag(col("msf_newrecurringperiod__c")).over(windowSpec))




##### Dado que no hay forma de cruzar directamente el fichero de tareas con el de modificaciones de cuota lo que hacemos es crear un left join desde tareas en el que el cruce se hace por id de cliente (por supuesto) y de forma que la diferencia de fechas entre la de la tarea y la siguiente modificación de cuota en la línea temporal sea menor de 60 días. Esto lo hacemos así porque la fechas de una tarea que resulta positiva en la apmpliación de cuota no tiene repercusión en la propia tabla de modificaciones de cuota hasta unos días después, dependiendo de la periodicidad de la cuota. Obviamente para periodicidades superiores a la mensual puede que los registros no se asocien correctamente. Por lo que hemos visto el fichero de Tareas comenzó a rellenarse sobre el año 2014 con posterioridad al de donaciones periódicas. Esto hace que en el siguiente merge se pierdan registros, pero es preferible que sea así porque dichos registros corresponderían únicamente a interacciones positivas (incrementos de cuota) y haría que la muestra final no estuviera balanceada

In [40]:
df_merge = (
        df_tasks_cons
        .join(df_rec_quo, (df_rec_quo.msf_changedate__c > df_tasks_cons.activitydate) &
                          (df_rec_quo.msf_changedate__c <= date_add(df_tasks_cons.activitydate, 60)) &
                          (df_rec_quo.con_id == df_tasks_cons.whoid)
             , 'left')
)




In [41]:
(
  df_merge.coalesce(1).write
        .format("parquet")
        .mode("overwrite")
        .save(f"s3://{BUCKET_NAME}/z_output/merge_3")
)




##### Rellenamos determinadas columnas que hayan quedado a null con los datos del registro anterior que las tenga rellenas.

In [42]:
window_spec = Window.partitionBy("whoid").orderBy(col("activitydate"),col("previous_modification_date"))

columns_to_fill = ["msf_newamount__c", "msf_newannualizedquota__c", "msf_newrecurringperiod__c",
                   "previous_change_date", "msf_oldamount__c", "msf_oldannualizedquota__c", "msf_oldrecurringperiod__c"]

# Use the last function to fill null values with values from the previous row within the same ID
for col in columns_to_fill:
    df_merge = df_merge.withColumn(col, last(col, True).over(window_spec))




##### Hacemos lo mismo pero esta vez en sentido inverso temporalmente.

In [43]:
from pyspark.sql.functions import col
window_spec = Window.partitionBy("whoid").orderBy(col("activitydate").desc())

columns_to_fill = ["previous_change_date", "msf_oldamount__c", "msf_oldannualizedquota__c", "msf_oldrecurringperiod__c"]

# Use the last function to fill null values with values from the previous row within the same ID
for col in columns_to_fill:
    df_merge = df_merge.withColumn(col, last(col, True).over(window_spec))




##### Creamos una columna que nos indique los días transcurridos desde el último cambio de cuota.

In [44]:
from pyspark.sql.functions import datediff, col

df_merge = df_merge.withColumn("days_elapsed_since_previous_quota_change", datediff(col("activitydate"), col("previous_change_date")))




##### Rellenamos también valores de columnas que hayan quedado en blanco para algunos de los campos que nos interesan

In [45]:
from pyspark.sql.functions import coalesce

df_merge = df_merge.withColumn("days_elapsed_since_previous_modification", coalesce(col("days_elapsed_since_previous_modification"),col("days_elapsed_since_previous_quota_change"))) \
                   .withColumn('msf_newamount__c', coalesce(col('msf_newamount__c'),col('msf_oldamount__c'))) \
                   .withColumn('msf_newannualizedquota__c', coalesce(col('msf_newannualizedquota__c'),col('msf_oldannualizedquota__c'))) \
                   .withColumn('msf_newrecurringperiod__c', coalesce(col('msf_newrecurringperiod__c'),col('msf_oldrecurringperiod__c')))                                




##### Guardamos en S3 este primer dataframe

In [46]:
(
  df_merge.coalesce(1).write
        .format("parquet")
        .mode("overwrite")
        .save(f"s3://{BUCKET_NAME}/z_output/merge_3_raw")
)




### --------------------------------------------------------------------------------

##### Cargamos de nuevo desde S3 el dataframe

In [4]:
df_merge_raw = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/z_output/merge_3_raw")
)




In [5]:
df_merge_raw.show(1)

+----------------+----------------+--------------------+------------------------+-----------------------+-------------------+-----------------------------------+---------+--------------+------------------+--------------------------------+-----------------------+------------------------+------------------------+------------------------+---------------------+---------------------+---------------+------------------------+-------------------------------+---------------------+------------------------+---------------------+---------------------------+--------------------------+------------------------+------------------+------------+--------------+----------------------+----------------+--------------------------+----------------------------------------+------+----------------------+---------------------------------+----------------------------+-----------------+-----------------+------------------+----------------+-------------------------+-------------------------+--------------------+----

##### Transformamos el año de nacimiento en la edad del socio

In [6]:
# Calculate average birth year excluding zeros
avg_birthyear = df_merge_raw.filter(col("msf_birthyear__c") != 0).agg(mean("msf_birthyear__c")).collect()[0][0]

# Replace 0 with average birth year
df_merge_raw = df_merge_raw.withColumn("msf_birthyear__c", when(col("msf_birthyear__c") == 0, avg_birthyear).otherwise(col("msf_birthyear__c")))

# Calculate age
currentYear = 2023
df_merge_raw = df_merge_raw.withColumn("age", lit(currentYear) - col("msf_birthyear__c"))




##### Eliminamos columnas que entendemos no deben formar parte de las features

In [7]:
# Eliminamos campos que no deben estar entre las features
columnsToDrop = ["msf_birthyear__c", "msf_entrycampaign__c", "msf_begindatemsf__c", "msf_datefirstrecurringdonorquota__c", 
                 "msf_inboundoutbound__c", "previous_purchase_date","con_id", "msf_changedate__c", "msf_leadsource3__c",
                 "msf_ltvdesc__c","msf_changedate__c","previous_change_date", "msf_annualizedquota__c", 
                 "days_elapsed_since_previous_change","npe03__installment_period__c"]
df_merge_raw = df_merge_raw.drop(*columnsToDrop)




In [11]:
desired_column_order = ['whoid', 'activitydate', 'msf_seniority__c', 'npo02__best_gift_year__c','npo02__averageamount__c', 'gender__c', 'msf_ltvcont__c', 'msf_recencyrecurringdonorcont__c', 
                        'msf_rfvrecurringdonor__c', 'npo02__totaloppamount__c', 'msf_valuetotalcont__c', 'msf_valuetotaldesc__c', 'msf_lifetime__c', 'msf_pressurecomplaint__c', 
                        'msf_scoringrfvrecurringdonor__c', 'msf_averagedonorvalue__c', 'msf_percomssummary__c', 'msf_channel__c', 'days_elapsed_since_previous_modification', 
                        'msf_recencytotalcont__c', 'msf_recencytotalscore__c', 'msf_scoringrvtotal__c', 'npe03__total_paid_installments__c', 'msf_newamount__c', 
                        'msf_newannualizedquota__c', 'msf_newrecurringperiod__c', 'msf_oldamount__c', 'msf_oldannualizedquota__c',
                        'msf_totalfiscaloppamount__c', 'msf_lastannualizedquota__c', 'msf_maximumdonorvalue__c',
                        'msf_oldrecurringperiod__c', 'days_elapsed_since_previous_quota_change', 'age', 'msf_closetype__c']  # Replace with your desired column order
df_merge_raw = df_merge_raw.select(*desired_column_order)




In [9]:
df_merge_raw.show(1)

+------------------+------------+----------------+------------------------+-----------------------+---------+--------------+--------------------------------+------------------------+------------------------+---------------------+---------------------+---------------+------------------------+-------------------------------+------------------------+---------------------+--------------+----------------------------------------+-----------------------+------------------------+---------------------+---------------------------------+----------------+-------------------------+-------------------------+----------------+-------------------------+------------------------+---------------------------+--------------------------+---------------------+------------------------+-------------------------+----------------------------------------+----+----------------+
|             whoid|activitydate|msf_seniority__c|npo02__best_gift_year__c|npo02__averageamount__c|gender__c|msf_ltvcont__c|msf_recencyrecu

In [None]:
(
  df_merge_raw.coalesce(1).write
        .format("parquet")
        .mode("overwrite")
        .save(f"s3://{BUCKET_NAME}/z_output/merge_3_selection")
)

##### Transformamos la columna gender a numérico

In [52]:
df_merge_raw = df_merge_raw.withColumn(
    "gender__c",
    when(col("gender__c") == "H", "Male")
    .when(col("gender__c") == "M", "Female")
    .when(col("gender__c") == "", "Other")
)




In [65]:
df_merge_raw.select([sum(when(col(c).isNull(), 1)).alias(c) for c in df_merge_raw.columns]).show()

+-----+------------+----------------+-----------------------+---------+--------------+--------------------------------+------------------------+------------------------+---------------------+---------------------+---------------+------------------------+-------------------------------+------------------------+---------------------+--------------+----------------------------------------+----------------------+---------------------------------+----------------------------+----------------+-------------------------+-------------------------+----------------------------------+----------------+-------------------------+-------------------------+----------------------------------------+------+----------------+
|whoid|activitydate|msf_seniority__c|npo02__averageamount__c|gender__c|msf_ltvcont__c|msf_recencyrecurringdonorcont__c|msf_rfvrecurringdonor__c|npo02__totaloppamount__c|msf_valuetotalcont__c|msf_valuetotaldesc__c|msf_lifetime__c|msf_pressurecomplaint__c|msf_scoringrfvrecurringdonor__c|

##### Transformamos la valor del socio a numérico

In [33]:
#msf_valuetotaldesc__c >> ["Muy bajo","Nulo","Medio","Bajo","Alto","Muy Alto"]

# Define a mapping from categories to integers
value_mapping = {'Nulo': 0, 'Muy bajo': 1, 'Bajo': 2, 'Medio': 3, 'Alto': 4, 'Muy Alto': 5}

# Define a udf to map categories to integers using the dictionary
mapping_udf = udf(lambda value: value_mapping[value], IntegerType())

# Apply the udf to the DataFrame
df_task_merge = df_task_merge.withColumn('msf_valuetotaldesc__c', mapping_udf(df_task_merge['msf_valuetotaldesc__c']))




##### Transformamos la periodicidad de la cuota de socio a numérico

In [34]:
#msf_newrecurringperiod__c >> ["Monthly","Yearly","Quarterly","Semestral","Bimensual",""]
df_task_merge = df_task_merge.fillna({"msf_newrecurringperiod__c": '1'})

# Define a mapping from categories to integers
period_mapping = {'': 1, 'Monthly': 1, 'Bimensual': 2, 'Quarterly': 3, 'Semestral': 4, 'Yearly': 5}

# Define a udf to map categories to integers using the dictionary
mapping_udf = udf(lambda period: period_mapping[period], IntegerType())

# Apply the udf to the DataFrame
df_task_merge = df_task_merge.withColumn('msf_newrecurringperiod__c', mapping_udf(df_task_merge['msf_newrecurringperiod__c']))




##### Nos quedamos con los canales que consideramos relevantes y al resto les ponemos un valor genérico

In [35]:
# msf_channel__c >> ["Correo Postal","Llamada","E-mail","Online","SMS","Formularios Web","Presencial","Mi Perfil","n/a","Interno","Mensajería Instantánea","Generación Aplicación","Fichero Informático","Fax"]

# Define the condition and replacement values
condition = ~df_task_merge["msf_channel__c"].isin(["Correo Postal","Llamada","E-mail","Online","SMS","Formularios Web"])
replacement_value = "Otros"

# Use the 'when' function to replace values based on the condition
df_task_merge = df_task_merge.withColumn("msf_channel__c", when(condition, replacement_value).otherwise(df_task_merge["msf_channel__c"]))




##### Transformamos el canal a numérico aunque entiendo que deberíamos usar one hot encoder

##### Consideramos negativas todas las interacciones que no son expresamente positivas

In [37]:
# Define the condition and replacement values
condition = df_task_merge["msf_closetype__c"] != "Positivo"
replacement_value = "Negativo"

# Use the 'when' function to replace values based on the condition
df_task_merge = df_task_merge.withColumn("msf_closetype__c", when(condition, replacement_value).otherwise(df_task_merge["msf_closetype__c"]))




##### Configuramos el orden en el que queremos las columnas en el dataset

In [None]:
desired_column_order = ['whoid', 'activitydate', 'msf_seniority__c', 'npo02__averageamount__c', 'msf_datefirstrecurringdonorquota__c', 
                        'gender__c', 'msf_ltvcont__c', 'msf_recencyrecurringdonorcont__c', 'msf_rfvrecurringdonor__c', 'npo02__totaloppamount__c', 
                        'msf_valuetotalcont__c', 'msf_valuetotaldesc__c', 'msf_lifetime__c', 'msf_pressurecomplaint__c', 'msf_scoringrfvrecurringdonor__c', 
                        'msf_averagedonorvalue__c', 'msf_percomssummary__c', 'msf_channel__c', 'days_elapsed_since_previous_modification', 'con_id', 'msf_annualizedquota__c', 'npe03__total_paid_installments__c', 'npe03__installment_period__c', 'msf_changedate__c', 'msf_changetype__c', 'msf_leadsource3__c', 'msf_newamount__c', 
                        'msf_newannualizedquota__c', 'msf_newrecurringperiod__c', 'days_elapsed_since_previous_change', 'msf_oldamount__c', 'msf_oldannualizedquota__c',
                        'msf_oldrecurringperiod__c', 'days_elapsed_since_previous_quota_change', 'age', 'msf_closetype__c']  # Replace with your desired column order
df_task_merge = df_task_merge.select(*desired_column_order)

In [24]:
df_task_merge.count()

2098989


In [25]:
df_task_merge.where(df_task_merge.msf_newrecurringperiod__c!=0).count()

1124975


##### Vemos que hay numerosos registros que no tienen periodicidad asociada. En principio prescindimos de ellos.

In [39]:
df_task_merge = df_task_merge.where(df_task_merge.msf_newrecurringperiod__c != 0)




##### Grabamos en S3 el fichero resultante, en el que tenemos las features, el id del contacto y la fecha de la tarea

In [40]:
(
  df_task_merge.write
        .format("parquet")
        .mode("overwrite")
        .save(f"s3://{BUCKET_NAME}/output/task_merge_id_date")
)




También en csv

In [41]:
(
  df_task_merge.coalesce(1).write
    .format("csv")
    .option("compression", "gzip")
    .option("header", True)
    .mode("overwrite")
    .save(f"s3://{BUCKET_NAME}/output/task_merge_id_date_csv")
)




### --------------------------------------------------------------------------------

##### Cargamos de nuevo el fichero

In [6]:
df_feature_store = (
  spark.read
    .option("header", "true")
    .option("quote", "\"")
    .option("escape" , "\"")   
    .format("parquet")
    .load(f"s3://{BUCKET_NAME}/output/task_merge_id_date")
)




##### Para crear un fichero de features o feature store nos quedamos con el registro más reciente para cada contacto

In [7]:
# Define a window spec
windowSpec = Window.partitionBy("whoid").orderBy(desc("activityDate"))

# Use the row_number function to assign a row number to each record in the window
df_feature_store = df_feature_store.withColumn("row_num", row_number().over(windowSpec))

# Filter for the most recent record for each customerId
df_feature_store = df_feature_store.filter(col("row_num") == 1).drop("row_num")




##### Y lo guardamos también

In [9]:
(
  df_feature_store.coalesce(1).write
    .format("csv")
    .option("compression", "gzip")
    .option("header", True)
    .mode("overwrite")
    .save(f"s3://{BUCKET_NAME}/output/feature_store_csv")
)




### --------------------------------------------------------------------------------

##### Obtenemos otro data frame pero esta vez sólo con las features que utilizaremos en el modelo. 

In [42]:
# Eliminamos id y date
columnsToDrop = ["whoid", "activitydate"]
df_task_merge_clean = df_task_merge.drop(*columnsToDrop)




##### Y lo salvamos para su uso posterior

In [43]:
(
  df_task_merge_clean.write
        .format("parquet")
        .mode("overwrite")
        .save(f"s3://{BUCKET_NAME}/output/task_merge_clean")
)




In [44]:
(
  df_task_merge_clean.coalesce(1).write
    .format("csv")
    .option("compression", "gzip")
    .option("header", True)
    .mode("overwrite")
    .save(f"s3://{BUCKET_NAME}/output/task_merge_clean_csv")
)


