In [None]:
# %%cleanup -f

In [None]:
# spark

In [None]:
# %stop_session

##### Importamos todas las funciones de pyspark que vamos a necesitar 

In [24]:
import builtins

from pyspark.ml.feature import OneHotEncoder, StringIndexer
from pyspark.sql.functions import col, lag, datediff, last, when, date_add, udf, lit, desc, row_number, mean, sum, year, isnull, months_between, to_date, round
from pyspark.sql.types import IntegerType
from pyspark.sql.window import Window

# S3 bucket used in every s3:// path below: the input tables are read from it
# and the final feature set is written back to it.
BUCKET_NAME = "marcial-tmf-oct22-msf-data"




#### Contacts

##### Cargamos en un dataframe de Spark el fichero de Contactos de MSF

In [2]:
# Load the MSF contacts table from S3.
# The file is Parquet, so the CSV-only reader options (header / quote / escape)
# that used to be set here had no effect and have been dropped.
df_con = spark.read.parquet(f"s3://{BUCKET_NAME}/contact/MSF_Contact.parquet")




##### Renombramos la columna id a con_id (y msf_annualizedquota__c a msf_annualizedquota__c_con) para que no haya ambigüedades en los joins

In [3]:
# Give the contact key and the annualized quota contact-specific names so they
# stay unambiguous once this frame is joined with the other tables.
df_con = (
    df_con
    .withColumnRenamed("id", "con_id")
    .withColumnRenamed("msf_annualizedquota__c", "msf_annualizedquota__c_con")
)




#### Recurring Donations

##### Cargamos en un dataframe de Spark la tabla de donaciones periódicas

In [4]:
# Load the recurring-donations table from S3.
# The file is Parquet, so the CSV-only reader options (header / quote / escape)
# that used to be set here had no effect and have been dropped.
df_rec = spark.read.parquet(
    f"s3://{BUCKET_NAME}/recurringdonation/MSF_RecurringDonation.parquet"
)




##### Renombramos el id como en los casos anteriores

In [5]:
# Table-specific name for the recurring-donation key, so no bare `id` column
# survives into the joins.
df_rec = df_rec.withColumnRenamed(existing="id", new="rec_id")




##### Cargamos en un dataframe de Spark la tabla de modificaciones de cuota por parte de los socios

In [7]:
# Load the quota-modifications table from S3.
# The file is Parquet, so the CSV-only reader options (header / quote / escape)
# that used to be set here had no effect and have been dropped.
df_quo = spark.read.parquet(
    f"s3://{BUCKET_NAME}/quotamodification/MSF_QuotaModification.parquet"
)




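##### Renombramos el id (y las columnas msf_leadsource1__c y msf_annualizedquota__c) como en los casos anteriores, para evitar ambigüedades en los joins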

In [8]:
# Suffix the quota-modification key and the two columns renamed here with
# `quo` so they remain distinguishable after joining.
df_quo = (
    df_quo
    .withColumnRenamed("id", "quo_id")
    .withColumnRenamed("msf_leadsource1__c", "msf_leadsource1__c_quo")
    .withColumnRenamed("msf_annualizedquota__c", "msf_annualizedquota__c_quo")
)




In [9]:
# Build one snapshot of active recurring donations per year end (Dec 31) and
# stack the snapshots. Each row is flagged with whether a quota increase was
# recorded for that donation during the following calendar year.
start_year = 2015
end_year = 2022  # exclusive bound of range(): the last snapshot year is end_year - 1

# Year-independent part of the donation <-> quota-modification match.
increase_match = (
    (df_rec.npe03__contact__c == df_quo.msf_contactid__c)
    & (df_rec.rec_id == df_quo.msf_recurringdonation__c)
    & (df_quo.msf_changetype__c == 'Increase')
)

all_years_df = None
for yr in range(start_year, end_year):
    target_date = f"{yr}-12-31"
    next_year = yr + 1

    # Not cancelled as of the snapshot date: no cancellation date at all, or
    # one that falls after the snapshot.
    still_active = (
        col("msf_cancelationdate__c").isNull()
        | (col("msf_cancelationdate__c") > target_date)
    )

    # Left join: donations without a next-year increase are kept with null
    # quota-modification columns. A donation that matches several increases
    # yields one row per matching modification.
    df_yearly = (
        df_rec
        .filter((col("npe03__date_established__c") <= target_date) & still_active)
        .join(df_quo, increase_match & (year(df_quo.msf_changedate__c) == next_year), 'left')
        .withColumn(
            "has_increment_next_year",
            when(col("msf_changeamount__c").isNotNull() & (col("msf_changeamount__c") > 0), 1).otherwise(0),
        )
        # Tag every row with the snapshot year and snapshot date it belongs to.
        .withColumn("start_year", lit(yr))
        .withColumn("target_date", lit(target_date))
    )

    # union() pairs columns by position; every snapshot is built by the same
    # steps and therefore has the same column layout.
    all_years_df = df_yearly if all_years_df is None else all_years_df.union(df_yearly)




In [10]:
# Spot check: print the yearly snapshot rows of one sample contact.
all_years_df.filter(col("npe03__contact__c") == "0033Y00002unQGTQA2").show()

+------------------+---------+----------------------+----------------------+------------------------+----------------------+-------------------------+-------------------------------+------------------+---------------+----------------+------------------+--------------------------+----------------------------+---------------------------+---------------------------+---------------------------+---------------------+-------------------------------------+---------------------------------+---------------------------+------------------+---------+--------------------+------------------------+------------------------+-------------------+----------------------------+-----------------+----------------------+------------------+------------------+----------------+-------------------------+-------------------------+------------------+-----------------+-----------------------+----------+-----------+
|            rec_id|isdeleted|msf_annualizedquota__c|msf_cancelationdate__c|msf_cancelationreason__c|ms

In [12]:
# Attach the contact attributes to every snapshot row. Inner join: snapshot
# rows whose contact id has no match in df_con are dropped.
all_years_with_contacts = all_years_df.join(
    df_con,
    on=all_years_df.npe03__contact__c == df_con.con_id,
    how='inner',
)




In [1]:
#all_years_with_contacts.where(col("npe03__contact__c") == "0033Y00002unQGTQA2").show()

In [None]:
# Keep one full row per distinct birth-year value and print the result, to
# eyeball which birth-year values occur in the data.
distinct_df = all_years_with_contacts.drop_duplicates(['msf_birthyear__c'])
distinct_df.show()

Execution Interrupted. Attempting to cancel the statement (statement_id=15)
Statement 15 has been cancelled


In [1]:
#all_years_with_contacts.printSchema()

In [16]:
# Mean birth year over rows with a non-empty birth year, rounded to a whole year.
# The import cell binds the name `round` to pyspark.sql.functions.round, which
# only accepts Columns; the collected mean is a plain Python float, so the
# Python built-in is called explicitly through `builtins`.
avg_birthyear = builtins.round(
    all_years_with_contacts
    .filter(col("msf_birthyear__c") != '')
    .agg(mean("msf_birthyear__c"))
    .collect()[0][0],
    0,
)
avg_birthyear

1964.0


In [17]:
# Impute missing birth years: rows whose birth year is the empty string take
# the mean birth year computed earlier (avg_birthyear).
# NOTE(review): a null birth year does not satisfy `== ''`, so nulls pass
# through unchanged — confirm whether they should be imputed as well.
birthyear_imputed = (
    when(col("msf_birthyear__c") == '', avg_birthyear)
    .otherwise(col("msf_birthyear__c"))
)

# "age" is (snapshot year - birth year) * 12, i.e. it is expressed in months.
all_years_with_contacts = (
    all_years_with_contacts
    .withColumn("msf_birthyear__c", birthyear_imputed)
    .withColumn("age", (col("start_year") - col("msf_birthyear__c")) * 12)
)




In [26]:
# Donation tenure in whole months, measured at each row's own snapshot date.
# The per-row "target_date" column is used rather than the Python variable
# `target_date` left over from the snapshot loop: that variable only holds the
# last iteration's date, which would be applied to the rows of every year.
all_years_with_contacts = all_years_with_contacts.withColumn(
    "age_donation",
    round(
        months_between(to_date(col("target_date")), col("npe03__date_established__c")),
        0,
    ),
)




In [27]:
# Project the joined frame down to the columns written out as the model
# training set: contact attributes, donation attributes, the derived ages,
# the snapshot year and the has_increment_next_year label.
# NOTE(review): msf_changedate__c is filled by the left join against the
# next-year increases, so it is non-null only when an increase matched —
# confirm it (and msf_changeannualizedquota__c) is not used as a model input,
# since it would leak the label.
m2_train = all_years_with_contacts.select(
    'con_id',
    'gender__c',
    'msf_ltvcont__c',
    'msf_pressurecomplaint__c',
    'msf_scoringrfvrecurringdonor__c',
    'msf_averagedonorvalue__c',
    'age',
    'msf_annualizedquota__c',
    'msf_leadsource1__c',
    'msf_currentleadsource1__c',
    'npe03__installment_period__c',
    'npe03__recurring_donation_campaign__c',
    'age_donation',
    'msf_changeannualizedquota__c',
    'msf_changedate__c',
    'start_year',
    'has_increment_next_year',
)




In [32]:
# Spot check: print the training rows of one sample contact.
m2_train.filter(col("con_id") == "0033Y00002unQGTQA2").show()

+------------------+---------+--------------+------------------------+-------------------------------+------------------------+----+----------------------+------------------+-------------------------+----------------------------+-------------------------------------+------------+----------------------------+-----------------+----------+-----------------------+
|            con_id|gender__c|msf_ltvcont__c|msf_pressurecomplaint__c|msf_scoringrfvrecurringdonor__c|msf_averagedonorvalue__c| age|msf_annualizedquota__c|msf_leadsource1__c|msf_currentleadsource1__c|npe03__installment_period__c|npe03__recurring_donation_campaign__c|age_donation|msf_changeannualizedquota__c|msf_changedate__c|start_year|has_increment_next_year|
+------------------+---------+--------------+------------------------+-------------------------------+------------------------+----+----------------------+------------------+-------------------------+----------------------------+-------------------------------------+-------

In [28]:
# Persist the training set to S3 as Parquet, replacing any previous output.
# coalesce(1) collapses the frame to a single partition before writing.
m2_train.coalesce(1).write.mode("overwrite").parquet(
    f"s3://{BUCKET_NAME}/z_output/m2_features"
)


