# Imports

In [1]:
from math import isnan

import numpy as np
from ds_aws_services.athena_api  import AthenaApi
# from ds_aws_services   import CachedAthenaApi
import pandas as pd
# from datetime import datetime, timedelta

# Sofi

In [2]:
# V1
def query_sofi_func() -> pd.DataFrame:  #run_id_prt:str
    query_sofi = f"""
With subid_data_from_enrichment_agg AS (SELECT SUBID,
                                               partner_id,
                                               company,
--                                                min(transaction_date) as min_date_prt,
--                                               max(transaction_date) as max_date_prt,
--                                                transaction_month_prt,
                                               sum(cast(payout as double))       as total_payout,
                                               SUM(CASE
                                                       WHEN (CAMPAIGN_NAME = 'First Deposit' or event_type = 'First Deposit')
                                                           THEN SALE_AMOUNT END) AS first_deposit_amount,
                                               COUNT(CASE
                                                         WHEN (CAMPAIGN_NAME = 'First Deposit' or event_type = 'First Deposit')
                                                             THEN SUBID END)     AS first_deposit_counts,
                                               COUNT(CASE
                                                         WHEN (CAMPAIGN_NAME = 'Bank Account Created' or
                                                               event_type = 'Bank Account Created') AND
                                                              (ordered = 'checking' or CATEGORY = 'checking')
                                                             THEN subid END)     AS accounts_opened
                                        FROM dlk_visitor_funnel_dwh_production.enrich_conversions_flatten
                                        where transaction_month_prt >= '2025-01'
                                          And vertical_id = '60cb9343d2882671782f4a63'
                                          AND deal_type = 'Sale'
                                          and item_name like '%SoFi%'
                                          AND (AD LIKE '%2053605%' OR AD LIKE '%1869324%' OR
                                               AD LIKE '%1786733%' OR AD LIKE '%1786738%' OR
                                               AD LIKE '%2225300%' or ad LIKE '%1270333%' OR
                                               ad LIKE '%2038521%' OR AD LIKE '%2038520%')
                                          and rn = 1
                                        group by 1, 2, 3),
     subid_to_visitid AS (select distinct visit_iid, channel_click_id, cid
                          from dlk_visitor_funnel_dwh_production.chart_funnel
                          where vertical_id = '60cb9343d2882671782f4a63'
                            and product_id = 13909
                            and clickout_date >= date '2025-01-01'),
     enrichmen_final AS (select S.SUBID,
                                S.partner_id       as e_partner_id,
                                S.company          as e_company,
                                S.total_payout,
                                S.first_deposit_amount,
                                S.first_deposit_counts,
                                S.accounts_opened,
                                v.visit_iid        as e_visit_iid,
                                v.channel_click_id as e_channel_click_id
                         from subid_data_from_enrichment_agg S
                                  LEFT join subid_to_visitid v on S.SUBID = v.cid),
     prediction_data AS (SELECT channel_click_id,
                                visit_iid,
                                min(clickout_timestamp)      AS clickout_timestamp,
                                Max(avg_conversion_lag_lead) AS avg_conversion_lag_lead,
                                max(normalized_p_cr_lead)    AS normalized_p_cr_lead,
                                sum(normalized_p_cr_lead)    AS normalized_p_cr_lead_sum,
                                max(p_cr_lead)               AS p_cr_lead,
                                max(p_conversion_count_lead) AS p_conversion_count_lead,
                                Max(p_conversion_time_lead)  AS p_conversion_time_lead,
                                Max(avg_conversion_lag_ql)   AS avg_conversion_lag_ql,
                                max(normalized_p_cr_ql)      AS normalized_p_cr_ql,
                                max(p_conversion_count_ql)   AS p_conversion_count_ql,
                                Max(p_conversion_time_ql)    AS p_conversion_time_ql,
                                Max(avg_conversion_lag_sale) AS avg_conversion_lag_sale,
                                max(normalized_p_cr_sale)    AS normalized_p_cr_sale,
                                sum(normalized_p_cr_sale)    AS normalized_p_cr_sale_sum,
                                max(p_cr_sale)               as p_cr_sale,
                                max(p_conversion_count_sale) AS p_conversion_count_sale,
                                min(p_conversion_time_sale)  AS p_conversion_time_sale,
                                max(p_conversion_count)      AS p_conversion_count,
                                max(p_conversion_value)      AS p_conversion_value,
                                Max(model_version)           AS model_version,
                                Max(model_id)                AS model_id,
                                Max(model_run_id)            AS model_run_id,
                                Max(predicted_commission)    AS predicted_commission,
                                Max(run_id_prt)              AS run_id_prt,
                                Max(gclid)                   AS gclid,
                                Max(clickout_position)       AS clickout_position,
                                Max(pli_vertical_id)         AS pli_vertical_id,
                                Max(pli_vertical_name)       AS pli_vertical_name,
                                Max(ad_group_id)             AS ad_group_id,
                                Max(ad_group_name)           AS ad_group_name,
                                Max(agent_browser)           AS agent_browser,
                                Max(agent_os)                AS agent_os,
                                Max(agent_platform)          AS agent_platform,
                                Max(bidded_keyword)          AS bidded_keyword,
                                Max(campaign_id)             AS campaign_id,
                                Max(campaign_name)           AS campaign_name,
                                Max(channel_country_code)    AS channel_country_code,
                                Max(channel_region_code)     AS channel_region_code,
                                Max(landing_page_uri)        AS landing_page_uri,
                                Max(match_type)              AS match_type,
                                Max(pli_segment_id)          AS pli_segment_id,
                                Max(pli_segment_name)        AS pli_segment_name,
                                Max(product_id)              AS product_id,
                                Max(product_name)            AS product_name,
                                Max(source_join)             AS source_join,
                                Max(topic)                   AS topic,
                                Max(utm_source)              AS utm_source,
                                Max(visit_timestamp)         AS visit_timestamp,
                                max(estimated_earnings_usd)  AS estimated_earnings_usd,
                                max(estimated_conversions)   AS estimated_conversions,
                                max(conversion_count)        AS conversion_count,
                                Max(user_id)                 AS user_id,
                                max(leads_count)             AS leads_count,
                                max(qualified_leads_count)   AS qualified_leads_count,
                                max(sales_count)             AS sales_count,
                                Max(bucket_group)            AS bucket_group,
                                Max(ppc_account_name)        AS ppc_account_name,
                                Max(vertical_id)             AS vertical_id,
                                Max(traffic_source_name)     AS traffic_source_name,
                                Max(company)                 AS company,
                                Max(traffic_join)            AS traffic_join,
                                Max(ppc_account_id)          AS ppc_account_id,
                                Max(country_code)            AS country_code,
                                Max(ip_region_code)          AS ip_region_code,
                                Max(channel_click_id_source) AS channel_click_id_source,
                                Max(site_id)                 AS site_id,
                                Max(site_name)               AS site_name,
                                Max(partner_id)              AS partner_id,
                                Max(partner_name)            AS partner_name,
                                Max(out_type)                AS out_type,
                                Max(page_type_name)          AS page_type_name,
                                min(clickout_date_prt)       AS clickout_date_prt,
                                min(conversion_month_prt)    AS conversion_month_prt
                         FROM dlk_mlmodels_production.v_multilabel_conversions_predictions_fast_longer
                         WHERE product_id = 13909 --OR  product_id= 14162)
                           AND vertical_id = '60cb9343d2882671782f4a63'
                           AND clickout_date_prt >= '01/01/2025'
                         group by channel_click_id, visit_iid
                         )
select *
from prediction_data as p
         left join enrichmen_final as en
                   on (p.channel_click_id = en.e_channel_click_id and p.visit_iid = en.e_visit_iid)
-- limit 100

    """
    raw = AthenaApi().execute_fetch(query_sofi)
    df_enrich_sofi = pd.DataFrame(raw)
    return df_enrich_sofi

df_enrich_sofi = query_sofi_func()
print(df_enrich_sofi.head())

fetching manifest from s3://aws-athena-query-results-925511037392-us-east-1/Unsaved/2025/07/24/3dfe147f-8a5a-4c8f-a095-e8b51d2d4d04-manifest.csv
INFO - the function _execute_unload was executed in 63.41655 seconds
INFO - returning system cpu count
INFO - the function _execute_fetch was executed in 69.06321 seconds
                                    channel_click_id             visit_iid  \
0                   a467cb3d7f6e15ea2c5aacd450b08252  ZLGygKqLMeEYES5776sk   
1  Cj0KCQjwgIXCBhDBARIsAELC9ZiSB6wVdGyenNe01PNAv7...  NHbUTBEqNgNpoq078tbw   
2  EAIaIQobChMI3ODe5KiKjQMVVxqtBh24FxKOEAAYAyAAEg...  FQLU31R5zgZER0586jbj   
3  EAIaIQobChMIh6jht_yajgMV-wStBh0wPguDEAAYASAAEg...  ZoN5pjRjGvD6IZ265bvc   
4                   7b976ded29f81f85f390058463c602b3  RNKwtVSUi9CdeR320l57   

   clickout_timestamp  avg_conversion_lag_lead  normalized_p_cr_lead  \
0 2025-03-30 11:56:17                    14.46              0.018092   
1 2025-06-05 22:52:10                     7.10              0.026315   

In [3]:
sofi_only_sales = df_enrich_sofi.loc[df_enrich_sofi['sales_count'] > 0]

In [4]:
sofi_only_sales[['visit_iid','subid', 'sales_count','total_payout','first_deposit_amount','first_deposit_counts', 'accounts_opened','conversion_count','leads_count','normalized_p_cr_lead','normalized_p_cr_sale']]


Unnamed: 0,visit_iid,subid,sales_count,total_payout,first_deposit_amount,first_deposit_counts,accounts_opened,conversion_count,leads_count,normalized_p_cr_lead,normalized_p_cr_sale
3,ZoN5pjRjGvD6IZ265bvc,ewJ5wC0Ne1,1.0,1350.0,200.00,1.0,1.0,1.0,1.0,0.048266,0.030055
51,tcYqNm0VssZ7k0716sdk,NQLGCgQE7w,1.0,600.0,,0.0,1.0,1.0,1.0,0.057101,0.031022
87,snSbgS2MdTvWnQ556krp,26EyLHFSZj,1.0,825.0,6000.00,1.0,1.0,1.0,1.0,0.074181,0.036448
107,B0fin8VogPTQhI4874ms,chn4g7TgtM,1.0,600.0,10.00,1.0,1.0,1.0,1.0,0.079808,0.007764
121,P2f9bQKljod1IM678bfh,5mN5T0hKjN,1.0,825.0,14000.00,1.0,1.0,1.0,1.0,0.097972,0.026655
...,...,...,...,...,...,...,...,...,...,...,...
283632,3CpMild6QF8Eev911rjz,DOvEl0bbQc,1.0,1350.0,1000.00,1.0,1.0,1.0,1.0,0.013132,0.007393
283675,ygZZUYBBFAbjKy103vlx,hyMjSfjbgX,1.0,1350.0,1543.87,1.0,1.0,1.0,1.0,0.071822,0.051635
283677,8b2s5ZoSvhEE8N73765l,Ot0jZ5eE1d,1.0,1350.0,30000.00,1.0,1.0,1.0,1.0,0.032429,0.017248
283691,B8iWD7EkIPwLD8856q8j,ZzxxpH1zDx,1.0,600.0,10.00,1.0,1.0,1.0,1.0,0.073497,0.013812


some sales without conversion - visit_iid = 'tvfj69l6X18Osu240hc9'
some sales without lead - '3EXyi3IDCDWI80797fpb' . they are also without payout



In [5]:
df_enrich_sofi.loc[df_enrich_sofi['visit_iid'] == 'MH79ITQh4TjyWy384qg2'].head()

Unnamed: 0,channel_click_id,visit_iid,clickout_timestamp,avg_conversion_lag_lead,normalized_p_cr_lead,normalized_p_cr_lead_sum,p_cr_lead,p_conversion_count_lead,p_conversion_time_lead,avg_conversion_lag_ql,...,conversion_month_prt,subid,e_partner_id,e_company,total_payout,first_deposit_amount,first_deposit_counts,accounts_opened,e_visit_iid,e_channel_click_id
156028,CjwKCAiAqfe8BhBwEiwAsne6gaFcyL9Rz7V-ADIpHwNzgm...,MH79ITQh4TjyWy384qg2,2025-02-01 22:07:33,12.39,0.095499,0.190997,0.121357,0,2025-02-02 10:34:35,5.78,...,2025-02,jpdNPVeYva,10468.0,ni,825.0,2000.0,1.0,1.0,MH79ITQh4TjyWy384qg2,CjwKCAiAqfe8BhBwEiwAsne6gaFcyL9Rz7V-ADIpHwNzgm...


In [6]:
df_enrich_sofi.columns
# df_enrich_sofi.describe()

Index(['channel_click_id', 'visit_iid', 'clickout_timestamp',
       'avg_conversion_lag_lead', 'normalized_p_cr_lead',
       'normalized_p_cr_lead_sum', 'p_cr_lead', 'p_conversion_count_lead',
       'p_conversion_time_lead', 'avg_conversion_lag_ql', 'normalized_p_cr_ql',
       'p_conversion_count_ql', 'p_conversion_time_ql',
       'avg_conversion_lag_sale', 'normalized_p_cr_sale',
       'normalized_p_cr_sale_sum', 'p_cr_sale', 'p_conversion_count_sale',
       'p_conversion_time_sale', 'p_conversion_count', 'p_conversion_value',
       'model_version', 'model_id', 'model_run_id', 'predicted_commission',
       'run_id_prt', 'gclid', 'clickout_position', 'pli_vertical_id',
       'pli_vertical_name', 'ad_group_id', 'ad_group_name', 'agent_browser',
       'agent_os', 'agent_platform', 'bidded_keyword', 'campaign_id',
       'campaign_name', 'channel_country_code', 'channel_region_code',
       'landing_page_uri', 'match_type', 'pli_segment_id', 'pli_segment_name',
       'product_

In [7]:
df_enrich_sofi.isnull().sum().sort_values(ascending=False)

partner_id                  283731
partner_name                283731
first_deposit_amount        275088
estimated_conversions       267909
estimated_earnings_usd      267909
                             ...  
p_conversion_count_sale          0
p_cr_sale                        0
normalized_p_cr_sale_sum         0
run_id_prt                       0
channel_click_id                 0
Length: 84, dtype: int64

In [8]:
df_enrich_sofi.loc[df_enrich_sofi['ppc_account_name'].isnull()].head()

Unnamed: 0,channel_click_id,visit_iid,clickout_timestamp,avg_conversion_lag_lead,normalized_p_cr_lead,normalized_p_cr_lead_sum,p_cr_lead,p_conversion_count_lead,p_conversion_time_lead,avg_conversion_lag_ql,...,conversion_month_prt,subid,e_partner_id,e_company,total_payout,first_deposit_amount,first_deposit_counts,accounts_opened,e_visit_iid,e_channel_click_id
0,a467cb3d7f6e15ea2c5aacd450b08252,ZLGygKqLMeEYES5776sk,2025-03-30 11:56:17,14.46,0.018092,0.018092,0.018092,0,2025-03-31 02:23:53,34.57,...,,,,,,,,,,
4,7b976ded29f81f85f390058463c602b3,RNKwtVSUi9CdeR320l57,2025-07-05 15:54:58,12.64,0.040406,0.040406,0.040406,0,2025-07-06 04:33:22,24.07,...,,,,,,,,,,
6,dd2028ba3ea51d2b79d240e60f0f1067,4gZoy75Azeubtl29026v,2025-06-25 17:32:04,10.87,0.018191,0.018191,0.018191,0,2025-06-26 04:24:16,13.96,...,,,,,,,,,,
13,f22bc0f1142f1c58591ea887a4b45e88,hX0mn3ltJYPRGT823fck,2025-07-15 19:49:04,17.61,0.024197,0.024197,0.024197,0,2025-07-16 13:25:40,146.73,...,,,,,,,,,,
19,a49ad03decf81b5104aab8716bf35ba0,ocXBfCBElBoCoN204l6j,2025-06-13 20:18:06,21.09,0.031905,0.031905,0.031905,0,2025-06-14 17:23:30,168.0,...,,,,,,,,,,


In [9]:
df_enrich_sofi['out_type'].value_counts()

out_type
click    283731
Name: count, dtype: int64

In [10]:
# Ensure 'clickout_timestamp' is datetime
df_enrich_sofi['clickout_timestamp'] = pd.to_datetime(df_enrich_sofi['clickout_timestamp'])

# Extract month
df_enrich_sofi['click_month'] = df_enrich_sofi['clickout_timestamp'].dt.to_period('M')

# Aggregate by month
df_enrich_sofi.groupby('click_month')[['estimated_earnings_usd', 'sales_count', 'normalized_p_cr_sale', 'normalized_p_cr_sale_sum','p_cr_sale','leads_count','normalized_p_cr_lead']].sum()
# ֿ

Unnamed: 0_level_0,estimated_earnings_usd,sales_count,normalized_p_cr_sale,normalized_p_cr_sale_sum,p_cr_sale,leads_count,normalized_p_cr_lead
click_month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2025-01,2474.511,3.0,3.235876,3.406749,3.241531,8.0,6.344131
2025-02,456511.8,340.0,357.433726,374.415402,371.209445,978.0,1083.050281
2025-03,927433.9,698.0,756.54804,782.557035,790.154924,2071.0,1775.165622
2025-04,1075661.0,849.0,719.295602,750.12699,744.290654,2190.0,1794.716778
2025-05,1848801.0,1712.0,912.185505,955.514284,944.656166,2904.0,2903.995293
2025-06,2724750.0,2409.0,1834.389674,1924.485816,1894.390936,4336.0,3952.815128
2025-07,1923900.0,1543.0,1763.094137,1831.755758,1819.156888,3200.0,3146.889354


In [11]:
df_enrich_sofi['leads_count'] = df_enrich_sofi['leads_count'].fillna(0)
df_enrich_sofi['sales_count'] = df_enrich_sofi['sales_count'].fillna(0)


df_enrich_sofi['sales_diff'] = df_enrich_sofi['sales_count'] - df_enrich_sofi['normalized_p_cr_sale']
df_enrich_sofi[['sales_count','normalized_p_cr_sale','sales_diff']]
rmse = np.sqrt((df_enrich_sofi['sales_diff'] ** 2).mean())
print(f'Overall RMSE: {rmse:.2f}')

Overall RMSE: 0.16


In [12]:
# Biz2Credit

# Bi2Credit

In [53]:
def query_biz_func() -> pd.DataFrame:  #run_id_prt:str
    query_biz = f"""
-- biz2credit bi - leads process 22/07
WITH subid_data_from_enrichment_agg AS (
    /* --- Process #2: bi_biz2credit_sale --- */
    SELECT process_name,
           subid                             AS subid,                      -- H “Click Id Update”
           partner_id,
           company,
           CAST(transaction_date AS VARCHAR) AS application_date,           -- A “Lead RegistrationDate”
           category                          AS subsource_name,             -- D “Subsource Name”
           lead_source                       AS lead_id,                    -- I “Lead ID”
           contact_source                    AS track_name,                 -- E “Track Name”
           product_id                        AS app_id,                     -- K “App Id”
           item_name                         AS channel_name,               -- B “Channel Name”
        /* sale dates & states */
           CAST(transaction_date AS VARCHAR) AS registration_date,
           customer_status                   AS business_legal_structure,
           age_group                         AS age_of_business_months,     -- N “Age of Business (Months)”
           net_revenue                       AS application_annual_revenue, -- P “Application Annual Revenue”
           -- New Avocado calc for commissions
           CASE
               WHEN age_group >= 18
                   AND net_revenue >= 120000 THEN 492
               WHEN age_group >= 18
                   AND net_revenue < 120000 THEN 32
               WHEN age_group BETWEEN 12 AND 17
                   AND net_revenue >= 120000 THEN 28
               WHEN age_group BETWEEN 12 AND 17
                   AND net_revenue < 120000 THEN 1
               ELSE 0
               END                           AS commission

    FROM dlk_visitor_funnel_dwh_production.enrich_conversions_flatten
    WHERE partner_id = 13589
      AND process_name in ('bi_biz2credit_lead') -- bi_biz2credit_sale
      and transaction_month_prt >= '2025-05'
      and rn = 1),
subid_to_visitid AS (select distinct visit_iid, channel_click_id, cid
                          from dlk_visitor_funnel_dwh_production.chart_funnel
                          where vertical_id = '64e33e7be3cbc4ce1041a30f'
                            and product_id = 13465
                            and clickout_date >= date '2025-05-01'),
     enrichmen_final as (select s.*, v.visit_iid, v.channel_click_id
                         from subid_data_from_enrichment_agg S
                                  LEFT join subid_to_visitid v on S.SUBID = v.cid),
     prediction_data AS (SELECT channel_click_id,
                                visit_iid,
                                cid,
                                min(clickout_timestamp)      AS clickout_timestamp,
                                Max(avg_conversion_lag_lead) AS avg_conversion_lag_lead,
                                max(normalized_p_cr_lead)    AS normalized_p_cr_lead,
                                sum(normalized_p_cr_lead)    AS normalized_p_cr_lead_sum,
                                max(p_cr_lead)               AS p_cr_lead,
                                max(p_conversion_count_lead) AS p_conversion_count_lead,
                                Max(p_conversion_time_lead)  AS p_conversion_time_lead,
                                Max(avg_conversion_lag_ql)   AS avg_conversion_lag_ql,
                                max(normalized_p_cr_ql)      AS normalized_p_cr_ql,
                                max(p_conversion_count_ql)   AS p_conversion_count_ql,
                                Max(p_conversion_time_ql)    AS p_conversion_time_ql,
                                Max(avg_conversion_lag_sale) AS avg_conversion_lag_sale,
                                max(normalized_p_cr_sale)    AS normalized_p_cr_sale,
                                sum(normalized_p_cr_sale)    AS normalized_p_cr_sale_sum,
                                max(p_cr_sale)               as p_cr_sale,
                                max(p_conversion_count_sale) AS p_conversion_count_sale,
                                min(p_conversion_time_sale)  AS p_conversion_time_sale,
                                max(p_conversion_count)      AS p_conversion_count,
                                max(p_conversion_value)      AS p_conversion_value,
                                Max(model_version)           AS model_version,
                                Max(model_id)                AS model_id,
                                Max(model_run_id)            AS model_run_id,
                                Max(predicted_commission)    AS predicted_commission,
                                Max(run_id_prt)              AS run_id_prt,
                                Max(gclid)                   AS gclid,
                                Max(clickout_position)       AS clickout_position,
                                Max(pli_vertical_id)         AS pli_vertical_id,
                                Max(pli_vertical_name)       AS pli_vertical_name,
                                Max(ad_group_id)             AS ad_group_id,
                                Max(ad_group_name)           AS ad_group_name,
                                Max(agent_browser)           AS agent_browser,
                                Max(agent_os)                AS agent_os,
                                Max(agent_platform)          AS agent_platform,
                                Max(bidded_keyword)          AS bidded_keyword,
                                Max(campaign_id)             AS campaign_id,
                                Max(campaign_name)           AS campaign_name,
                                Max(channel_country_code)    AS channel_country_code,
                                Max(channel_region_code)     AS channel_region_code,
                                Max(landing_page_uri)        AS landing_page_uri,
                                Max(match_type)              AS match_type,
                                Max(pli_segment_id)          AS pli_segment_id,
                                Max(pli_segment_name)        AS pli_segment_name,
                                Max(product_id)              AS product_id,
                                Max(product_name)            AS product_name,
                                Max(source_join)             AS source_join,
                                Max(topic)                   AS topic,
                                Max(utm_source)              AS utm_source,
                                Max(visit_timestamp)         AS visit_timestamp,
                                max(estimated_earnings_usd)  AS estimated_earnings_usd,
                                max(estimated_conversions)   AS estimated_conversions,
                                max(conversion_count)        AS conversion_count,
                                Max(user_id)                 AS user_id,
                                max(leads_count)             AS leads_count,
                                max(qualified_leads_count)   AS qualified_leads_count,
                                max(sales_count)             AS sales_count,
                                Max(bucket_group)            AS bucket_group,
                                Max(ppc_account_name)        AS ppc_account_name,
                                Max(vertical_id)             AS vertical_id,
                                Max(traffic_source_name)     AS traffic_source_name,
                                Max(company)                 AS company,
                                Max(traffic_join)            AS traffic_join,
                                Max(ppc_account_id)          AS ppc_account_id,
                                Max(country_code)            AS country_code,
                                Max(ip_region_code)          AS ip_region_code,
                                Max(channel_click_id_source) AS channel_click_id_source,
                                Max(site_id)                 AS site_id,
                                Max(site_name)               AS site_name,
                                Max(partner_id)              AS partner_id,
                                Max(partner_name)            AS partner_name,
                                Max(out_type)                AS out_type,
                                Max(page_type_name)          AS page_type_name,
                                min(clickout_date_prt)       AS clickout_date_prt,
                                min(conversion_month_prt)    AS conversion_month_prt
                         FROM dlk_mlmodels_production.v_multilabel_conversions_predictions_fast_longer
                         WHERE product_id = 13465
                           AND vertical_id = '64e33e7be3cbc4ce1041a30f'
--                            AND clickout_date_prt >= '01/05/2025'
                           and clickout_date_prt >= '2025-05-01'
                         group by channel_click_id, visit_iid,cid)
select p.*, en.process_name,
            en.subid,
            application_date,
            subsource_name,
            en.lead_id,
            track_name,
            app_id,   -- not important
            channel_name,
            registration_date,
           age_of_business_months,
           application_annual_revenue,
           en.commission
from prediction_data as p
inner join enrichmen_final as en
on (p.channel_click_id = en.channel_click_id and p.visit_iid = en.visit_iid and p.cid = en.subid)
where p.leads_count >= 1

    """
    raw = AthenaApi().execute_fetch(query_biz)
    df_biz_enrich = pd.DataFrame(raw)
    return df_biz_enrich

df_biz_enrich = query_biz_func()
df_biz_enrich.head()

fetching manifest from s3://aws-athena-query-results-925511037392-us-east-1/Unsaved/2025/07/24/cd51d97e-3dd7-4fa2-a888-0af79afa7228-manifest.csv
INFO - the function _execute_unload was executed in 45.1663 seconds
INFO - returning system cpu count
INFO - the function _execute_fetch was executed in 46.67315 seconds


Unnamed: 0,channel_click_id,visit_iid,cid,clickout_timestamp,avg_conversion_lag_lead,normalized_p_cr_lead,normalized_p_cr_lead_sum,p_cr_lead,p_conversion_count_lead,p_conversion_time_lead,...,application_date,subsource_name,lead_id,track_name,app_id,channel_name,registration_date,age_of_business_months,application_annual_revenue,commission
0,CjwKCAjw6s7CBhACEiwAuHQckq3acb-MEevKYtLf6eiaSt...,SBwbRHA2lwVS15vSqWwS,zvdR8I8Oj6,2025-06-19 23:21:39,1.36,0.294954,0.589909,0.294954,0,2025-06-20 00:43:15,...,2025-06-19T00:00:00+0300,Funding Hero,2273384.0,Best Biz Loan Quick Apply,1835136,Referral,2025-06-19T00:00:00+0300,5.0,2.0,0
1,CjwKCAjwg7PDBhBxEiwAf1CVu3kKif1AS9kvCABF2GUynG...,3lu1nKYKiXvbzpIEbHIZ,aTvdigAmdF,2025-07-08 21:08:21,1.05,0.221853,0.88741,0.221853,0,2025-07-08 22:11:21,...,2025-07-08T00:00:00+0300,Funding Hero,2285064.0,Best Biz Loan Quick Apply,1844995,Referral,2025-07-08T00:00:00+0300,69.0,269000.0,492
2,Cj0KCQjwss3DBhC3ARIsALdgYxM4dyxfAkt1yiQpKUDm1T...,Cf8dMqID1FYG5MevYvfI,gHePSjnWyb,2025-07-13 21:18:52,1.04,0.288248,1.152991,0.288248,0,2025-07-13 22:21:16,...,2025-07-13T00:00:00+0300,Funding Hero,2287710.0,Best Biz Loan Quick Apply,1847453,Referral,2025-07-13T00:00:00+0300,30.0,175000.0,492
3,CjwKCAjw6NrBBhB6EiwAvnT_rnX-MQ9UNj5_8AbSQjqYcJ...,M1o0nphlBcqP9j09fvco,jYNUQc7iVj,2025-05-29 02:44:51,10.01,0.169757,0.679026,0.169757,1,2025-05-29 12:45:27,...,2025-05-28T00:00:00+0300,Funding Hero,2261918.0,Best Biz Loan Quick Apply,1824180,Referral,2025-05-28T00:00:00+0300,238.0,175000.0,492
4,Cj0KCQjwjo7DBhCrARIsACWauSm03Y78COXg5H5JaruIBU...,phOiHHeM05uDHEm3yU1L,cw2hJZTvxr,2025-07-02 03:46:58,1.6,0.226848,0.90739,0.226848,0,2025-07-02 05:22:58,...,2025-07-01T00:00:00+0300,Funding Hero,2280811.0,Best Biz Loan Quick Apply,1841548,Referral,2025-07-01T00:00:00+0300,59.0,375000.0,492


In [86]:
a1 = df_biz_enrich.groupby('subid').application_date.nunique()>1

In [87]:
a1 = a1.to_frame()

In [90]:
big_subids = a1[a1.application_date == True].index

In [92]:
len(big_subids)

181

In [95]:
df_biz_enrich[df_biz_enrich['subid'].isin(big_subids)].sales_count.sum() #.sales_count.sum()

180

In [94]:
df_biz_enrich[df_biz_enrich['process_name'].isnull()].sort_values(by = 'clickout_timestamp')

Unnamed: 0,channel_click_id,visit_iid,cid,clickout_timestamp,avg_conversion_lag_lead,normalized_p_cr_lead,normalized_p_cr_lead_sum,p_cr_lead,p_conversion_count_lead,p_conversion_time_lead,...,application_date,subsource_name,lead_id,track_name,app_id,channel_name,registration_date,age_of_business_months,application_annual_revenue,commission


In [36]:
df_biz_enrich.isnull().sum()/df_biz_enrich.count().round(3) #.sort_values(ascending=False)

channel_click_id              0.000000
visit_iid                     0.000000
clickout_timestamp            0.000000
avg_conversion_lag_lead       0.000000
normalized_p_cr_lead          0.000000
                                ...   
channel_name                  0.049123
registration_date             0.049123
age_of_business_months        0.049123
application_annual_revenue    0.240407
commission                    0.049123
Length: 87, dtype: float64

In [37]:
df_biz_enrich.describe()

Unnamed: 0,clickout_timestamp,avg_conversion_lag_lead,normalized_p_cr_lead,normalized_p_cr_lead_sum,p_cr_lead,p_conversion_count_lead,p_conversion_time_lead,avg_conversion_lag_ql,normalized_p_cr_ql,p_conversion_count_ql,...,conversion_count,leads_count,qualified_leads_count,sales_count,site_id,partner_id,app_id,age_of_business_months,application_annual_revenue,commission
count,5980,5980.0,5980.0,5980.0,5980.0,5980.0,5980,5980.0,5980.0,5980.0,...,5980.0,5980.0,5980.0,5980.0,5980.0,0.0,5700.0,5700.0,4821.0,5700.0
mean,2025-06-14 13:02:04.093311232,5.784171,0.20969,0.750838,0.213349,0.213712,2025-06-14 18:55:47.645295616,28.262416,0.057937,0.061371,...,1.0,1.0,0.309197,0.075585,10307.0,,1831984.0,60.91193,1856217.0,168.619474
min,2025-05-01 00:04:31,0.68,0.0,0.0,0.011108,0.0,2025-05-01 01:22:40,6.9,0.0,0.0,...,1.0,1.0,0.0,0.0,10307.0,,1809320.0,0.0,0.0,0.0
25%,2025-05-23 17:42:21.500000,1.08,0.166341,0.433335,0.168802,0.0,2025-05-23 22:32:30,12.96,0.027048,0.0,...,1.0,1.0,0.0,0.0,10307.0,,1821918.0,3.0,46667.0,0.0
50%,2025-06-16 19:31:39.500000,1.42,0.210602,0.642232,0.212412,0.0,2025-06-16 23:55:54.500000,17.12,0.041033,0.0,...,1.0,1.0,0.0,0.0,10307.0,,1832570.0,26.0,145000.0,32.0
75%,2025-07-06 02:55:03.249999872,6.7125,0.247392,0.890848,0.251089,0.0,2025-07-06 09:54:15,24.23,0.073296,0.0,...,1.0,1.0,1.0,0.0,10307.0,,1842318.0,70.0,375000.0,492.0
max,2025-07-24 07:00:56,91.23,0.595459,4.275271,0.595459,1.0,2025-07-25 08:49:53,168.0,0.431295,1.0,...,1.0,1.0,1.0,1.0,10307.0,,1851337.0,1265.0,4294967000.0,492.0
std,,12.309054,0.074636,0.525026,0.076557,0.40996,,35.3928,0.04836,0.24003,...,0.0,0.0,0.462201,0.264356,0.0,,11955.34,106.860658,63289780.0,225.608521


In [38]:
df_biz_enrich.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5980 entries, 0 to 5979
Data columns (total 87 columns):
 #   Column                      Non-Null Count  Dtype         
---  ------                      --------------  -----         
 0   channel_click_id            5980 non-null   object        
 1   visit_iid                   5980 non-null   object        
 2   clickout_timestamp          5980 non-null   datetime64[ns]
 3   avg_conversion_lag_lead     5980 non-null   float64       
 4   normalized_p_cr_lead        5980 non-null   float64       
 5   normalized_p_cr_lead_sum    5980 non-null   float64       
 6   p_cr_lead                   5980 non-null   float64       
 7   p_conversion_count_lead     5980 non-null   int64         
 8   p_conversion_time_lead      5980 non-null   datetime64[ns]
 9   avg_conversion_lag_ql       5980 non-null   float64       
 10  normalized_p_cr_ql          5980 non-null   float64       
 11  p_conversion_count_ql       5980 non-null   int64       

In [39]:
df_biz_enrich.shape

(5980, 87)

In [19]:
df_biz_enrich.age_of_business_months.describe()

count    5701.000000
mean       60.901772
std       106.854036
min         0.000000
25%         3.000000
50%        26.000000
75%        70.000000
max      1265.000000
Name: age_of_business_months, dtype: float64

In [20]:
df_biz_enrich.commission.value_counts()

commission
0.0      2193
492.0    1862
32.0     1267
1.0       227
28.0      152
Name: count, dtype: int64

In [21]:
df_biz_enrich.business_legal_structure.value_counts()

AttributeError: 'DataFrame' object has no attribute 'business_legal_structure'

In [56]:
df_biz_enrich.columns

Index(['process_name', 'subid', 'partner_id', 'company', 'application_date',
       'subsource_name', 'lead_id', 'track_name', 'app_id', 'channel_name',
       'registration_date', 'business_legal_structure',
       'age_of_business_months', 'application_annual_revenue', 'commission'],
      dtype='object')

In [57]:
df_biz_enrich.groupby('business_legal_structure')[['commission','application_annual_revenue']].sum()

Unnamed: 0_level_0,commission,application_annual_revenue
business_legal_structure,Unnamed: 1_level_1,Unnamed: 2_level_1
Corporation,59721,380763130.0
I don't Know,0,0.0
I just do not know,12132,42907274.0
Limited Liability Company,154133,405461541.0
Limited Partnership,1476,617000240.0
Non Profit Corp,5339,220952547.0
Partnership,4589,8280003.0
Sole Proprietorship,33326,48596467.0


In [53]:
df_biz_enrich.groupby('business_legal_structure')[['application_annual_revenue','age_of_business_months','application_annual_revenue']].mean()

Unnamed: 0_level_0,application_annual_revenue,age_of_business_months,application_annual_revenue
business_legal_structure,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Corporation,2572724.0,127.008086,2572724.0
I don't Know,,29.0,
I just do not know,335213.1,68.482394,335213.1
Limited Liability Company,522502.0,53.629728,522502.0
Limited Partnership,154250100.0,174.083333,154250100.0
Non Profit Corp,6312930.0,113.512821,6312930.0
Partnership,460000.2,122.228571,460000.2
Sole Proprietorship,197546.6,72.385274,197546.6
