# Imports

In [1]:
import os
from math import isnan
import numpy as np
import pandas as pd

# from ds_aws_services.athena_api  import cachedAthenaApi
from ds_aws_services import CachedAthenaApi
import pandas as pd
# from datetime import datetime, timedelta
import time

os.environ['disk_caching'] = 'true'

# Analysis

### Params:

In [None]:
# Biz2credit BI
partner_ids = [13589]
process_names = ["bi_biz2credit_lead"]
transaction_month_prt = "2025-01"
vertical_ids = ["64e33e7be3cbc4ce1041a30f"]
start_date = "2025-01-01"
end_date = "2025-09-06"
transaction_month_prt_start = "2025-01"
transaction_month_prt_end = "2025-10"

enrichment_cols = "age_of_business_months, application_annual_revenue, business_legal_structure, loan_purpose, industry, sub_industry, users_prob_sale, "
max_enrichment_cols = "MAX(age_of_business_months) AS age_of_business_months, MAX(application_annual_revenue) AS application_annual_revenue, max(business_legal_structure) as business_legal_structure, max(loan_purpose) as loan_purpose, max(industry) as industry, max(sub_industry) as sub_industry, max(users_prob_sale) as users_prob_sale "

In [44]:
# Rocket 
partner_ids = [3158,3178]
process_names = ["quicken_rocket_allocated"]
vertical_ids = ["5fa2b415c91a2010c3432900"]
start_date = "2025-01-01"
end_date = "2025-09-09"
transaction_month_prt = "2025-01"
# transaction_month_prt_start = "2025-08"
# transaction_month_prt_end = "2025-10"

enrich_cond = "AND loanpurpose = 'Refinance'"
enrichment_cols = "loanamount, creditscore, "
max_enrichment_cols = "MAX(f.loanamount)  AS loan_amount, MAX(f.creditscore) AS credit_score "

### Query 1 :  Checking RN

In [27]:
import pandas as pd

def get_data1(
    partner_ids,
    process_names,              # e.g. ['quicken_rocket_allocated','quicken_rocket_leads']
    transaction_month_prt,      # e.g. '2025-02' (or None to derive from start_date)
    vertical_ids,
    start_date,                 # e.g. '2025-08-01'
    end_date,                   # not used in this query, kept for params symmetry
    cond1_col=None,             # e.g. 'loanpurpose'
    cond1_val=None              # e.g. 'Refinance' or '%%' to skip
) -> pd.DataFrame:
    # --- Build WHERE for the CTE ---
    query = f"""
WITH enrichment_data AS (
    SELECT  f.subid,
            process_name,
            f.partner_name,
            f.rn,
            transaction_month_prt,
            company,
            MIN(f.rn) OVER (PARTITION BY subid) AS min_rn,
            {max_enrichment_cols}

    FROM dlk_visitor_funnel_dwh_production.enrich_conversions_flatten f
    WHERE f.partner_id in ({','.join(map(str, partner_ids))})
    AND process_name in ({','.join(f"'{p}'" for p in process_names)})
     AND transaction_month_prt >= '{transaction_month_prt}'
     AND f.vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})

    GROUP BY 1,2,3,4,5,6
)
SELECT
       rn,
       COUNT(DISTINCT subid) AS cids,
       COUNT(subid)          AS rowss
FROM enrichment_data
WHERE rn = min_rn
GROUP BY 1
ORDER BY 2 DESC
"""
    return pd.DataFrame(CachedAthenaApi().execute_fetch(query))

df1 = get_data1(
    partner_ids=partner_ids,
    process_names=process_names,
    transaction_month_prt=transaction_month_prt,
    vertical_ids=vertical_ids,
    start_date=start_date,
    end_date=end_date
)
df1.head()

2025-09-09 11:16:32,551 INFO [ds_logger.py:68] [Cached AthenaApi] Local disk-caching is ENABLED.
2025-09-09 11:16:32,551 INFO [ds_logger.py:68] [Cached AthenaApi] Using cached results for execute_fetch(args=("\nWITH enrichment_data AS (\n    SELECT  f.subid,\n            process_name,\n            f.partner_name,\n            f.rn,\n            transaction_month_prt,\n            company,\n            MIN(f.rn) OVER (PARTITION BY subid) AS min_rn,\n            MAX(f.loanamount)  AS loan_amount, MAX(f.creditscore) AS credit_score \n\n    FROM dlk_visitor_funnel_dwh_production.enrich_conversions_flatten f\n    WHERE f.partner_id in (3158,3178)\n    AND process_name in ('quicken_rocket_allocated')\n     AND transaction_month_prt >= '2025-01'\n     AND f.vertical_id in ('5fa2b415c91a2010c3432900')\n\n    GROUP BY 1,2,3,4,5,6\n)\nSELECT\n       rn,\n       COUNT(DISTINCT subid) AS cids,\n       COUNT(subid)          AS rowss\nFROM enrichment_data\nWHERE rn = min_rn\nGROUP BY 1\nORDER BY 2 D

Unnamed: 0,rn,cids,rowss
0,1,82127,82136


### Query 2: checking dates Last Updates (Enrichment Action Time)

In [28]:
def get_data2(
    partner_ids,
    process_names,
    transaction_month_prt,
    vertical_ids,
    start_date,
    end_date
) -> pd.DataFrame:

    query = f"""
SELECT
  CAST(SUBSTRING(action_time, 1, 10) AS DATE) AS action_day,
  COUNT(DISTINCT subid) AS subids
FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN
WHERE partner_id in ({','.join(map(str, partner_ids))})
AND process_name in ({','.join(f"'{p}'" for p in process_names)})
 AND transaction_month_prt >= '{transaction_month_prt}'
 AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
GROUP BY 1
ORDER BY 1 DESC
"""
    return pd.DataFrame(CachedAthenaApi().execute_fetch(query))

df2 = get_data2(
    partner_ids=partner_ids,
    process_names=process_names,
    transaction_month_prt=transaction_month_prt,
    vertical_ids=vertical_ids,
    start_date=start_date,
    end_date=end_date
)
df2.head()    

2025-09-09 11:16:33,156 INFO [ds_logger.py:68] [Cached AthenaApi] Local disk-caching is ENABLED.
2025-09-09 11:16:33,157 INFO [ds_logger.py:68] [Cached AthenaApi] Using cached results for execute_fetch(args=("\nSELECT\n  CAST(SUBSTRING(action_time, 1, 10) AS DATE) AS action_day,\n  COUNT(DISTINCT subid) AS subids\nFROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN\nWHERE partner_id in (3158,3178)\nAND process_name in ('quicken_rocket_allocated')\n AND transaction_month_prt >= '2025-01'\n AND vertical_id in ('5fa2b415c91a2010c3432900')\nGROUP BY 1\nORDER BY 1 DESC\n",), kwargs={}).


Unnamed: 0,action_day,subids
0,2025-09-07,1671
1,2025-09-01,9532
2,2025-08-26,1
3,2025-08-14,1
4,2025-08-01,7732


Most subids are updated in the last day! The ones that has previous rows data is because the values changed so they have bigger RN

### Query 3: Per creation date counts

In [29]:

def get_data3(
    partner_ids,
    process_names,
    transaction_month_prt,
    vertical_ids,
    start_date,
    end_date
) -> pd.DataFrame:

    query = f"""
SELECT
  CAST(SUBSTRING(created_at, 1, 10) AS DATE) AS created_at_day,
  COUNT(DISTINCT subid) AS subids
FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN
WHERE partner_id in ({','.join(map(str, partner_ids))})
AND process_name in ({','.join(f"'{p}'" for p in process_names)})
 AND transaction_month_prt >= '{transaction_month_prt}'
 AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
 -- and rn = 1
GROUP BY 1
ORDER BY 1 DESC
"""
    return pd.DataFrame(CachedAthenaApi().execute_fetch(query))

df3 = get_data3(
    partner_ids=partner_ids,
    process_names=process_names,
    transaction_month_prt=transaction_month_prt,
    vertical_ids=vertical_ids,
    start_date=start_date,
    end_date=end_date
)
df3.head()        

2025-09-09 11:16:39,198 INFO [ds_logger.py:68] [Cached AthenaApi] Local disk-caching is ENABLED.
2025-09-09 11:16:39,199 INFO [ds_logger.py:68] [Cached AthenaApi] Using cached results for execute_fetch(args=("\nSELECT\n  CAST(SUBSTRING(created_at, 1, 10) AS DATE) AS created_at_day,\n  COUNT(DISTINCT subid) AS subids\nFROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN\nWHERE partner_id in (3158,3178)\nAND process_name in ('quicken_rocket_allocated')\n AND transaction_month_prt >= '2025-01'\n AND vertical_id in ('5fa2b415c91a2010c3432900')\n -- and rn = 1\nGROUP BY 1\nORDER BY 1 DESC\n",), kwargs={}).


Unnamed: 0,created_at_day,subids
0,2025-09-07,210
1,2025-09-06,320
2,2025-09-05,313
3,2025-09-04,287
4,2025-09-03,500


Looks like every day added around 60 +- subids.  looks valid

### Query 4: Hours diff for each field

In [None]:
def get_data3_combined(
    partner_ids,
    process_names,
    transaction_month_prt, 
    vertical_ids,
    start_date,  # 'YYYY-MM-DD'
    end_date     # 'YYYY-MM-DD'
) -> pd.DataFrame:
    tm_start = start_date[:7]
    tm_end = end_date[:7]
    if tm_end == tm_start:
        year, month = map(int, tm_end.split('-'))
        if month == 12:
            year += 1
            month = 1
        else:
            month += 1
        tm_end = f"{year:04d}-{month:02d}"

    query = f"""
WITH raw_enrich AS (
  SELECT DISTINCT
      subid,
      process_name,
      CAST(SUBSTRING(transaction_date, 1, 10) AS DATE) AS transaction_day,
      transaction_date,
      transaction_month_prt,
      created_at,
      {enrichment_cols}
      rn
  FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN
  WHERE partner_id in ({','.join(map(str, partner_ids))})
    AND process_name in ({','.join(f"'{p}'" for p in process_names)})
    AND transaction_month_prt between '{tm_start}' and '{tm_end}'
    AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
    {enrich_cond}
    AND rn = 1
),
ff AS (
  SELECT
      cid,
      conversion_date,
      conversion_timestamp,
      clickout_timestamp,
      company,
      SUM(leads_count) AS leads,
      SUM(qualified_leads_count) AS qls,
      SUM(sales_count) AS sales
  FROM dlk_visitor_funnel_dwh_production.chart_funnel
  WHERE partner_id in ({','.join(map(str, partner_ids))})
    AND dt between '{tm_start}' and '{tm_end}'
    and clickout_date between cast('{start_date}' as date) and cast('{end_date}' as date)
    AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
    AND traffic_type = 'users'
  GROUP BY 1,2,3,4,5
  HAVING SUM(leads_count) >= 1
),
combined AS (
  SELECT
      r.subid,
      ff.company,
      r.transaction_date,
      date_format(date_trunc('millisecond', MAX(ff.clickout_timestamp)),
        '%Y-%m-%d %H:%i:%s.%f') AS clickout_ts_mt,

      date_format( date_trunc( 'millisecond', MIN(CASE WHEN r.creditscore IS NOT NULL THEN
              COALESCE(
                date_parse(SUBSTRING(r.created_at, 1, 19), '%Y-%m-%dT%H:%i:%s'),
                from_iso8601_timestamp(regexp_replace(r.created_at, '([+-]\\d{2})(\\d{2})$', '\\1:\\2'))
              )
            END
          )
        ),
        '%Y-%m-%d %H:%i:%s.%f'
      ) AS creditscore_time,

      /* loanamount_time → only when present; parse flexibly, ms-trunc, stringify */
      date_format(
        date_trunc(
          'millisecond',
          MIN(
            CASE WHEN r.loanamount IS NOT NULL THEN
              COALESCE(
                date_parse(SUBSTRING(r.created_at, 1, 19), '%Y-%m-%dT%H:%i:%s'),
                date_parse(SUBSTRING(r.created_at, 1, 19), '%Y-%m-%d %H:%i:%s'),
                from_iso8601_timestamp(regexp_replace(r.created_at, '([+-]\\d{2})(\\d{2})$', '\\1:\\2'))
              )
            END
          )
        ),
        '%Y-%m-%d %H:%i:%s.%f'
      ) AS loanamount_time,
      sum(ff.leads) as leads,
      sum(ff.qls) as qls,
      sum(ff.sales) as sales
      
  FROM raw_enrich r
  INNER JOIN ff
    ON r.subid = ff.cid
   AND r.transaction_day >= ff.conversion_date
  GROUP BY r.subid, r.transaction_date, ff.company
)
SELECT * FROM combined
"""
    return pd.DataFrame(CachedAthenaApi().execute_fetch(query))

combined_df = get_data3_combined(
    partner_ids=partner_ids,
    process_names=process_names,
    transaction_month_prt=transaction_month_prt,   # ignored for month range; kept for signature symmetry
    vertical_ids=vertical_ids,
    start_date= start_date,
    end_date= end_date
)

# parse the string timestamps before hours math (your build_hours_summary already does this)
for col in ["clickout_ts_mt", "creditscore_time", "loanamount_time", "transaction_date"]:
    combined_df[col] = pd.to_datetime(combined_df[col], errors="coerce", utc=True)

# Add hour difference columns for all columns ending with '_time'
def add_hour_diffs(df, reference_col='clickout_ts_mt'):
    """
    Add hour difference columns for all columns ending with '_time'
    compared to the reference column (default: clickout_ts_mt)
    """
    df = df.copy()
    
    # Find all columns ending with '_time'
    time_cols = [col for col in df.columns if col.endswith('_time')]
    
    for time_col in time_cols:
        # Create diff column name
        diff_col = time_col.replace('_time', '_diff_hours')
        
        # Calculate hour difference: (time_col - reference_col) in hours
        # Floor the result and ensure non-negative values
        time_diff = (df[time_col] - df[reference_col]).dt.total_seconds() / 3600.0
        time_diff = np.floor(time_diff)  # Floor like TIMESTAMPDIFF('hour',...)
        time_diff = np.maximum(time_diff, 0)  # Ensure non-negative (GREATEST(..., 0))
        
        # Handle NaN values (when either timestamp is null)
        time_diff = np.where(df[time_col].isna() | df[reference_col].isna(), np.nan, time_diff)
        
        df[diff_col] = time_diff
    
    return df

# Apply the function to add hour difference columns
combined_df = add_hour_diffs(combined_df)

print("Added hour difference columns:")
print("Columns:", combined_df.columns.tolist())
print("\nFirst 5 rows with hour differences:")
combined_df.head()



2025-09-09 11:16:42,361 INFO [ds_logger.py:68] [Cached AthenaApi] Local disk-caching is ENABLED.
2025-09-09 11:16:42,362 INFO [ds_logger.py:68] [Cached AthenaApi] Cache miss for execute_fetch(args=("\nWITH raw_enrich AS (\n  SELECT DISTINCT\n      subid,\n      process_name,\n      CAST(SUBSTRING(transaction_date, 1, 10) AS DATE) AS transaction_day,\n      transaction_date,\n      transaction_month_prt,\n      created_at,\n      loanamount, creditscore, \n      rn\n  FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN\n  WHERE partner_id in (3158,3178)\n    AND process_name in ('quicken_rocket_allocated')\n    AND transaction_month_prt between '2025-09' and '2025-10'\n    AND vertical_id in ('5fa2b415c91a2010c3432900')\n    AND loanpurpose = 'Refinance'\n    AND rn = 1\n),\nff AS (\n  SELECT\n      cid,\n      conversion_date,\n      conversion_timestamp,\n      clickout_timestamp,\n      company,\n      SUM(leads_count) AS leads,\n      SUM(qualified_leads_count) AS qls,

Unnamed: 0,subid,company,transaction_date,clickout_ts_mt,creditscore_time,loanamount_time,leads,qls,sales,creditscore_diff_hours,loanamount_diff_hours
0,K1UMk9wWH6,ni,2025-09-03 21:00:00+00:00,2025-09-04 13:33:25+00:00,2025-09-05 18:00:51+00:00,2025-09-05 18:00:51+00:00,1,0,0,28.0,28.0
1,pcA2j93fnk,ni,2025-09-04 21:00:00+00:00,2025-09-05 20:10:46+00:00,2025-09-06 18:00:51+00:00,2025-09-06 18:00:51+00:00,1,0,1,21.0,21.0
2,iJJE9l5aYa,ni,2025-09-04 21:00:00+00:00,2025-09-05 19:23:26+00:00,2025-09-06 18:00:51+00:00,2025-09-06 18:00:51+00:00,1,0,0,22.0,22.0
3,SvAFy00H3Y,ni,2025-09-03 21:00:00+00:00,2025-09-05 00:05:19+00:00,2025-09-05 18:00:51+00:00,2025-09-05 18:00:51+00:00,1,0,0,17.0,17.0
4,sLVh68kfbe,ni,2025-09-04 21:00:00+00:00,2025-09-05 13:01:49+00:00,2025-09-06 18:00:51+00:00,2025-09-06 18:00:51+00:00,1,0,0,28.0,28.0


### Query 5: aggregated hours diff

In [31]:
# Fixed version with sales stats included, but now removing null rate and filled rows columns, and adding a print statement before stats
def compute_field_stats_fixed(df: pd.DataFrame) -> pd.DataFrame:
    # Find all columns ending with '_diff_hours' or '_hours'
    hour_cols = [col for col in df.columns if col.endswith('_diff_hours') or col.endswith('_hours')]
    
    if not hour_cols:
        print("No hour columns found. Available columns:", df.columns.tolist())
        return pd.DataFrame()
    print(f"Using hour columns: {hour_cols}")

    has_sales = 'sales' in df.columns
    print(f"Sales column available: {has_sales}")
    
    long_df = df.melt(
        id_vars=["subid", "company", "transaction_date", "clickout_ts_mt"] + (["sales"] if has_sales else []),
        value_vars=hour_cols,
        var_name="field_name",
        value_name="hours"
    )
    long_df["field_name"] = long_df["field_name"].str.upper()

    g = long_df.groupby(["company", "field_name"], dropna=False)
    avg_hours     = g["hours"].mean()
    median_hours  = g["hours"].quantile(0.50, interpolation="linear")
    p80_hours     = g["hours"].quantile(0.80, interpolation="linear")
    p90_hours     = g["hours"].quantile(0.90, interpolation="linear")

    # Calculate sales stats if sales column exists
    if has_sales:
        avg_sales = g["sales"].mean()
        sum_sales = g["sales"].sum()
        
        out = pd.concat(
            [avg_hours.rename("avg_hours"),
             median_hours.rename("median_hours"),
             p80_hours.rename("p80_hours"),
             p90_hours.rename("p90_hours"),
             avg_sales.rename("avg_sales"),
             sum_sales.rename("sum_sales")],
            axis=1
        ).reset_index()
        
        out = out[["company","field_name","avg_hours","median_hours","p80_hours","p90_hours",
                   "avg_sales","sum_sales"]]\
                .sort_values(["company","field_name"]).reset_index(drop=True)
    else:
        out = pd.concat(
            [avg_hours.rename("avg_hours"),
             median_hours.rename("median_hours"),
             p80_hours.rename("p80_hours"),
             p90_hours.rename("p90_hours")],
            axis=1
        ).reset_index()
        
        out = out[["company","field_name","avg_hours","median_hours","p80_hours","p90_hours"]]\
                .sort_values(["company","field_name"]).reset_index(drop=True)
    
    return out

# Use the fixed function
field_stats = compute_field_stats_fixed(combined_df)    

print("\nConsidering only rows with enrichment values\n")
print("="*60)
print("SUMMARY STATISTICS:")
print("="*60)
field_stats.head(20)


Using hour columns: ['creditscore_diff_hours', 'loanamount_diff_hours']
Sales column available: True

Considering only rows with enrichment values

SUMMARY STATISTICS:


Unnamed: 0,company,field_name,avg_hours,median_hours,p80_hours,p90_hours,avg_sales,sum_sales
0,ni,CREDITSCORE_DIFF_HOURS,27.250244,24.0,29.0,42.0,0.024438,25
1,ni,LOANAMOUNT_DIFF_HOURS,27.250244,24.0,29.0,42.0,0.024438,25


### Query 6: for general data for analysis:

In [66]:
def get_general_data(
    partner_ids,
    process_names,
    transaction_month_prt,  # not used (we derive months from dates; kept for symmetry)
    vertical_ids,
    start_date,  # 'YYYY-MM-DD'
    end_date     # 'YYYY-MM-DD'
) -> pd.DataFrame:
    tm_start = start_date[:7]
    tm_end   = end_date[:7]

    query = f"""
WITH raw_enrich AS (
  SELECT DISTINCT
      subid,
      process_name,
      transaction_date,
      CAST(SUBSTRING(transaction_date, 1, 10) AS DATE) AS transaction_day,
      transaction_month_prt,
      {enrichment_cols}
      row_number () over (partition by subid order by transaction_date desc) as rnn

  FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN
  WHERE partner_id in ({','.join(map(str, partner_ids))})
    AND process_name in ({','.join(f"'{p}'" for p in process_names)})
    AND transaction_month_prt between '{tm_start}' and '{tm_end}'
    AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
    AND rn = 1 
    AND loanpurpose = 'Refinance'
),
ff AS (
  SELECT
      cid,
      conversion_date,
      conversion_timestamp,
      clickout_date,
      company,
      SUM(leads_count) AS leads,
      SUM(qualified_leads_count) AS qls,
      SUM(sales_count) AS sales

  FROM dlk_visitor_funnel_dwh_production.chart_funnel
  WHERE partner_id in ({','.join(map(str, partner_ids))})
    AND dt between '{tm_start}' and '{tm_end}'
    AND vertical_id in ({','.join(f"'{v}'" for v in vertical_ids)})
    AND traffic_type = 'users'
  GROUP BY 1,2,3,4,5
  HAVING SUM(leads_count) >= 1
),
combined AS (
  SELECT
      coalesce(ff.cid, r.subid) as subid,
      ff.company,
      r.transaction_date,
      clickout_date,
      {enrichment_cols}
      sum(ff.leads) as leads,
      sum(ff.qls) as qls,
      sum(ff.sales) as sales
      
  FROM ff 
  LEFT JOIN raw_enrich r
    ON r.subid = ff.cid
   AND r.transaction_day >= ff.conversion_date and rnn = 1 
  GROUP BY 1,2,3,4,5,6
)
SELECT * FROM combined """

    return pd.DataFrame(CachedAthenaApi().execute_fetch(query))

analysis_df = get_general_data(
    partner_ids=partner_ids,
    process_names=process_names,
    transaction_month_prt= transaction_month_prt,   # ignored for month range; kept for signature symmetry
    vertical_ids=vertical_ids,
    start_date= start_date,
    end_date= end_date
)

analysis_df.head

2025-09-09 11:31:52,288 INFO [ds_logger.py:68] [Cached AthenaApi] Local disk-caching is ENABLED.
2025-09-09 11:31:52,289 INFO [ds_logger.py:68] [Cached AthenaApi] Cache miss for execute_fetch(args=("\nWITH raw_enrich AS (\n  SELECT DISTINCT\n      subid,\n      process_name,\n      transaction_date,\n      CAST(SUBSTRING(transaction_date, 1, 10) AS DATE) AS transaction_day,\n      transaction_month_prt,\n      loanamount, creditscore, \n      row_number () over (partition by subid order by transaction_date desc) as rnn\n\n  FROM DLK_VISITOR_FUNNEL_DWH_PRODUCTION.ENRICH_CONVERSIONS_FLATTEN\n  WHERE partner_id in (3158,3178)\n    AND process_name in ('quicken_rocket_allocated')\n    AND transaction_month_prt between '2025-01' and '2025-09'\n    AND vertical_id in ('5fa2b415c91a2010c3432900')\n    AND rn = 1 \n    AND loanpurpose = 'Refinance'\n),\nff AS (\n  SELECT\n      cid,\n      conversion_date,\n      conversion_timestamp,\n      clickout_date,\n      company,\n      SUM(leads_coun

<bound method NDFrame.head of              subid company          transaction_date clickout_date  \
0       v8zBooq3KF      ni  2025-08-18T00:00:00+0300    2025-08-18   
1       6xoosaFgYb      ni  2025-03-18T00:00:00+0200    2025-03-18   
2       MPw62Wmria      ni                      None    2025-02-27   
3       dQi5l3uEu0      ni                      None    2025-04-29   
4       cFAqUFSxh6      ni                      None    2025-07-04   
...            ...     ...                       ...           ...   
170757  ygczhros2e      ni  2025-07-24T00:00:00+0300    2025-07-24   
170758  iHzHjwbOOq      ni                      None    2025-08-26   
170759  2cTshj9SCA      ni                      None    2025-01-15   
170760  bEX08o38IE      ni                      None    2025-03-28   
170761  cmZDnzJVLb      ni                      None    2025-04-20   

        loanamount    creditscore  leads  qls  sales  
0         359000.0      Excellent      1    0      0  
1              0.0 

In [67]:
analysis_df.sort_values(by='leads', ascending=False).head(10)

Unnamed: 0,subid,company,transaction_date,clickout_date,loanamount,creditscore,leads,qls,sales
152959,YCD9FmIEyN,ni,,2025-06-12,,,2,0,0
107397,GekiJI6qrD,ni,,2025-06-24,,,2,0,0
165535,YbIQfMGdbs,ni,,2025-07-12,,,2,0,0
112765,vkjHDsyxOM,ni,,2025-05-29,,,2,0,0
65893,4sGbG5YWlM,ni,,2025-07-26,,,2,0,0
73415,Dl82pFHyQp,ni,,2025-06-02,,,2,0,0
39880,1OOGsZdj4P,ni,,2025-05-30,,,2,1,0
152778,hDGQKJLBhm,ni,,2025-01-31,,,2,0,0
29766,i59JpccfiM,ni,,2025-01-22,,,2,0,0
143366,y4QCOxx43e,ni,2025-08-21T00:00:00+0300,2025-07-06,320000.0,Average,2,0,0


#### Additional Analysis

##### 1. NULL RATES FOR ENRICHMENT FIELDS

In [81]:
# 1. NULL RATES FOR ENRICHMENT FIELDS
print("="*60)
print("1. NULL RATES FOR ENRICHMENT FIELDS")
print("="*60)

enrichment_fields = [col for col in analysis_df.columns if col not in ['subid', 'company', 'transaction_date', 'leads', 'qls', 'sales','l2s', 'clickout_date']]
print(f"Enrichment fields found: {enrichment_fields}")

null_rates = []
for field in enrichment_fields:
    total_rows = len(analysis_df)
    null_rows = analysis_df[field].isnull().sum()
    null_rate = (null_rows / total_rows) * 100
    
    null_rates.append({
        'field': field,
        'total_rows': total_rows,
        'null_rows': null_rows,
        'null_rate_pct': null_rate,
        'filled_rows': total_rows - null_rows
    })
    
    print(f"{field}:")
    print(f"  Total rows: {total_rows}")
    print(f"  Null rows: {null_rows}")
    print(f"  Filled rows: {total_rows - null_rows}")
    print(f"  Null rate: {null_rate:.2f}%")
    print("-" * 40)

# Create summary DataFrame
null_summary = pd.DataFrame(null_rates)
print("\nSUMMARY TABLE:")
print(null_summary[['field', 'null_rate_pct', 'filled_rows', 'total_rows']].round(2))


1. NULL RATES FOR ENRICHMENT FIELDS
Enrichment fields found: ['loanamount', 'creditscore', 'enrich_month']
loanamount:
  Total rows: 170762
  Null rows: 135240
  Filled rows: 35522
  Null rate: 79.20%
----------------------------------------
creditscore:
  Total rows: 170762
  Null rows: 135240
  Filled rows: 35522
  Null rate: 79.20%
----------------------------------------
enrich_month:
  Total rows: 170762
  Null rows: 0
  Filled rows: 170762
  Null rate: 0.00%
----------------------------------------

SUMMARY TABLE:
          field  null_rate_pct  filled_rows  total_rows
0    loanamount           79.2        35522      170762
1   creditscore           79.2        35522      170762
2  enrich_month            0.0       170762      170762


Per Month

In [82]:
# Per month analysis of how many values exist for enrichment columns, and null rate
# Use clickout_date (or clickout_ts_mt/clickout_timestamp) for month extraction

import pandas as pd

# Define which columns are considered "enrichment columns"
enrichment_columns = ['creditscore', 'loanamount']

# Choose the correct timestamp column for month extraction
timestamp_col = None
for col in ['clickout_ts_mt', 'clickout_date', 'clickout_timestamp']:
    if col in analysis_df.columns:
        timestamp_col = col
        break

if timestamp_col is None:
    raise ValueError("No clickout timestamp column found in analysis_df")

# Ensure the timestamp column is in datetime format
if not pd.api.types.is_datetime64_any_dtype(analysis_df[timestamp_col]):
    analysis_df[timestamp_col] = pd.to_datetime(analysis_df[timestamp_col], errors='coerce')

analysis_df['enrich_month'] = analysis_df[timestamp_col].dt.to_period('M')

# Prepare per-month stats for each enrichment column
# The correct null rate is null_count / total_records (where total_records is the number of rows for that month)
month_counts = analysis_df.groupby('enrich_month').size().rename('total_records').reset_index()

enrichment_stats = [month_counts]
for col in enrichment_columns:
    per_month = (
        analysis_df
        .groupby('enrich_month')[col]
        .agg(
            **{
                f'{col}_non_null_count': lambda x: x.notnull().sum(),
                f'{col}_null_count': lambda x: x.isnull().sum()
            }
        )
        .reset_index()
    )
    enrichment_stats.append(per_month)

# Merge all stats on enrich_month
from functools import reduce
enrichment_per_month = reduce(lambda left, right: pd.merge(left, right, on='enrich_month', how='outer'), enrichment_stats)

# Calculate null rates for each enrichment column
for col in enrichment_columns:
    enrichment_per_month[f'{col}_null_rate'] = enrichment_per_month[f'{col}_null_count'] / enrichment_per_month['total_records']

# Display the result
print("\nPer month analysis of enrichment columns (non-null counts and null rates):")
print(enrichment_per_month.sort_values('enrich_month').to_string(index=False))


Per month analysis of enrichment columns (non-null counts and null rates):
enrich_month  total_records  creditscore_non_null_count  creditscore_null_count  loanamount_non_null_count  loanamount_null_count  creditscore_null_rate  loanamount_null_rate
     2025-01          29856                        6808                   23048                       6808                  23048               0.771972              0.771972
     2025-02          23428                        5373                   18055                       5373                  18055               0.770659              0.770659
     2025-03          23203                        4860                   18343                       4860                  18343               0.790544              0.790544
     2025-04          20779                        4403                   16376                       4403                  16376               0.788103              0.788103
     2025-05          17910                      

Rows where both creditscore & loanamount are nulls (For Rocket)

In [69]:
# Count of rows with null creditscore
null_creditscore_count = analysis_df['creditscore'].isnull().sum()
print(f"Number of rows with null creditscore: {null_creditscore_count}")

# Check if whenever creditscore is null, loanamount is also null
cs_null_la_notnull = analysis_df[analysis_df['creditscore'].isnull() & analysis_df['loanamount'].notnull()]
if cs_null_la_notnull.empty:
    print("Whenever creditscore is null, loanamount is also null.")
else:
    print(f"There are {len(cs_null_la_notnull)} rows where creditscore is null but loanamount is NOT null.")

# Check if whenever loanamount is null, creditscore is also null
la_null_cs_notnull = analysis_df[analysis_df['loanamount'].isnull() & analysis_df['creditscore'].notnull()]
if la_null_cs_notnull.empty:
    print("Whenever loanamount is null, creditscore is also null.")
else:
    print(f"There are {len(la_null_cs_notnull)} rows where loanamount is null but creditscore is NOT null.")


Number of rows with null creditscore: 135240
Whenever creditscore is null, loanamount is also null.
Whenever loanamount is null, creditscore is also null.


##### 2. SALES AND QLS ANALYSIS BY ENRICHMENT FIELD VALUES

In [70]:
# 2. SALES AND QLS MEAN AND SUMS PER ENRICHMENT FIELD VALUES
print("="*60)
print("2. SALES AND QLS ANALYSIS BY ENRICHMENT FIELD VALUES")
print("="*60)

# Calculate L2S (Leads to Sales) conversion rate
analysis_df['l2s'] = analysis_df['sales'] / analysis_df['leads'].replace(0, np.nan)

for field in enrichment_fields:
    print(f"\n{field.upper()}:")
    print("-" * 40)
    
    # Treat nulls as a separate category for grouping
    field_stats = (
        analysis_df
        .copy()
        .assign(**{field: analysis_df[field].where(analysis_df[field].notnull(), 'NULL')})
        .groupby(field)
        .agg({
            'leads': ['count', 'sum'],
            'qls': ['sum', 'mean'],
            'sales': ['sum', 'mean'],
            'l2s': 'mean'
        })
        .round(4)
    )
    
    # Flatten column names
    field_stats.columns = ['_'.join(col).strip() for col in field_stats.columns]
    field_stats = field_stats.reset_index()
    
    # Rename columns for clarity
    field_stats = field_stats.rename(columns={
        f'{field}': field,
        'leads_count': 'leads_count',
        'leads_sum': 'total_leads',
        'qls_sum': 'total_qls',
        'qls_mean': 'avg_qls_per_record',
        'sales_sum': 'total_sales',
        'sales_mean': 'avg_sales_per_record',
        'l2s_mean': 'avg_l2s_rate'
    })
    
    # Sort by total leads descending
    field_stats = field_stats.sort_values('total_leads', ascending=False)
    
    print(f"Top 10 values by total leads:")
    print(field_stats.head(10).to_string(index=False))
    
    # For overall summary, count nulls as a unique value
    unique_values = analysis_df[field].nunique(dropna=True) + analysis_df[field].isnull().any()
    print(f"\nOverall summary for {field}:")
    print(f"  Unique values: {unique_values}")
    print(f"  Total leads: {analysis_df['leads'].sum()}")
    print(f"  Total QLS: {analysis_df['qls'].sum()}")
    print(f"  Total sales: {analysis_df['sales'].sum()}")
    print(f"  Overall L2S rate: {(analysis_df['sales'].sum() / analysis_df['leads'].sum() * 100):.2f}%")
    print("=" * 60)


2. SALES AND QLS ANALYSIS BY ENRICHMENT FIELD VALUES

LOANAMOUNT:
----------------------------------------
Top 10 values by total leads:
loanamount  leads_count  total_leads  total_qls  avg_qls_per_record  total_sales  avg_sales_per_record  avg_l2s_rate
      NULL       135240       135268       2497              0.0185           10                0.0001        0.0001
  100000.0          998          998          0              0.0000           12                0.0120        0.0120
  150000.0          894          894          0              0.0000           12                0.0134        0.0134
   50000.0          889          889          0              0.0000           21                0.0236        0.0236
  200000.0          618          618          0              0.0000            4                0.0065        0.0065
  300000.0          473          473          0              0.0000            7                0.0148        0.0148
  250000.0          446          446        

##### 2.1. Loanamount Bins

In [71]:
# Bin loanamount into up to 7 bins (including 1 for nulls), using qcut for equal leads per bin

import numpy as np
import pandas as pd

print("="*60)
print("LOANAMOUNT BINNED ANALYSIS")
print("="*60)

# Prepare a copy to avoid modifying original
loan_df = analysis_df.copy()

# Separate nulls
loan_null_mask = loan_df['loanamount'].isnull()
loan_nonnull = loan_df.loc[~loan_null_mask].copy()

# We want up to 6 bins for non-nulls, 1 for nulls (total 7)
n_bins = 6
# If there are fewer unique non-null values than bins, reduce bins
unique_nonnull = loan_nonnull['loanamount'].nunique()
if unique_nonnull < n_bins:
    n_bins = unique_nonnull

# Use qcut to bin by equal number of leads (rows) per bin
if n_bins > 1:
    # qcut may fail if there are too many duplicate values, so handle that
    try:
        loan_nonnull['loanamount_bin'] = pd.qcut(loan_nonnull['loanamount'], q=n_bins, duplicates='drop')
    except ValueError:
        # fallback: use cut with equal-width bins
        loan_nonnull['loanamount_bin'] = pd.cut(loan_nonnull['loanamount'], bins=n_bins)
else:
    loan_nonnull['loanamount_bin'] = loan_nonnull['loanamount']

# Assign 'NULL' bin to nulls
loan_df['loanamount_bin'] = np.nan
loan_df.loc[loan_null_mask, 'loanamount_bin'] = 'NULL'
loan_df.loc[~loan_null_mask, 'loanamount_bin'] = loan_nonnull['loanamount_bin'].astype(str).values

# Group and aggregate
loanamount_stats = (
    loan_df
    .groupby('loanamount_bin')
    .agg(
        leads_count=('leads', 'count'),
        total_leads=('leads', 'sum'),
        total_qls=('qls', 'sum'),
        avg_qls_per_record=('qls', 'mean'),
        total_sales=('sales', 'sum'),
        avg_sales_per_record=('sales', 'mean'),
        avg_l2s_rate=('l2s', 'mean')
    )
    .reset_index()
)

# Sort bins: NULL first, then by bin order
def sort_key(x):
    if x == 'NULL':
        return -1
    # Try to extract left edge of interval for sorting
    try:
        if x.startswith('(') or x.startswith('['):
            return float(x.split(',')[0].replace('(','').replace('[',''))
        else:
            return float(x)
    except Exception:
        return 0

loanamount_stats = loanamount_stats.sort_values(by='loanamount_bin', key=lambda col: col.map(sort_key))

print("Top 10 bins by total leads:")
print(loanamount_stats.head(10).to_string(index=False))

# Overall summary
unique_bins = loanamount_stats['loanamount_bin'].nunique()
print(f"\nOverall summary for loanamount (binned):")
print(f"  Unique bins: {unique_bins}")
print(f"  Total leads: {loan_df['leads'].sum()}")
print(f"  Total QLS: {loan_df['qls'].sum()}")
print(f"  Total sales: {loan_df['sales'].sum()}")
print(f"  Overall L2S rate: {(loan_df['sales'].sum() / loan_df['leads'].sum() * 100):.2f}%")
print("=" * 60)

LOANAMOUNT BINNED ANALYSIS
Top 10 bins by total leads:
        loanamount_bin  leads_count  total_leads  total_qls  avg_qls_per_record  total_sales  avg_sales_per_record  avg_l2s_rate
                  NULL       135240       135268       2497            0.018463           10              0.000074      0.000074
     (-0.001, 85000.0]         5922         5924          0            0.000000          101              0.017055      0.017055
   (85000.0, 161000.0]         5932         5933          0            0.000000          126              0.021241      0.021241
  (161000.0, 245000.0]         5924         5925          0            0.000000          108              0.018231      0.018231
  (245000.0, 339000.0]         5910         5911          0            0.000000          116              0.019628      0.019628
  (339000.0, 490000.0]         5933         5935          0            0.000000          114              0.019215      0.019215
(490000.0, 19999998.0]         5901       

  loan_df.loc[loan_null_mask, 'loanamount_bin'] = 'NULL'


##### 3. Per Company

In [None]:
# 3. PER COMPANY L2S ANALYSIS
print("="*60)
print("3. PER COMPANY L2S ANALYSIS")
print("="*60)

# Company-level analysis
company_stats = analysis_df.groupby('company').agg({
    'subid': 'count',
    'leads': 'sum',
    'qls': 'sum', 
    'sales': 'sum',
    'l2s': 'mean'
}).round(4)

company_stats = company_stats.rename(columns={
    'subid': 'total_records',
    'leads': 'total_leads',
    'qls': 'total_qls',
    'sales': 'total_sales',
    'l2s': 'avg_l2s_rate'
})

# Calculate additional metrics
company_stats['l2s_overall'] = (company_stats['total_sales'] / company_stats['total_leads'] * 100).round(2)
company_stats['qls_rate'] = (company_stats['total_qls'] / company_stats['total_leads'] * 100).round(2)
company_stats['sales_rate'] = (company_stats['total_sales'] / company_stats['total_leads'] * 100).round(2)

# Sort by total sales
company_stats = company_stats.sort_values('total_sales', ascending=False)

print("Company Performance Summary:")
print(company_stats.to_string())

print(f"\nOverall Summary:")
print(f"  Total companies: {len(company_stats)}")
print(f"  Total records: {company_stats['total_records'].sum()}")
print(f"  Total leads: {company_stats['total_leads'].sum()}")
print(f"  Total QLS: {company_stats['total_qls'].sum()}")
print(f"  Total sales: {company_stats['total_sales'].sum()}")
print(f"  Overall L2S rate: {(company_stats['total_sales'].sum() / company_stats['total_leads'].sum() * 100):.2f}%")


3. PER COMPANY L2S ANALYSIS
Company Performance Summary:
         total_records  total_leads  total_qls  total_sales  avg_l2s_rate  l2s_overall  qls_rate  sales_rate
company                                                                                                     
ni              170762       170797       2497          636        0.0037         0.37      1.46        0.37

Overall Summary:
  Total companies: 1
  Total records: 170762
  Total leads: 170797
  Total QLS: 2497
  Total sales: 636
  Overall L2S rate: 0.37%


all rows has company value (220010 leads)

##### 4. CATEGORICAL ANALYSIS + CORRELATION ANALYSIS AND CATEGORICAL ANALYSIS

In [73]:
# 5. CORRELATION ANALYSIS AND CATEGORICAL ANALYSIS (FIXED VERSION)
print("="*60)
print("4. CORRELATION ANALYSIS AND CATEGORICAL ANALYSIS")
print("="*60)

# Function to detect if field is numeric
def is_numeric_field(series):
    try:
        pd.to_numeric(series, errors='raise')
        return True
    except:
        return False

# Function to safely calculate correlation
def safe_correlation(series1, series2):
    try:
        # Remove null values
        data = pd.DataFrame({'x': series1, 'y': series2}).dropna()
        if len(data) < 2:
            return np.nan
        # Check if either series has zero variance
        if data['x'].std() == 0 or data['y'].std() == 0:
            return np.nan
        return data['x'].corr(data['y'])
    except:
        return np.nan

for field in enrichment_fields:
    print(f"\n{field.upper()}:")
    print("-" * 40)
    
    # Check if field is numeric
    is_numeric = is_numeric_field(analysis_df[field])
    print(f"Field type: {'Numeric' if is_numeric else 'Categorical'}")
    
    if is_numeric:
        # Correlation analysis for numeric fields
        print("\nCORRELATION WITH SALES:")
        # Remove null values for correlation
        numeric_data = analysis_df[[field, 'sales', 'leads', 'qls', 'l2s']].dropna()
        
        if len(numeric_data) > 1:
            correlations = {
                'sales': safe_correlation(numeric_data[field], numeric_data['sales']),
                'leads': safe_correlation(numeric_data[field], numeric_data['leads']),
                'qls': safe_correlation(numeric_data[field], numeric_data['qls']),
                'l2s': safe_correlation(numeric_data[field], numeric_data['l2s'])
            }
            
            for metric, corr in correlations.items():
                if np.isnan(corr):
                    print(f"  {field} vs {metric}: No correlation (constant values or insufficient data)")
                else:
                    print(f"  {field} vs {metric}: {corr:.4f}")
        else:
            print("  Not enough data for correlation analysis")
            
    else:
        # Categorical analysis
        print("\nTOP 10 CATEGORIES BY SALES:")
        cat_analysis = analysis_df.groupby(field).agg({
            'leads': 'sum',
            'qls': 'sum',
            'sales': 'sum',
            'l2s': 'mean'
        }).round(4)
        
        # Calculate L2S rate safely (avoid division by zero)
        cat_analysis['l2s_rate'] = np.where(
            cat_analysis['leads'] > 0,
            (cat_analysis['sales'] / cat_analysis['leads'] * 100).round(2),
            0
        )
        
        # Sort by sales
        cat_analysis = cat_analysis.sort_values('sales', ascending=False)
        
        # Show top 10
        top_10 = cat_analysis.head(10)
        print(f"{'Category':<20} {'Leads':<10} {'QLS':<10} {'Sales':<10} {'L2S Rate':<10}")
        print("-" * 70)
        for category, row in top_10.iterrows():
            print(f"{str(category)[:18]:<20} {row['leads']:<10.0f} {row['qls']:<10.0f} {row['sales']:<10.0f} {row['l2s_rate']:<10.2f}%")
        # Yes, the correlation is calculated after dropping rows with nulls in the field being analyzed and the target metric.
        # See: numeric_data = analysis_df[[field, 'sales', 'leads', 'qls', 'l2s']].dropna()
        # This means only rows where both the field and the metric (e.g., sales) are not null are included in the correlation calculation.
        print(f"\nTotal categories: {len(cat_analysis)}")
        print(f"Categories with sales > 0: {len(cat_analysis[cat_analysis['sales'] > 0])}")
        
        # Additional insights for categorical data
        print(f"\nCategory Performance Insights:")
        print(f"  Best L2S rate: {cat_analysis[cat_analysis['leads'] > 0]['l2s_rate'].max():.2f}%")
        print(f"  Worst L2S rate: {cat_analysis[cat_analysis['leads'] > 0]['l2s_rate'].min():.2f}%")
        print(f"  Average L2S rate: {cat_analysis[cat_analysis['leads'] > 0]['l2s_rate'].mean():.2f}%")
    
    print("=" * 60)


4. CORRELATION ANALYSIS AND CATEGORICAL ANALYSIS

LOANAMOUNT:
----------------------------------------
Field type: Numeric

CORRELATION WITH SALES:
  loanamount vs sales: -0.0165
  loanamount vs leads: -0.0027
  loanamount vs qls: No correlation (constant values or insufficient data)
  loanamount vs l2s: -0.0165

CREDITSCORE:
----------------------------------------
Field type: Categorical

TOP 10 CATEGORIES BY SALES:
Category             Leads      QLS        Sales      L2S Rate  
----------------------------------------------------------------------
Excellent            17822      0          347        1.95      %
Good                 8020       0          188        2.34      %
Average              4552       0          70         1.54      %
Below Average        3252       0          19         0.58      %
Poor                 1867       0          2          0.11      %
Unknown              16         0          0          0.00      %

Total categories: 6
Categories with sales > 0

##### 5. INVESTIGATION: Leads vs Records Discrepancy (rows with more than 1 lead)

In [74]:
# INVESTIGATION: Leads vs Records Discrepancy
print("="*60)
print("INVESTIGATION: Leads vs Records Discrepancy")
print("="*60)

print(f"Total records: {len(analysis_df)}")
print(f"Total leads: {analysis_df['leads'].sum()}")
print(f"Average leads per record: {analysis_df['leads'].mean():.4f}")

print(f"\nLeads distribution:")
print(analysis_df['leads'].value_counts().sort_index().head(20))

print(f"\nRecords with leads > 1:")
leads_gt_1 = analysis_df[analysis_df['leads'] > 1]
print(f"Count: {len(leads_gt_1)}")
print(f"Percentage: {len(leads_gt_1) / len(analysis_df) * 100:.2f}%")

if len(leads_gt_1) > 0:
    print(f"\nTop records with highest leads:")
    print(leads_gt_1[['subid', 'leads', 'qls', 'sales']].sort_values('leads', ascending=False).head(10))

print(f"\nLeads statistics:")
print(f"Min: {analysis_df['leads'].min()}")
print(f"Max: {analysis_df['leads'].max()}")
print(f"Median: {analysis_df['leads'].median()}")
print(f"Std: {analysis_df['leads'].std():.2f}")

# Check if there are any records with 0 leads
zero_leads = analysis_df[analysis_df['leads'] == 0]
print(f"\nRecords with 0 leads: {len(zero_leads)}")
if len(zero_leads) > 0:
    print("This might explain the discrepancy - some records have 0 leads")


INVESTIGATION: Leads vs Records Discrepancy
Total records: 170762
Total leads: 170797
Average leads per record: 1.0002

Leads distribution:
leads
1    170727
2        35
Name: count, dtype: int64

Records with leads > 1:
Count: 35
Percentage: 0.02%

Top records with highest leads:
             subid  leads  qls  sales
10828   KGfE1FMa1A      2    1      0
143366  y4QCOxx43e      2    0      0
112765  vkjHDsyxOM      2    0      0
114048  8sivNysjEN      2    0      0
115401  IKdwaEMaJD      2    1      0
117247  5dBAiZv3XM      2    0      0
133885  BHDJjWFZYg      2    0      0
134743  4XI2mjMuPH      2    0      0
150485  OTBjDRULV0      2    0      0
110555  aderKZPPJM      2    0      0

Leads statistics:
Min: 1
Max: 2
Median: 1.0
Std: 0.01

Records with 0 leads: 0


In [75]:
analysis_df.columns

Index(['subid', 'company', 'transaction_date', 'clickout_date', 'loanamount',
       'creditscore', 'leads', 'qls', 'sales', 'l2s'],
      dtype='object')

### Conclusions:

- The Rocket Refinance Allocated is a small portion of the leads. only 20% of the leads (So 80% null rate for the enrichment fields).
- Null rows has almost 0 sales but all QLs. 
- Creditscore is a good predictor for L2S
- Loanamount is not a good predictor (20% corr and doesnt look good .. Loanamount bins has pretty equal L2S values).
- Most users has 1 lead (from Jan only 35 users had 2 leads).
- Seems like the columns takes 20-30 hours to be filled but we can't verify it because of data structure. 
- All rows has rn = 1 (enrichment flattern table).

In [None]:
print('thank you and we will meet again in the next partenr check :)')

NameError: name 'analysis_df' is not defined