## Imports

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import DataFrame,SparkSession

## PSI

In [0]:
def calculate_psi(expected, actual, buckettype='bins', buckets=10, epsilon=1e-4):
    """Compute the Population Stability Index (PSI) between two Spark DataFrames.

    PSI is the sum over groups of ``(a - e) * ln(a / e)``, where ``e`` and ``a``
    are the share of rows of ``expected`` and ``actual`` falling in each group.

    Parameters
    ----------
    expected : pyspark.sql.DataFrame
        Reference sample. Must contain a ``'bucket'`` column when
        ``buckettype == 'bins'``, otherwise a ``'value'`` column.
    actual : pyspark.sql.DataFrame
        Sample compared against ``expected``; same column requirement.
    buckettype : str, default 'bins'
        ``'bins'`` groups rows by the ``'bucket'`` column; any other value
        groups rows by the ``'value'`` column.
    buckets : int, default 10
        Not used by this function (bucketing is done by the caller); kept so
        existing call sites remain valid.
    epsilon : float, default 1e-4
        Floor applied to every share before the log ratio is taken, so a group
        present on only one side still contributes a finite amount.

    Returns
    -------
    float or None
        The PSI value, or ``None`` when either input has no rows.
    """
    # The join key must be the same column the counts were grouped on.
    key = 'bucket' if buckettype == 'bins' else 'value'

    expected_counts = expected.groupBy(key).agg(F.count('*').alias('expected_count'))
    actual_counts = actual.groupBy(key).agg(F.count('*').alias('actual_count'))

    expected_total = expected_counts.agg(F.sum('expected_count')).first()[0]
    actual_total = actual_counts.agg(F.sum('actual_count')).first()[0]

    # SUM over an empty DataFrame is NULL: there is no distribution to compare.
    if not expected_total or not actual_total:
        return None

    expected_perc = expected_counts.withColumn(
        'expected_perc', F.col('expected_count') / expected_total
    )
    actual_perc = actual_counts.withColumn(
        'actual_perc', F.col('actual_count') / actual_total
    )

    # Outer join keeps groups that appear on only one side; their missing
    # counts/shares become 0 here and are floored to epsilon below.
    psi_df = expected_perc.join(actual_perc, on=key, how='outer').fillna(
        0, subset=['expected_count', 'actual_count', 'expected_perc', 'actual_perc']
    )

    # A zero share makes a/e or ln(a/e) undefined (NULL or an error in Spark,
    # depending on ANSI mode); NULL terms would be silently skipped by SUM.
    expected_share = F.greatest(F.col('expected_perc'), F.lit(epsilon))
    actual_share = F.greatest(F.col('actual_perc'), F.lit(epsilon))

    psi_df = psi_df.withColumn(
        'psi', (actual_share - expected_share) * F.log(actual_share / expected_share)
    )

    return psi_df.agg(F.sum('psi')).first()[0]

## Variáveis

In [0]:
# Feature columns selected from the base table and evaluated for PSI.
# The order here is the order in which the PSI results are produced.
variaveis_unico = [
    "payment_method_id_1m_index",
    "is_auto_renew_1m_index",
    "actual_amount_paid_1m",
    "payment_plan_days_1m",
    "account_time_1m",
    "actual_amount_paid_avg_2m",
    "city_1m_index",
    "actual_amount_paid_median_2m",
    "age_group_1m_index",
    "num_unq_median_2m",
    "num_25_max_2m",
    "num_100_min_2m",
    "actual_amount_paid_max_3m",
    "actual_amount_paid_avg_3m",
    "plan_list_price_median_2m",
    "plan_list_price_avg_4m",
    "num_unq_max_4m",
    "plan_list_price_1m",
    "plan_list_price_min_2m",
    "actual_amount_paid_median_4m",
    "num_25_max_4m",
    "num_100_avg_2m",
    "num_unq_avg_4m",
    "plan_list_price_min_4m",
    "actual_amount_paid_max_2m",
    "diversity_score_median_3m",
    "plan_list_price_avg_3m",
    "num_50_max_3m",
]

## Base

In [0]:
# Fixed seed so the 25% sample does not change from one run to the next
# (as long as the underlying table and its partitioning are unchanged).
SAMPLE_SEED = 42

base_spine = (
    spark.table("sand_riscos_pm_pf.T789778_base_final_dm")
    .sample(fraction=0.25, seed=SAMPLE_SEED)
    # 'safra' is not one of the PSI features, but the train/validation split
    # filters on it, so it is kept explicitly alongside the feature columns.
    .select('safra', *variaveis_unico)
)
base_spine

In [0]:
# Split the sampled base on the 'safra' column.
safra_col = F.col('safra')

# Development sample: 201505..201607 inclusive, leaving out 201601.
in_development_window = safra_col.between(201505, 201607) & ~safra_col.isin(201601)
train_test_data = base_spine.where(in_development_window)

# Validation sample: 201608..201611 inclusive.
validation_data = base_spine.where(safra_col.between(201608, 201611))

In [0]:
# PSI of every feature between the development and validation samples.
#
# Bucket edges are taken from the development sample only and then applied
# unchanged to both samples. Ranking each sample into its own deciles would
# put ~10% of rows in every bucket on both sides, which makes the PSI ~0 no
# matter how much the distribution has moved.
N_BUCKETS = 10
QUANTILE_REL_ERROR = 0.001

# Interior quantile probabilities (0.1 ... 0.9) separating N_BUCKETS bins.
bucket_probs = [i / N_BUCKETS for i in range(1, N_BUCKETS)]


def _with_bucket(df, column, edges):
    """Return `column` plus a 'bucket' id: count of edges below the value, -1 for nulls."""
    value = F.col(column)
    edges_below = sum(((value > F.lit(edge)).cast('int') for edge in edges), F.lit(0))
    # Nulls get their own bucket id so they still match across both samples.
    bucket = F.when(value.isNull(), F.lit(-1)).otherwise(edges_below)
    return df.select(column).withColumn('bucket', bucket)


# One pass over the development sample for all columns.
# NOTE(review): approxQuantile only supports numeric columns; this assumes
# every entry of variaveis_unico is numeric - TODO confirm.
edges_by_column = dict(zip(
    variaveis_unico,
    train_test_data.approxQuantile(variaveis_unico, bucket_probs, QUANTILE_REL_ERROR),
))

psi_results = {}
for column in variaveis_unico:
    # Low-cardinality columns repeat the same quantile; keep distinct edges only.
    edges = sorted(set(edges_by_column[column]))
    expected_data = _with_bucket(train_test_data, column, edges)
    actual_data = _with_bucket(validation_data, column, edges)
    psi_results[column] = calculate_psi(expected_data, actual_data)

psi_results_df = spark.createDataFrame(list(psi_results.items()), ["Variável", "PSI"])
psi_results_df.display()

In [0]:
# Collect the PSI summary to the driver as a pandas DataFrame; being the last
# expression of the cell, it is rendered as the cell's output.
psi_results_df.toPandas()