## Imports

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import DataFrame, SparkSession

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, roc_curve, auc

## Funções

In [0]:
def calcular_valores(fn, tp, mensalidade_ntd=133.75, ntd_para_usd=0.03170,
                     taxa_resposta=0.5, meses_investimento=3,
                     meses_lucro_bruto=12, meses_lucro_liquido=9):
    '''
    Compute the proactive-CRM investment and profit figures, in USD.

    Parameters
    ----------
    fn : number
        False-negative count.
    tp : number
        True-positive count.
    mensalidade_ntd : float, default 133.75
        Weighted average monthly fee of the music-streaming plan, in NTD.
    ntd_para_usd : float, default 0.03170
        NTD -> USD conversion rate (1 NTD = 0.03170 USD in August 2016).
    taxa_resposta : float, default 0.5
        Share of the true-positive audience assumed to respond positively to
        the action (50% by default).
    meses_investimento : number, default 3
        Number of monthly fees used to size both investment figures.
        NOTE(review): the original docstring did not explain this factor;
        presumably it is the cost of the action expressed in months of
        subscription -- confirm.
    meses_lucro_bruto : number, default 12
        Months a responding true positive is assumed to stay active.
    meses_lucro_liquido : number, default 9
        Months of activity counted for the net figure. With the defaults this
        equals meses_lucro_bruto - meses_investimento (12 - 3).

    Returns
    -------
    tuple
        (inv_proativo, inv_crm, lucro_br, lucro_lq), each rounded to 2 decimals.
        The two investment values are returned negated (as costs).
    '''
    # The multiplication order is kept exactly as in the hard-coded version so
    # that the default call reproduces the same rounded floats.
    inv_proativo = -round((fn + tp) * mensalidade_ntd * ntd_para_usd * meses_investimento, 2)
    inv_crm = -round(tp * mensalidade_ntd * ntd_para_usd * taxa_resposta * meses_investimento, 2)
    lucro_br = round(tp * mensalidade_ntd * ntd_para_usd * taxa_resposta * meses_lucro_bruto, 2)
    lucro_lq = round(tp * mensalidade_ntd * ntd_para_usd * taxa_resposta * meses_lucro_liquido, 2)
    return inv_proativo, inv_crm, lucro_br, lucro_lq

In [0]:
def calcular_valores_por_cluster(df, cluster_col):
    '''
    Compute the calcular_valores figures separately for each cluster.

    Sums the 'fn' and 'tp' columns of ``df`` per distinct value of
    ``cluster_col`` and feeds each pair of totals to calcular_valores.

    Parameters
    ----------
    df : Spark DataFrame containing ``cluster_col``, 'fn' and 'tp' columns.
    cluster_col : str, name of the column to group by.

    Returns
    -------
    Spark DataFrame with columns
    [cluster_col, 'inv_proativo', 'inv_crm', 'lucro_br', 'lucro_lq'].
    '''
    nomes_saida = [cluster_col, 'inv_proativo', 'inv_crm', 'lucro_br', 'lucro_lq']

    totais = df.groupBy(cluster_col).agg(
        F.sum('fn').alias('total_fn'),
        F.sum('tp').alias('total_tp'),
    )

    def _linha_para_valores(linha):
        # One output tuple per cluster: the cluster id followed by the four figures.
        valores = calcular_valores(linha['total_fn'], linha['total_tp'])
        return (linha[cluster_col],) + tuple(valores)

    return totais.rdd.map(_linha_para_valores).toDF(nomes_saida)

## Base Inicial

In [0]:
# Target spine table restricted to safras 201608 through 201611.
base = (
    spark.table('sand_riscos_pm_pf.T789778_spine_target_dm')
    .filter(F.col('safra').between(201608, 201611))
)
base

In [0]:
# Row count per payment_plan_days value.
(
    base
    .groupBy('payment_plan_days')
    .count()
    .display()
)

In [0]:
# Row count per plan_list_price value.
(
    base
    .groupBy('plan_list_price')
    .count()
    .display()
)

In [0]:
# Row count per plan_list_price, restricted to 30- or 31-day payment plans.
(
    base
    .filter(F.col('payment_plan_days').isin(30, 31))
    .groupBy('plan_list_price')
    .count()
    .display()
)

In [0]:
# Five most frequent list prices among 30- or 31-day payment plans.
top_5_counts = (
    base.filter(F.col('payment_plan_days').isin(30, 31))
    .groupBy('plan_list_price')
    .count()
    .orderBy(F.desc('count'))
    .limit(5)
)

# Count-weighted average of those five prices, rounded to 2 decimals.
# The alias has to be applied to the aggregated column (inside agg). Placed
# after agg(...) it aliases the DataFrame instead, and the column keeps its
# auto-generated name 'round((sum(weighted_price) / sum(count)), 2)'.
# NOTE(review): presumably this is where the 133.75 NTD used in
# calcular_valores comes from -- confirm.
weighted_avg = (
    top_5_counts
    .withColumn('weighted_price', F.col('plan_list_price') * F.col('count'))
    .agg(
        F.round(F.sum('weighted_price') / F.sum('count'), 2).alias('weighted_avg_price')
    )
    .toPandas()
)

weighted_avg

## Bases Escoradas

In [0]:
# Scored bases for safras 201608 through 201611, collected to pandas so that
# scikit-learn metrics can be applied to them.
modelo_unico = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_unico')
    .filter(F.col('safra').between(201608, 201611))
    .toPandas()
)
modelo_ar0 = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_ar0')
    .filter(F.col('safra').between(201608, 201611))
    .toPandas()
)
modelo_ar1 = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_ar1')
    .filter(F.col('safra').between(201608, 201611))
    .toPandas()
)

In [0]:
# Confusion matrix of the single model (true labels vs. its predictions).
print('Modelo Único')
confusion_matrix(
    y_true=modelo_unico['target'],
    y_pred=modelo_unico['y_pred_lgbm_unico'],
)

In [0]:
# Confusion matrix of the AR0 model (true labels vs. its predictions).
print('Modelo Auto Renew == 0')
confusion_matrix(
    y_true=modelo_ar0['target'],
    y_pred=modelo_ar0['y_pred_lgbm_ar0'],
)

In [0]:
# Confusion matrix of the AR1 model (true labels vs. its predictions).
print('Modelo Auto Renew == 1')
confusion_matrix(
    y_true=modelo_ar1['target'],
    y_pred=modelo_ar1['y_pred_log_ar1'],
)

## Avaliação Financeira - Modelo único

In [0]:
# Flatten the single-model confusion matrix into its four cells.
# NOTE(review): the 4-way unpack assumes a 2x2 matrix, i.e. both classes occur
# in the data -- confirm.
tn, fp, fn, tp = confusion_matrix(
    y_true=modelo_unico['target'],
    y_pred=modelo_unico['y_pred_lgbm_unico'],
).ravel()
tn, fp, fn, tp

In [0]:
# Financial figures for the single model, from the fn/tp counts unpacked above.
inv_proativo, inv_crm, lucro_br, lucro_lq = calcular_valores(fn, tp)
print(
    'Investimento Proativo: $ {:,.2f}'.format(inv_proativo),
    'Investimento: $ {:,.2f}'.format(inv_crm),
    'Lucro Bruto: $ {:,.2f}'.format(lucro_br),
    'Lucro Liquido: $ {:,.2f}'.format(lucro_lq),
    sep='\n',
)

## Avaliação Financeira - Modelo AR0 e AR1

In [0]:
# Cell-wise sum of the AR0 and AR1 confusion matrices, giving combined counts
# for the segmented approach.
# NOTE(review): assumes both matrices are 2x2 (both classes occur in each base)
# -- confirm.
celulas_ar0 = confusion_matrix(modelo_ar0['target'], modelo_ar0['y_pred_lgbm_ar0']).ravel()
celulas_ar1 = confusion_matrix(modelo_ar1['target'], modelo_ar1['y_pred_log_ar1']).ravel()
tn, fp, fn, tp = celulas_ar0 + celulas_ar1
tn, fp, fn, tp

In [0]:
# NOTE(review): scratch arithmetic. The result (69081) is the offset added to
# fn in the next cell. The origin of the two operands is not shown in this
# notebook -- confirm what they are and derive them from the data.
143867-74786

In [0]:
# Offset added to the combined AR0+AR1 false negatives before pricing.
# It is the subtraction evaluated in the scratch cell above, written out so the
# number is no longer a bare literal.
# NOTE(review): presumably these are positives present in the single-model base
# but in neither AR base; verify, and derive both operands from the data
# instead of hard-coding them.
ajuste_fn = 143867 - 74786

inv_proativo, inv_crm, lucro_br, lucro_lq = calcular_valores(fn + ajuste_fn, tp)
print('Investimento Proativo: $ {:,.2f}'.format(inv_proativo))
print('Investimento: $ {:,.2f}'.format(inv_crm))
print('Lucro Bruto: $ {:,.2f}'.format(lucro_br))
print('Lucro Liquido: $ {:,.2f}'.format(lucro_lq))

## Avaliação nos Clusters

In [0]:
# Same scored tables and safra window as the pandas copies, kept as Spark
# DataFrames so they can be joined with the clustering table.
base_score = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_unico')
    .filter(F.col('safra').between(201608, 201611))
)
base_ar0 = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_ar0')
    .filter(F.col('safra').between(201608, 201611))
)
base_ar1 = (
    spark.table('sand_riscos_pm_pf.T789778_base_score_ar1')
    .filter(F.col('safra').between(201608, 201611))
)

In [0]:
# Clustering table restricted to safras 201608 through 201611.
cluster_base = (
    spark.table('sand_riscos_pm_pf.T789778_base_clustering')
    .filter(F.col('safra').between(201608, 201611))
)
cluster_base

In [0]:
# Attach the single-model scores to each clustered customer/safra.
df_clusters = cluster_base.join(base_score, on=['msno', 'safra', 'target'], how='inner')

# Same join for each segmented model; the model-specific column names are
# renamed to a common set so the two frames can be stacked.
df_clusters_ar0 = (cluster_base.join(base_ar0, on=['msno', 'safra', 'target'], how='inner')
                   .withColumnRenamed('score_lgbm_ar0', 'score_ar')
                   .withColumnRenamed('y_pred_lgbm_ar0', 'y_pred')
                   .withColumnRenamed('best_threshold_ar0', 'best_threshold')
                   )
df_clusters_ar1 = (cluster_base.join(base_ar1, on=['msno', 'safra', 'target'], how='inner')
                   .withColumnRenamed('score_log_ar1', 'score_ar')
                   .withColumnRenamed('y_pred_log_ar1', 'y_pred')
                   .withColumnRenamed('best_threshold_ar1', 'best_threshold')
                   )

# union() matches columns by position, so a different column order in the two
# scored tables would silently mix values across columns. unionByName matches
# on the harmonized names and raises if the column sets differ.
df_clusters_ar = df_clusters_ar0.unionByName(df_clusters_ar1)

In [0]:
def _marcar_confusao(df, pred_col):
    '''Add 0/1 columns 'tp', 'fn', 'fp', 'tn' comparing ``pred_col`` with 'target'.'''
    previsto = F.col(pred_col)
    real = F.col('target')
    celulas = [
        ('tp', (previsto == 1) & (real == 1)),
        ('fn', (previsto == 0) & (real == 1)),
        ('fp', (previsto == 1) & (real == 0)),
        ('tn', (previsto == 0) & (real == 0)),
    ]
    # Columns are appended in the order tp, fn, fp, tn.
    for nome, condicao in celulas:
        df = df.withColumn(nome, F.when(condicao, 1).otherwise(0))
    return df


# Single model: predictions are in 'y_pred_lgbm_unico'.
df_clusters = _marcar_confusao(df_clusters, 'y_pred_lgbm_unico')

# Segmented models: predictions were renamed to the common 'y_pred' column.
df_clusters_ar = _marcar_confusao(df_clusters_ar, 'y_pred')

## Clusters Únicos

In [0]:
# Financial figures per 'cluster_kmeans' value, single model.
(
    calcular_valores_por_cluster(df_clusters, 'cluster_kmeans')
    .display()
)

In [0]:
# Financial figures per 'cluster_hdbscan' value, single model.
(
    calcular_valores_por_cluster(df_clusters, 'cluster_hdbscan')
    .display()
)

## Cluster Segmentado

In [0]:
# Financial figures per 'cluster_kmeans' value, segmented (AR0 + AR1) models.
(
    calcular_valores_por_cluster(df_clusters_ar, 'cluster_kmeans')
    .display()
)

In [0]:
# Financial figures per 'cluster_hdbscan' value, segmented (AR0 + AR1) models.
(
    calcular_valores_por_cluster(df_clusters_ar, 'cluster_hdbscan')
    .display()
)