## Imports

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import DataFrame,SparkSession

## Funções

In [0]:
def safra_aggroup(df, col):
    """Plot, for each safra, the percentage share of every value of `col`.

    Rows of the Spark DataFrame `df` are counted per (safra, col) pair, each
    count is divided by the total of its safra and multiplied by 100, and the
    values of `col` are pivoted into columns (absent combinations become 0).
    The result is collected to pandas and drawn as a stacked bar chart.

    Parameters
    ----------
    df : Spark DataFrame
        Must contain a 'safra' column and the column named by `col`.
    col : str
        Name of the column whose per-safra distribution is plotted.

    Returns
    -------
    None
        The chart is rendered through ``plt.show()``.
    """
    # Row counts per (safra, value) and per safra.
    counts = df.groupBy('safra', col).count()
    safra_totals = counts.groupBy('safra').agg(F.sum('count').alias('total_count'))

    # Share of each value inside its safra, in percent.
    share = F.col('count') / F.col('total_count') * 100
    shares = counts.join(safra_totals, 'safra').withColumn('percentage', share)

    # One row per safra, one column per distinct value of `col`.
    wide = (
        shares
        .groupBy('safra')
        .pivot(col)
        .sum('percentage')
        .fillna(0)
        .orderBy('safra')
        .toPandas()
        .set_index('safra')
    )

    ax = wide.plot(kind='bar', stacked=True, figsize=(20, 12))
    ax.set_xlabel('Safra')
    ax.set_ylabel('Percentage')
    ax.set_title(f'{col} por Safra')
    # The legend's lower-center point is anchored at x=1.05 (axes coordinates).
    ax.legend(loc='lower center', bbox_to_anchor=(1.05, 0), ncol=1, title=col)
    plt.show()

In [0]:
def plot_mean_median(df, num_col):
    """Plot the per-safra mean and approximate median of `num_col`.

    Parameters
    ----------
    df : Spark DataFrame
        Must contain a 'safra' column and the column named by `num_col`.
    num_col : str
        Column name; it is interpolated as-is into a Spark SQL
        ``percentile_approx`` expression.

    Returns
    -------
    None
        The line chart is rendered through ``plt.show()``.
    """
    aggregates = [
        F.mean(num_col).alias('mean'),
        # 0.5 quantile computed by Spark's approximate percentile.
        F.expr(f'percentile_approx({num_col}, 0.5)').alias('median'),
    ]
    stats_df = df.groupBy('safra').agg(*aggregates).orderBy('safra').toPandas()

    # Safras are plotted as strings so they are treated as categories.
    safra_labels = [str(s) for s in stats_df['safra']]

    plt.figure(figsize=(14, 8))
    for column, label, marker in (('mean', 'Mean', 'o'), ('median', 'Median', '*')):
        plt.plot(safra_labels, stats_df[column], label=label, marker=marker)

    plt.xlabel('Safra')
    plt.ylabel('Value')
    plt.title(f'Mean and Median of {num_col} over safra')
    plt.legend(['Mean', 'Median'], loc='best')
    plt.xticks(rotation=45)
    plt.show()

## Base

In [0]:
# Load the source table as a Spark DataFrame; the bare name at the end of the
# cell shows its repr.
SPINE_TABLE = "sand_riscos_pm_pf.T789778_spine_target_dm_v3"
base = spark.table(SPINE_TABLE)
base

In [0]:
# Column names passed one by one to safra_aggroup.
lista = [
    'payment_method_id',
    'payment_plan_days',
    'plan_list_price',
    'actual_amount_paid',
    'is_auto_renew',
    'city',
    'gender',
]

In [0]:
# One stacked-share chart per column listed in `lista`, computed on `base`.
for column_name in lista:
    safra_aggroup(base, column_name)

## Definição do Público

In [0]:
# Mean of `target` * 100 per safra, rounded to 2 decimals, ordered by safra.
churn_safra = (
    base
    .groupBy('safra')
    .agg(F.round(F.avg(F.col('target') * 100), 2).alias('churn_rate'))
    .orderBy('safra')
)
churn_safra_df = churn_safra.toPandas()
churn_safra_df

In [0]:
# Bar chart of churn_rate per safra, read from the pandas frame `churn_safra_df`.
fig, ax = plt.subplots(figsize=(14, 8))
safra_labels = [str(s) for s in churn_safra_df['safra']]
bars = ax.bar(safra_labels, churn_safra_df['churn_rate'], color='red')
ax.set_xlabel('Safra')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Taxa de Churn por Safra')
plt.xticks(rotation=45)

# Write each bar's value just above it, centered horizontally.
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:.1f}%', ha='center', va='bottom')

plt.show()

## Público - Filtro

Neste caso, acredito que devemos excluir o público que possui um plano mas não realiza pagamento (actual_amount_paid = 0) ou cujo plano tem duração de 0 dias (payment_plan_days = 0). O filtro equivalente pelo preço de tabela (plan_list_price = 0) está comentado no código abaixo.

In [0]:
# Keep only rows where both the amount paid and the plan duration are non-zero.
# The analogous plan_list_price filter is currently disabled:
#   ~F.col('plan_list_price').isin(0)
base_final = base.filter(
    (~F.col('actual_amount_paid').isin(0))
    & (~F.col('payment_plan_days').isin(0))
)

In [0]:
# Mean of `target` * 100 per safra on the filtered base, rounded to 2 decimals.
churn_safra = (
    base_final
    .groupBy('safra')
    .agg(F.round(F.avg(F.col('target') * 100), 2).alias('churn_rate'))
    .orderBy('safra')
)
churn_safra_df = churn_safra.toPandas()
churn_safra_df

In [0]:
# Bar chart of churn_rate per safra for the filtered population (`base_final`).
# The title names the population so this figure can be told apart from the
# other churn-rate charts in the notebook when skimmed.
fig, ax = plt.subplots(figsize=(14, 8))
safra_labels = [str(s) for s in churn_safra_df['safra']]
bars = ax.bar(safra_labels, churn_safra_df['churn_rate'], color='red')
ax.set_xlabel('Safra')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Taxa de Churn por Safra - Público Filtrado')
plt.xticks(rotation=45)

# Write each bar's value just above it, centered horizontally.
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:.1f}%', ha='center', va='bottom')

plt.show()

## Auto Renew == 0

In [0]:
# Same churn-rate aggregation, restricted to rows with is_auto_renew in (0).
churn_safra = (
    base_final
    .filter(F.col('is_auto_renew').isin(0))
    .groupBy('safra')
    .agg(F.round(F.avg(F.col('target') * 100), 2).alias('churn_rate'))
    .orderBy('safra')
)
churn_safra_df = churn_safra.toPandas()
churn_safra_df

In [0]:
# Bar chart of churn_rate per safra for the filtered population with
# is_auto_renew == 0. The title names the subset so this figure can be told
# apart from the other churn-rate charts in the notebook.
fig, ax = plt.subplots(figsize=(14, 8))
safra_labels = [str(s) for s in churn_safra_df['safra']]
bars = ax.bar(safra_labels, churn_safra_df['churn_rate'], color='red')
ax.set_xlabel('Safra')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Taxa de Churn por Safra - Público Filtrado (is_auto_renew == 0)')
plt.xticks(rotation=45)

# Write each bar's value just above it, centered horizontally.
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:.1f}%', ha='center', va='bottom')

plt.show()

## Auto Renew == 1

In [0]:
# Same churn-rate aggregation, restricted to rows with is_auto_renew in (1).
churn_safra = (
    base_final
    .filter(F.col('is_auto_renew').isin(1))
    .groupBy('safra')
    .agg(F.round(F.avg(F.col('target') * 100), 2).alias('churn_rate'))
    .orderBy('safra')
)
churn_safra_df = churn_safra.toPandas()
churn_safra_df

In [0]:
# Bar chart of churn_rate per safra for the filtered population with
# is_auto_renew == 1. The title names the subset so this figure can be told
# apart from the other churn-rate charts in the notebook.
fig, ax = plt.subplots(figsize=(14, 8))
safra_labels = [str(s) for s in churn_safra_df['safra']]
bars = ax.bar(safra_labels, churn_safra_df['churn_rate'], color='red')
ax.set_xlabel('Safra')
ax.set_ylabel('Churn Rate (%)')
ax.set_title('Taxa de Churn por Safra - Público Filtrado (is_auto_renew == 1)')
plt.xticks(rotation=45)

# Write each bar's value just above it, centered horizontally.
for rect in bars:
    value = rect.get_height()
    ax.text(rect.get_x() + rect.get_width() / 2, value, f'{value:.1f}%', ha='center', va='bottom')

plt.show()

## Análise Inicial

In [0]:
# One stacked-share chart per column listed in `lista`, computed on `base_final`.
for column_name in lista:
    safra_aggroup(base_final, column_name)

In [0]:
# Mean/median-per-safra chart for each num_* column, in the same order as the
# five individual cells this loop replaces.
# NOTE(review): these charts read `base`, while the preceding cell in this
# section uses `base_final` — confirm which population is intended.
for num_col in ['num_25', 'num_50', 'num_75', 'num_985', 'num_100']:
    plot_mean_median(base, num_col)

# EDA Base Final

Criação de Variáveis e Tratamentos

### Geral

In [0]:
# NOTE(review): this rebinds `base`, which earlier in the notebook refers to
# the T789778_spine_target_dm_v3 table; any earlier cell re-run after this one
# will read this table instead.
BASE_FINAL_TABLE = "sand_riscos_pm_pf.T789778_base_final_dm"
base = spark.table(BASE_FINAL_TABLE)
base

In [0]:
# Column names passed to safra_aggroup in this section; each one carries the
# `_1m` suffix. This rebinds the `lista` defined earlier in the notebook.
lista = [
    f'{name}_1m'
    for name in (
        'payment_method_id',
        'payment_plan_days',
        'plan_list_price',
        'actual_amount_paid',
        'is_auto_renew',
        'city',
        'gender',
    )
]

In [0]:
# One stacked-share chart per column listed in `lista`, computed on `base`.
for column_name in lista:
    safra_aggroup(base, column_name)

### Churn

In [0]:
# Subset of `base` whose `target` is in (1); the bare name shows its repr.
target_is_one = F.col('target').isin(1)
churn = base.filter(target_is_one)
churn

In [0]:
# One stacked-share chart per column listed in `lista`, computed on `churn`.
for column_name in lista:
    safra_aggroup(churn, column_name)

O que parece haver é um churn devido à finalização de alguma campanha que acabou em 201601, pois o churn se concentra em um método de pagamento específico.