## Imports

In [0]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.window import Window
from pyspark.sql.types import *
from pyspark.sql import DataFrame, SparkSession

from sklearn.metrics import roc_auc_score, precision_score, recall_score, f1_score, accuracy_score, confusion_matrix, roc_curve, auc
import seaborn as sns
from sklearn.metrics import roc_curve, auc, confusion_matrix, classification_report, precision_recall_curve
from scipy.stats import ks_2samp
from scipy import stats

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import roc_auc_score
from hyperopt import fmin, tpe, hp, space_eval, STATUS_OK, Trials
from sklearn.preprocessing import StandardScaler, OneHotEncoder

import shap

## Funções Auxiliares

In [0]:
def plot_roc_curve(model, X_test, y_test):
    """Plot the ROC curve of a fitted binary classifier.

    Args:
        model: Fitted estimator exposing ``predict_proba``; column 1 of its
            output is used as the positive-class score.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True binary labels matching ``X_test``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    curve_label = 'ROC curve (area = %0.2f)' % area_under_curve

    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, color='darkorange', lw=2, label=curve_label)
    # Dashed diagonal: the line TPR == FPR, drawn as a visual baseline.
    plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

def plot_confusion_matrix(model, X_test, y_test, threshold):
    """Draw the confusion matrix of a classifier at a given probability cut-off.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True binary labels matching ``X_test``.
        threshold: Scores greater than or equal to this value are labelled 1,
            all others 0.

    Returns:
        None. The heatmap is rendered with ``plt.show()``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    # A score exactly equal to the threshold counts as a positive prediction.
    predicted_labels = [int(score >= threshold) for score in positive_scores]
    matrix = confusion_matrix(y_test, predicted_labels)

    plt.figure(figsize=(10, 7))
    sns.heatmap(matrix, annot=True, fmt='d', cmap='Blues')
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.title('Confusion Matrix')
    plt.show()

def plot_feature_importances(model, feature_names):
    """Bar-plot a model's ``feature_importances_`` in descending order.

    Args:
        model: Fitted estimator exposing a ``feature_importances_`` array.
        feature_names: Indexable collection of names, positionally aligned
            with ``model.feature_importances_``.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    importances = model.feature_importances_
    # argsort is ascending; reverse it so the largest importance comes first.
    descending_order = np.argsort(importances)[::-1]
    bar_positions = range(len(importances))
    ranked_names = [feature_names[position] for position in descending_order]

    plt.figure(figsize=(12, 6))
    plt.title('Feature Importances')
    plt.bar(bar_positions, importances[descending_order], align='center')
    plt.xticks(bar_positions, ranked_names, rotation=90)
    plt.tight_layout()
    plt.show()

def plot_classification_report(model, X_test, y_test, threshold):
    """Render scikit-learn's classification report as an annotated heatmap.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix passed to ``model.predict_proba``.
        y_test: True binary labels matching ``X_test``.
        threshold: Scores greater than or equal to this value are labelled 1,
            all others 0.

    Returns:
        None. The heatmap is rendered with ``plt.show()``.
    """
    positive_scores = model.predict_proba(X_test)[:, 1]
    predicted_labels = [int(score >= threshold) for score in positive_scores]
    report_dict = classification_report(y_test, predicted_labels, output_dict=True)

    # Drop the last row of the report table (presumably 'support', whose scale
    # differs from the ratio metrics) and transpose so metrics become columns.
    report_table = pd.DataFrame(report_dict).iloc[:-1, :].T

    plt.figure(figsize=(10, 6))
    sns.heatmap(report_table, annot=True, cmap='Blues')
    plt.title('Classification Report')
    plt.show()

In [0]:
def ks_grouped_by_safra(df, target_col, prob_col, safras):
    """Compute the two-sample KS statistic between classes for each safra.

    For every value in ``safras`` (processed in sorted order) the rows of ``df``
    with that ``'safra'`` are split by ``target_col`` (1 vs 0) and the
    distributions of ``prob_col`` in the two groups are compared with
    ``scipy.stats.ks_2samp``.

    Args:
        df: pandas DataFrame containing ``'safra'``, ``target_col`` and
            ``prob_col`` columns.
        target_col: Name of the binary label column (values 1 and 0).
        prob_col: Name of the score column being evaluated.
        safras: Iterable of safra values to evaluate.

    Returns:
        DataFrame with columns ``['safra', 'ks_stat', 'p_value']``, one row per
        safra; empty (but with those columns) when ``safras`` is empty.

    Raises:
        ValueError: Propagated from ``ks_2samp`` when either class has no rows
            for a given safra.
    """
    # Collect plain records and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every iteration.
    records = []
    for safra in sorted(safras):
        df_safra = df[df['safra'] == safra]
        positive_scores = df_safra.loc[df_safra[target_col] == 1, prob_col]
        negative_scores = df_safra.loc[df_safra[target_col] == 0, prob_col]
        ks_stat, p_value = stats.ks_2samp(positive_scores, negative_scores)
        records.append({'safra': safra, 'ks_stat': ks_stat, 'p_value': p_value})
    return pd.DataFrame(records, columns=['safra', 'ks_stat', 'p_value'])

In [0]:
def calculate_churn_percentage(df, target_col, pred_col, pred_col_bin, threshold):
    """Return the observed and predicted churn rates of a scored frame.

    Side effect: writes a 0/1 column named ``pred_col_bin`` into ``df``,
    equal to 1 where ``pred_col >= threshold``.

    Args:
        df: pandas DataFrame holding the label and score columns (mutated).
        target_col: Name of the binary label column.
        pred_col: Name of the score column.
        pred_col_bin: Name of the binary prediction column to create/overwrite.
        threshold: Cut-off applied to ``pred_col`` (inclusive).

    Returns:
        Tuple ``(actual_churn_rate, predicted_churn_rate)``: the means of
        ``target_col`` and of the new binary column.
    """
    # The observed rate is read before the binary column is written.
    observed_rate = df[target_col].mean()
    flagged = df[pred_col].ge(threshold)
    df[pred_col_bin] = flagged.astype(int)
    return observed_rate, df[pred_col_bin].mean()

def calculate_churn_by_safra(df, target_col, pred_col):
    """Average the label and the prediction column within each safra.

    Args:
        df: pandas DataFrame with a ``'safra'`` column.
        target_col: Name of the label column to average.
        pred_col: Name of the prediction column to average.

    Returns:
        DataFrame with columns ``safra``, ``actual_churn_rate`` and
        ``predicted_churn_rate``, one row per safra.
    """
    named_aggregations = {
        'actual_churn_rate': (target_col, 'mean'),
        'predicted_churn_rate': (pred_col, 'mean'),
    }
    per_safra = df.groupby('safra').agg(**named_aggregations)
    return per_safra.reset_index()

In [0]:

def balance_dataset(df, target_col):
    """Undersample every class towards the size of the rarest class.

    Args:
        df: Object exposing the Spark DataFrame methods ``groupBy``/``count``/
            ``collect`` and ``sampleBy``.
        target_col: Name of the class column to stratify on.

    Returns:
        The result of ``df.sampleBy`` using, for each class, the fraction
        ``min_class_count / class_count`` and a fixed seed of 42.
    """
    class_rows = df.groupBy(target_col).count().collect()
    smallest_class = min(class_row['count'] for class_row in class_rows)

    keep_fraction = {}
    for class_row in class_rows:
        # The rarest class gets fraction 1.0; larger classes get less.
        keep_fraction[class_row[target_col]] = smallest_class / class_row['count']

    return df.sampleBy(target_col, keep_fraction, seed=42)

## Base de Modelagem

In [0]:
# Modeling base: safras 201512-201611, restricted to rows with
# is_auto_renew_1m == 1, down-sampled to roughly 25% of the rows.
base_spine = (
    spark.table("sand_riscos_pm_pf.T789778_base_final_dm_v3")
    .filter(F.col('safra').between(201512, 201611))
    .filter(F.col('is_auto_renew_1m').isin(1))
    # A fixed seed makes the 25% sample (and everything derived from it)
    # reproducible across notebook runs.
    .sample(fraction=0.25, seed=42)
    .drop('features', 'is_auto_renew_1m_index')
)
base_spine

In [0]:
# base_spine = balance_dataset(base_inicial, 'target')
# base_spine

## Variáveis

In [0]:
# Identifier columns carried through the pipeline but never used as features.
id_vars = ["msno", "safra"]
# Label column, kept as a one-element list so it can be concatenated with the
# other column lists.
target = ["target"]

In [0]:
## 0.003

# Numeric features (standardised later in the notebook).
variaveis_num = [
    'payment_plan_days_1m',
    'actual_amount_paid_max_2m',
    'account_time_1m',
    'actual_amount_paid_avg_4m',
    'actual_amount_paid_median_4m',
    'actual_amount_paid_min_3m',
    'actual_amount_paid_max_3m',
    'actual_amount_paid_avg_2m',
    'actual_amount_paid_1m',
    'actual_amount_paid_avg_3m',
]

# Categorical features (one-hot encoded later in the notebook).
variaveis_cat = [
    'payment_method_id_1m_index',
    'city_1m_index',
    'age_group_1m_index',
    'gender_1m_index',
]

In [0]:
# Sanity check: how many numeric and categorical features were selected.
len(variaveis_num), len(variaveis_cat)

## Base Spine em Pandas

In [0]:
# Collect ids, features and target into a pandas DataFrame.
# NOTE(review): toPandas() materialises every selected row on the driver.
base_spine_pd = base_spine.select(id_vars+variaveis_num+variaveis_cat+target).toPandas()

## One Hot Encoding  - Categóricas

In [0]:
# One-hot encoder configured to return a dense array.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older releases.
try:
    encoder = OneHotEncoder(sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(sparse=False)

In [0]:
# Fit the encoder on the categorical columns and wrap the result in a
# DataFrame named after the generated one-hot features.
encoded_columns = encoder.fit_transform(base_spine_pd[variaveis_cat])
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=encoder.get_feature_names_out(variaveis_cat),
    # Carry the source index explicitly: the column-wise pd.concat that joins
    # this frame back to base_spine_pd aligns rows on index labels.
    index=base_spine_pd.index,
)

In [0]:
# Recode the one-hot indicators from {0, 1} to {-1, 1}.
# Reassigning instead of using inplace=True keeps the original array-backed
# frame untouched and follows the pandas recommendation to avoid inplace ops.
encoded_df = encoded_df.replace(0, -1)
encoded_df

In [0]:
# Append the one-hot columns and discard the original categorical columns.
base_spine_encoded_pd = (
    pd.concat([base_spine_pd, encoded_df], axis=1)
    .drop(columns=variaveis_cat)
)
base_spine_encoded_pd

## Normalização

In [0]:
# Standardiser for the numeric features; it is fitted in a later cell.
scaler = StandardScaler()

In [0]:
# Split the encoded frame into three column groups:
#   df_id_target - identifiers and label, passed through untouched
#   df_scale     - numeric features to be standardised
#   df_keep      - everything else (the one-hot columns), passed through
_id_and_target_cols = id_vars + target
df_id_target = base_spine_encoded_pd[_id_and_target_cols]
df_scale = base_spine_encoded_pd[variaveis_num]
df_keep = base_spine_encoded_pd.drop(columns=variaveis_num + _id_and_target_cols)

In [0]:
# Standardise the numeric features and restore column names and row index.
# NOTE(review): the scaler is fitted on the full frame, before the
# train/test/validation split performed later - confirm this is intended.
df_scaled_columns = scaler.fit_transform(df_scale)
df_scale_final = pd.DataFrame(
    df_scaled_columns,
    columns=variaveis_num,
    index=base_spine_encoded_pd.index,
)

In [0]:
# Reassemble the frame: ids/target, then scaled numerics, then one-hot columns.
_norm_parts = [df_id_target, df_scale_final, df_keep]
base_spine_norm = pd.concat(_norm_parts, axis=1)
base_spine_norm

In [0]:
# Convert the normalised pandas frame back to Spark.
# Note: this rebinds `base_spine`, replacing the DataFrame loaded from the table.
base_spine = spark.createDataFrame(base_spine_norm)
base_spine

In [0]:
# Rename every column whose name ends in ".0", replacing each ".0" in the name
# with "_0" (presumably one-hot columns built from float-valued categories).
for column_name in base_spine.columns:
    if not column_name.endswith(".0"):
        continue
    base_spine = base_spine.withColumnRenamed(
        column_name,
        column_name.replace(".0", "_0"),
    )

In [0]:
# Undersample the classes of `target` with balance_dataset.
# NOTE(review): this runs before the safra-based split, so the test and
# validation periods are balanced as well - confirm this is intended.
base_spine = balance_dataset(base_spine, 'target')
base_spine

## Variáveis do Modelo

In [0]:
# Model features: every column that is neither an identifier nor the target.
variaveis_rf = [
    column_name
    for column_name in base_spine.columns
    if column_name not in (*id_vars, *target)
]

## Salvar

In [0]:
# spark.sql('drop table if exists sand_riscos_pm_pf.t789778_base_spine_norm_log_v3')
# base_spine.write.mode('overwrite').saveAsTable("sand_riscos_pm_pf.t789778_base_spine_norm_log_v3")
# print("sand_riscos_pm_pf.t789778_base_spine_norm_log_v3")

## Treino - Teste - Validação

In [0]:
# Out-of-time split by safra:
#   train      - 201512..201604, excluding 201601
#   test       - 201605..201607
#   validation - 201608..201611
_split_columns = id_vars + variaveis_rf + target

train_data = (
    base_spine
    .filter(~F.col('safra').isin(201601))
    .filter(F.col('safra').between(201512, 201604))
    .select(_split_columns)
    .toPandas()
)
test_data = (
    base_spine
    .filter(F.col('safra').between(201605, 201607))
    .select(_split_columns)
    .toPandas()
)
validation_data = (
    base_spine
    .filter(F.col('safra').between(201608, 201611))
    .select(_split_columns)
    .toPandas()
)

In [0]:
# `target` is a one-element list, so `frame[target]` would give an (n, 1)
# DataFrame; scikit-learn expects a 1-D label vector and emits a
# DataConversionWarning on every fit otherwise. Select the single target
# column as a Series instead.
X_train, y_train = train_data[variaveis_rf], train_data[target[0]]
X_test, y_test = test_data[variaveis_rf], test_data[target[0]]
X_val, y_val = validation_data[variaveis_rf], validation_data[target[0]]

# Regressão Logística

## Otimização de Hiperparâmetros

In [0]:
def objective(params):
    """Hyperopt objective for tuning a logistic regression.

    Args:
        params: Keyword arguments forwarded to ``LogisticRegression``.

    Returns:
        Dict with ``'loss'`` set to the negated mean ROC AUC over a 5-fold
        cross-validation on ``X_train``/``y_train`` (hyperopt minimises, so a
        higher AUC yields a lower loss) and ``'status'`` set to ``STATUS_OK``.
    """
    estimator = LogisticRegression(**params)
    fold_scores = cross_val_score(estimator, X_train, y_train, scoring='roc_auc', cv=5)
    mean_auc = fold_scores.mean()
    return {'loss': -mean_auc, 'status': STATUS_OK}

In [0]:
# def objective(params):
#     clf = LogisticRegression(**params)
#     score = cross_val_score(clf, X_train, y_train, scoring='roc_auc', cv=5).mean()
#     return {'loss': -score, 'status': STATUS_OK}

In [0]:
# Hyperopt search space for LogisticRegression.
# - C: hp.loguniform draws exp(u) with u ~ Uniform(-4, 4), i.e. roughly
#   0.018 to 54.6.
# - solver / max_iter / penalty: hp.choice picks one option from the list;
#   fmin() reports the chosen option by its index, not by its value.
space = {
    'C': hp.loguniform('C', -4, 4),
    'solver': hp.choice('solver', ['liblinear', 'saga']),
    'max_iter': hp.choice('max_iter', [100, 200, 300]),
    # 'class_weight': hp.choice('class_weight', ['balanced'])
    'penalty': hp.choice('penalty', ['l1', 'l2'])
}

In [0]:
# Container that records every evaluation performed by fmin().
trials = Trials()

In [0]:
# Run 50 TPE evaluations of `objective` over `space`, logging them in `trials`.
# NOTE(review): no `rstate` is passed, so the search is presumably not
# reproducible between runs - confirm whether a fixed seed is wanted.
best_params = fmin(fn=objective, 
                   space=space, 
                   algo=tpe.suggest, 
                   max_evals=50, 
                   trials=trials)

In [0]:
# fmin() reports hp.choice parameters as indices into their option lists.
# space_eval() maps them back to real values using the search space itself, so
# the option lists no longer have to be duplicated here.
# Reading the raw result from `trials.argmin` (not from `best_params`) keeps
# this cell idempotent: re-running it no longer tries to index a list with an
# already-decoded string.
best_params = space_eval(space, trials.argmin)

In [0]:
# Show the decoded hyperparameters chosen by the search.
print("Melhores hiperparâmetros para Regressão Logística:", best_params, sep="\n")

In [0]:
# {'C': 0.001, 'max_iter': 1000, 'penalty': 'l1', 'solver': 'liblinear'}

In [0]:
# Refit a logistic regression on the full training split using the selected
# hyperparameters.
final_log_model = LogisticRegression(**best_params)
final_log_model.fit(X_train, y_train)

## Escorar Bases

In [0]:
# Positive-class probability (column 1 of predict_proba) for each split.
train_data_escorado, test_data_escorado, val_data_escorado = (
    final_log_model.predict_proba(split_features)[:, 1]
    for split_features in (X_train, X_test, X_val)
)

In [0]:
# Store the scores next to ids and target so later cells can slice by safra.
train_data['score_log'] = train_data_escorado
test_data['score_log'] = test_data_escorado
validation_data['score_log'] = val_data_escorado

In [0]:
# ROC AUC of the logistic-regression score on each split.
auc_train, auc_test, auc_val = (
    roc_auc_score(split_frame['target'], split_scores)
    for split_frame, split_scores in (
        (train_data, train_data_escorado),
        (test_data, test_data_escorado),
        (validation_data, val_data_escorado),
    )
)

In [0]:
# Report the three AUC values, one per line.
print(
    f"AUC Train: {auc_train:.4f}",
    f"AUC Test: {auc_test:.4f}",
    f"AUC Validation: {auc_val:.4f}",
    sep="\n",
)

## Análises

In [0]:
# Overlay the score distributions of the three splits to spot drift.
plt.hist(train_data['score_log'], bins=100, alpha=0.5, label='Train')
plt.hist(test_data['score_log'], bins=100, alpha=0.5, label='Test')
plt.hist(validation_data['score_log'], bins=100, alpha=0.5, label='Validation')
# Title corrected: these scores come from the logistic regression model
# (column 'score_log'), not from LightGBM.
plt.title('Score Regressão Logística')
plt.xlabel('Score')
plt.ylabel('Frequência')
plt.legend()
# Explicit show() so the cell renders the figure instead of the Legend repr.
plt.show()

In [0]:
# Score distributions per split, restricted to rows with target == 0.
plt.hist(train_data[train_data['target']==0]['score_log'], bins=100, alpha=0.5, label='Train')
plt.hist(test_data[test_data['target']==0]['score_log'], bins=100, alpha=0.5, label='Test')
plt.hist(validation_data[validation_data['target']==0]['score_log'], bins=100, alpha=0.5, label='Validation')
# Title corrected: these scores come from the logistic regression model
# (column 'score_log'), not from LightGBM.
plt.title('Score Regressão Logística Target 0')
plt.xlabel('Score')
plt.ylabel('Frequência')
plt.legend()
# Explicit show() so the cell renders the figure instead of the Legend repr.
plt.show()

In [0]:
# Score distributions per split, restricted to rows with target == 1.
plt.hist(train_data[train_data['target']==1]['score_log'], bins=100, alpha=0.5, label='Train')
plt.hist(test_data[test_data['target']==1]['score_log'], bins=100, alpha=0.5, label='Test')
plt.hist(validation_data[validation_data['target']==1]['score_log'], bins=100, alpha=0.5, label='Validation')
# Title corrected: these scores come from the logistic regression model
# (column 'score_log'), not from LightGBM.
plt.title('Score Regressão Logística Target 1')
plt.xlabel('Score')
plt.ylabel('Frequência')
plt.legend()
# Explicit show() so the cell renders the figure instead of the Legend repr.
plt.show()

In [0]:
# Sweep decision thresholds over [0, 1] in steps of 0.005 and record the F1
# score obtained on the test split for each one.
# linspace gives an exact grid that stops at 1.0; the previous arange ran up to
# about 1.045, where every prediction is negative and F1 is undefined.
# NOTE(review): the threshold is tuned on the test split - confirm intended.
thresholds = np.linspace(0.0, 1.0, 201)
f1_scores = [
    f1_score(
        test_data['target'],
        (test_data['score_log'] >= threshold).astype(int),
        zero_division=0,  # all-negative predictions score 0 without a warning
    )
    for threshold in thresholds
]

# argmax returns the first maximum, matching a strict ">" update rule.
best_index = int(np.argmax(f1_scores))
best_threshold = float(thresholds[best_index])
best_f1_score = f1_scores[best_index]

# Three decimals: the grid step is 0.005, so two decimals misreport the value.
print(f"Melhor Threshold: {best_threshold:.3f}")
print(f"Melhor F1-Score: {best_f1_score:.4f}")

In [0]:
# Pair each threshold with its F1 score for inspection.
thresholds_f1_scores = list(zip(thresholds, f1_scores))
thresholds_f1_scores

In [0]:
# Scatter of F1 score against decision threshold, with the maximum annotated.
plt.figure(figsize=(10, 6))
plt.scatter(thresholds, f1_scores)
plt.xlabel('Thresholds')
plt.ylabel('F1-Scores')
plt.title('F1-Score por Threshold')

# First position of the maximum F1 score.
max_f1_score = max(f1_scores)
max_f1_index = f1_scores.index(max_f1_score)
max_f1_threshold = thresholds[max_f1_index]

# The threshold is shown with three decimals because the grid step is 0.005;
# two decimals would round it to a value that was never evaluated.
plt.annotate(f'Max F1-Score: {max_f1_score:.4f}\n(x, y) = ({max_f1_threshold:.3f}, {max_f1_score:.4f})',
             xy=(max_f1_threshold, max_f1_score),
             xytext=(max_f1_threshold, max_f1_score + 0.09),
             arrowprops=dict(facecolor='black', shrink=0.05))

plt.show()

## Rodar

In [0]:
# Override the F1-optimal threshold computed earlier with a fixed 0.5 cut-off;
# every statistic and plot from here on uses this value.
best_threshold = 0.5

In [0]:
# Precision-recall trade-off of the score on the test split.
y_pred_proba = test_data['score_log']
precision, recall, _ = precision_recall_curve(
    test_data['target'],
    y_pred_proba,
)

plt.figure(figsize=(10, 6))
plt.plot(recall, precision, marker='o')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.title('Recall vs Precision')
plt.show()

In [0]:
def calculate_model_statistics(y_true, y_pred_proba, threshold):
    """Compute AUC, precision, recall, F1 and KS for a scored binary target.

    Args:
        y_true: Array-like of true binary labels (1 = positive, 0 = negative).
        y_pred_proba: Array-like of scores, positionally aligned with
            ``y_true``.
        threshold: Scores greater than or equal to this value are classified
            as positive for the precision/recall/F1 metrics.

    Returns:
        Dict with keys ``'AUC'``, ``'Precision'``, ``'Recall'``, ``'F1-Score'``
        and ``'KS'`` (the ``ks_2samp`` statistic between the scores of the
        positive and negative classes).
    """
    # Work on flat NumPy arrays so masking is positional: this accepts lists,
    # arrays and Series alike and does not depend on pandas index alignment.
    labels = np.asarray(y_true).ravel()
    scores = np.asarray(y_pred_proba).ravel()
    predictions = (scores >= threshold).astype(int)

    # The result dict is named `metrics` so it does not hide the `stats`
    # module imported from scipy at the top of the notebook.
    metrics = {
        'AUC': roc_auc_score(labels, scores),
        'Precision': precision_score(labels, predictions),
        'Recall': recall_score(labels, predictions),
        'F1-Score': f1_score(labels, predictions),
        'KS': ks_2samp(scores[labels == 1], scores[labels == 0]).statistic,
    }
    return metrics

In [0]:
# Metrics on the training split at the chosen threshold.
calculate_model_statistics(train_data['target'], train_data['score_log'], best_threshold)

In [0]:
# Metrics on the test split at the chosen threshold.
calculate_model_statistics(test_data['target'], test_data['score_log'], best_threshold)

In [0]:
# Metrics on the validation split at the chosen threshold.
calculate_model_statistics(validation_data['target'], validation_data['score_log'], best_threshold)

In [0]:
# Side-by-side table of the metrics for the three splits, shown as percentages.
train_stats = calculate_model_statistics(train_data['target'], train_data['score_log'], best_threshold)
test_stats = calculate_model_statistics(test_data['target'], test_data['score_log'], best_threshold)
val_stats = calculate_model_statistics(validation_data['target'], validation_data['score_log'], best_threshold)

train_stats_df = pd.DataFrame([train_stats]).assign(Period='Train')
test_stats_df = pd.DataFrame([test_stats]).assign(Period='Test')
val_stats_df = pd.DataFrame([val_stats]).assign(Period='Validation')

combined_stats = pd.concat([train_stats_df, test_stats_df, val_stats_df], ignore_index=True)
metric_columns = [col for col in combined_stats.columns if col != 'Period']
# Put the split label first, followed by the metrics.
combined_stats = combined_stats[['Period'] + metric_columns]
# Replace each metric column by its formatted-string version. Whole-column
# replacement avoids DataFrame.applymap (deprecated in pandas 2.1) and the
# incompatible-dtype warning raised when strings are written into float
# columns through .iloc.
combined_stats = combined_stats.assign(
    **{name: combined_stats[name].map("{:.2%}".format) for name in metric_columns}
)
display(combined_stats)

In [0]:
# Diagnostic plots on the test split.
plot_roc_curve(final_log_model, X_test, y_test)
plot_confusion_matrix(final_log_model, X_test, y_test, threshold=best_threshold)
# Disabled: plot_feature_importances reads `feature_importances_`; presumably
# the logistic model lacks it, hence the coefficient-based plot further down.
# plot_feature_importances(final_log_model, X_test.columns)
plot_classification_report(final_log_model, X_test, y_test, threshold=best_threshold)

In [0]:
# Diagnostic plots on the validation split.
plot_roc_curve(final_log_model, X_val, y_val)
plot_confusion_matrix(final_log_model, X_val, y_val, threshold=best_threshold)
# Disabled: plot_feature_importances reads `feature_importances_`; presumably
# the logistic model lacks it, hence the coefficient-based plot further down.
# plot_feature_importances(final_log_model, X_val.columns)
plot_classification_report(final_log_model, X_val, y_val, threshold=best_threshold)

In [0]:
# Horizontal bar chart of the fitted coefficients (signed), largest on top.
features = X_train.columns
feature_importances = final_log_model.coef_[0]

importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .sort_values(by='Importance', ascending=False)
)

plt.figure(figsize=(15, 12))
plt.barh(importance_df['Feature'], importance_df['Importance'])
plt.title('Feature Importance - Logistic Regression')
plt.xlabel('Importance')
plt.ylabel('Feature')
# barh draws the first row at the bottom; flip so the largest value is on top.
plt.gca().invert_yaxis()
plt.show()

In [0]:
# Table of features whose coefficient is non-zero, sorted from largest to
# smallest coefficient.
features = X_train.columns
feature_importances = final_log_model.coef_[0]

importance_df = (
    pd.DataFrame({'Feature': features, 'Importance': feature_importances})
    .loc[lambda frame: frame['Importance'] != 0]
    .sort_values(by='Importance', ascending=False)
)
importance_df

In [0]:
# Names of the features with a non-zero coefficient, as a plain list.
importance_df['Feature'].tolist()

In [0]:
# KS statistic per safra, computed separately for each split and then
# stacked in chronological split order (train, test, validation).
s1, s2, s3 = (
    split_frame['safra'].unique()
    for split_frame in (train_data, test_data, validation_data)
)

ks_s1, ks_s2, ks_s3 = (
    ks_grouped_by_safra(split_frame, 'target', 'score_log', sorted(split_safras))
    for split_frame, split_safras in (
        (train_data, s1),
        (test_data, s2),
        (validation_data, s3),
    )
)

ks_union = pd.concat([ks_s1, ks_s2, ks_s3])

# KS is plotted in percentage points; safras are cast to str so the x axis is
# categorical.
plt.figure(figsize=(10, 6))
plt.plot(ks_union['safra'].astype(str), ks_union['ks_stat']*100, marker='o')
# plt.axvline(x=str(sorted(s2)[0]), color='r', linestyle='--')
# plt.axvline(x=str(sorted(s3)[0]), color='g', linestyle='--')
plt.ylim(0, 70)
plt.title('KS Statistic by Safra')
plt.xlabel('Safra')
plt.ylabel('KS Statistic')
plt.xticks(rotation=45)
plt.show()

In [0]:
# Re-attach the scores and derive the binary prediction for each split.
train_data['score_log'] = train_data_escorado
test_data['score_log'] = test_data_escorado
validation_data['score_log'] = val_data_escorado

# Use ">=" so a score equal to the threshold is a positive prediction, the
# same rule applied by every other thresholding step in this notebook
# (plot helpers, calculate_churn_percentage, calculate_model_statistics).
train_data['y_pred_log'] = (train_data['score_log'] >= best_threshold).astype(int)
test_data['y_pred_log'] = (test_data['score_log'] >= best_threshold).astype(int)
validation_data['y_pred_log'] = (validation_data['score_log'] >= best_threshold).astype(int)

In [0]:
# Actual vs predicted churn rate per split. calculate_churn_percentage also
# writes the thresholded prediction into a 'y_pred' column of each frame,
# which the per-safra aggregation further down reads.
train_churn_actual, train_churn_predicted = calculate_churn_percentage(train_data, 'target', 'score_log', 'y_pred', best_threshold)
test_churn_actual, test_churn_predicted = calculate_churn_percentage(test_data, 'target', 'score_log', 'y_pred', best_threshold)
val_churn_actual, val_churn_predicted = calculate_churn_percentage(validation_data, 'target', 'score_log', 'y_pred', best_threshold)

churn_stats = pd.DataFrame({
    'Period': ['Train', 'Test', 'Validation'],
    'Actual Churn Rate': [train_churn_actual, test_churn_actual, val_churn_actual],
    'Predicted Churn Rate': [train_churn_predicted, test_churn_predicted, val_churn_predicted]
})

# Format the rate columns as percentage strings by replacing whole columns.
# This avoids DataFrame.applymap (deprecated in pandas 2.1) and the
# incompatible-dtype warning raised when strings are written into float
# columns through .iloc.
rate_columns = ['Actual Churn Rate', 'Predicted Churn Rate']
churn_stats = churn_stats.assign(
    **{name: churn_stats[name].map("{:.2%}".format) for name in rate_columns}
)
display(churn_stats)

# Same comparison, broken down by safra within each split.
train_churn_by_safra = calculate_churn_by_safra(train_data, 'target', 'y_pred')
test_churn_by_safra = calculate_churn_by_safra(test_data, 'target', 'y_pred')
val_churn_by_safra = calculate_churn_by_safra(validation_data, 'target', 'y_pred')

churn_by_safra = pd.concat([
    train_churn_by_safra.assign(Period='Train'),
    test_churn_by_safra.assign(Period='Test'),
    val_churn_by_safra.assign(Period='Validation')
], ignore_index=True)

# Convert the rates from fractions to percentage points for plotting.
churn_by_safra['actual_churn_rate'] *= 100
churn_by_safra['predicted_churn_rate'] *= 100

plt.figure(figsize=(12, 6))
# NOTE(review): `ci=None` is deprecated in newer seaborn releases in favour of
# `errorbar=None`; kept as-is for compatibility with older versions.
sns.barplot(data=churn_by_safra.melt(id_vars=['safra', 'Period'], value_vars=['actual_churn_rate', 'predicted_churn_rate']),
            x='safra', y='value', hue='variable', ci=None)
plt.title('Churn Rates by Safra')
plt.xlabel('Safra')
plt.ylabel('Churn Rate (%)')
plt.legend(title='Churn Rate Type', labels=['Actual Churn Rate', 'Predicted Churn Rate'])
plt.xticks(rotation=45)
plt.show()