## Imports

In [0]:
import pandas as pd
import numpy as np

import pyspark.sql.functions as F
from pyspark.sql.window import Window

from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

from pyspark.ml.classification import RandomForestClassifier, GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
import matplotlib.pyplot as plt

In [0]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """
    Pair feature-importance scores with the attribute entries stored in a vector column's metadata.

    Parameters
    ----------
    featureImp : indexable
        Importance scores, looked up as ``featureImp[idx]`` for each attribute's ``idx``.
    dataset : object exposing ``schema[featuresCol].metadata``
        The metadata must contain ``["ml_attr"]["attrs"]``, a mapping from attribute group
        to a list of attribute dicts, each carrying an ``idx`` entry.
    featuresCol : str
        Name of the assembled features column.

    Returns
    -------
    pandas.DataFrame
        One row per attribute (its metadata fields plus a ``score`` column), sorted by
        ``score`` in descending order. When the metadata lists no attributes an empty
        frame is returned instead of failing on the missing ``idx`` column.
    """
    attrs = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]
    # Flatten every attribute group into one list, reading the metadata only once.
    list_extract = [entry for group in attrs for entry in attrs[group]]
    if not list_extract:
        # No attributes: pd.DataFrame([]) has no 'idx' column, so return the empty result directly.
        # NOTE(review): 'name' is assumed to be the usual attribute field next to 'idx'.
        return pd.DataFrame(columns=['idx', 'name', 'score'])
    varlist = pd.DataFrame(list_extract)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return varlist.sort_values('score', ascending=False)


## Base de Variáveis

In [0]:
# Read the spine table and keep only the rows whose `is_auto_renew_1m` value is 0,
# then print the resulting schema.
base_spine = (
    spark.table('sand_riscos_pm_pf.T789778_spine_final_variaveis_dm_v3')
    .where(F.col('is_auto_renew_1m') == 0)
)
base_spine.printSchema()

In [0]:
def _columns_with_prefixes(columns, prefixes):
    """Return the columns starting with any of `prefixes`, grouped in prefix order, each listed once."""
    picked = []
    seen = set()
    for prefix in prefixes:
        for name in columns:
            if name.startswith(prefix) and name not in seen:
                seen.add(name)
                picked.append(name)
    return picked


# Column-name prefixes that identify numeric features.
# 'plan_list' already matches every 'plan_list_price*' column, so without de-duplication those
# columns would be listed twice and end up duplicated in the assembled feature vector.
NUMERIC_PREFIXES = (
    'num_', 'total', 'completion', 'repeat_', 'diversity', 'intensity',
    'plan_list', 'actual_amount', 'payment_plan_days', 'account', 'plan_list_price',
)

# Column-name prefixes that identify categorical features.
CATEGORICAL_PREFIXES = ('payment_method', 'city', 'is_auto_renew', 'age_group', 'gender')

# Read the column list once instead of once per prefix.
_spine_columns = base_spine.columns

num_vars = _columns_with_prefixes(_spine_columns, NUMERIC_PREFIXES)
categorical_columns = _columns_with_prefixes(_spine_columns, CATEGORICAL_PREFIXES)

## Janela de priorização

Como há casos com mais de um registro por usuário em uma mesma safra, devido a cancelamentos anteriores à data em que o plano expira, vamos usar uma Window Function priorizando os casos de churn.

In [0]:
# Window spec over each (msno, safra) pair, ordered so that the highest `target` comes first.
# The notebook's stated intent is to prioritise churn rows; an ascending sort would rank the
# lowest target first instead. Assumes target == 1 marks churn — TODO confirm.
# NOTE: this rebinds the imported `Window` name to the resulting spec; the name is kept
# because the following cell reads `Window`.
Window = Window.partitionBy("msno", "safra").orderBy(F.col("target").desc())

In [0]:
# Number the rows inside each window partition and keep only the first one,
# dropping the helper column afterwards.
ranked_spine = base_spine.withColumn("row_number", F.row_number().over(Window))
base_spine = ranked_spine.where(F.col("row_number") == 1).drop("row_number")

# Treino - Validação - Teste

Usarei como base de treino 201505-201604 (como temos variáveis históricas em um horizonte de 4 meses para trás, começamos com Abril, pois as informações referentes a cada usuário existem desde Janeiro). Será chamado de Período de Treino.

Como Validação, usarei 201605-201607, que será chamado de Out of Time (OOT).

Como Teste, usarei 201608-201611, que será chamado de Through the Door (TTD).

Os dados 201612-201702 não podem ser utilizados, pois não há janela temporal de 3 meses para marcação da target.

In [0]:
# Identifier columns carried along with every split.
ident_vars = ['msno', 'safra']

# Candidate variables: every column except the identifiers, the label and `is_ativo`.
# `list.remove` raises ValueError if one of these columns is missing from the table.
init_vars = base_spine.columns
for excluded in ident_vars + ['target', 'is_ativo']:
    init_vars.remove(excluded)

In [0]:
# Temporal split on `safra`, keeping identifiers, candidate variables and the label.
# NOTE(review): the prose above calls 201605-201607 "Validação" and 201608-201611 "Teste",
# while the variable names below are the other way round — confirm which naming is intended.
split_columns = ident_vars + init_vars + ['target']
safra_col = F.col('safra')

train_data = base_spine.where(safra_col.between(201505, 201604)).select(split_columns)
test_data = base_spine.where(safra_col.between(201605, 201607)).select(split_columns)
validation_data = base_spine.where(safra_col.between(201608, 201611)).select(split_columns)

## Feature Selection por Random Forest

In [0]:
# One StringIndexer per categorical column, writing to "<column>_index";
# handleInvalid='keep' is passed so invalid values do not raise.
indexers = [
    StringIndexer(inputCol=name, outputCol=f"{name}_index", handleInvalid='keep')
    for name in categorical_columns
]

# Assemble the indexed categoricals followed by the numeric variables into "features".
indexed_columns = [f"{name}_index" for name in categorical_columns]
assembler = VectorAssembler(inputCols=indexed_columns + num_vars, outputCol="features")

In [0]:
# Random forest used only to rank features by importance.
# A fixed seed pins the bootstrap/feature sampling, so the importance ranking (and the
# threshold-based selection built on it) does not depend on the default seed of the session.
rf = RandomForestClassifier(featuresCol="features",
                            labelCol="target",
                            maxDepth=15,
                            numTrees=100,
                            maxBins=50,
                            seed=42)

# Index categoricals -> assemble the feature vector -> fit the forest.
pipeline = Pipeline(stages=indexers + [assembler, rf])

model = pipeline.fit(train_data)

# The fitted forest is the last stage of the fitted pipeline.
rf_model = model.stages[-1]

In [0]:
# The pipeline was already fitted on `train_data` in the previous cell; reuse that fitted
# model instead of training the whole forest a second time.
rf_model = model.stages[-1]

In [0]:
# Importance vector of the fitted forest, paired positionally with the assembler's input names.
importances = rf_model.featureImportances
feature_names = [f"{name}_index" for name in categorical_columns] + num_vars
feature_importance = dict(zip(feature_names, importances.toArray()))

# Show the importances from highest to lowest.
sorted_importance = sorted(
    feature_importance.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
for feature, importance in sorted_importance:
    print(f"{feature}: {importance}")

In [0]:
# Keep the features whose importance is strictly above 0.001, preserving the sorted order.
selected_features_rf = []
for feature, importance in sorted_importance:
    if importance > 0.001:
        selected_features_rf.append(feature)

print("Variáveis selecionadas (Feature Importance > 0.001):")
selected_features_rf

In [0]:
# Number of features kept in `selected_features_rf`.
len(selected_features_rf)

## Salvar

In [0]:
# Column groups for the persisted table: identifiers, label, and the assembler's input names
# (indexed categoricals first, then numeric variables).
id_vars = ['msno', 'safra']
target = ['target']
feature_names = [f"{name}_index" for name in categorical_columns] + num_vars

In [0]:
# Preprocessing-only pipeline (indexers + assembler, no classifier), fitted on and applied
# to the whole `base_spine`. Note that this rebinds `pipeline`.
# NOTE(review): the indexers are fitted on every safra, not just the training period — confirm
# that this is intended.
pipeline = Pipeline(stages=[*indexers, assembler])

fitted_preprocessing = pipeline.fit(base_spine)
df_final = fitted_preprocessing.transform(base_spine)

In [0]:
# Display the DataFrame produced by the indexers + assembler pipeline.
df_final

In [0]:
# Persistence step, currently disabled.
# NOTE(review): the "Feature Selection por RFE" section reads
# `sand_riscos_pm_pf.T789778_base_final_dm`, so that table must already exist (presumably
# written by an earlier run of the lines below) for a fresh Run All to succeed — confirm.
# spark.sql('drop table if exists sand_riscos_pm_pf.T789778_base_final_dm')
# df_final.write.mode('overwrite').saveAsTable('sand_riscos_pm_pf.T789778_base_final_dm')
# print('sand_riscos_pm_pf.T789778_base_final_dm')

## Feature Selection por RFE

In [0]:
# Load the persisted feature table. This rebinds `base_spine`, which until here referred
# to the DataFrame built from the spine table.
base_spine = spark.table('sand_riscos_pm_pf.T789778_base_final_dm')
base_spine

In [0]:
# Column groups used by the RFE section: identifiers, label, and feature names
# (indexed categoricals first, then numeric variables).
id_vars = ['msno', 'safra']
target = ['target']
feature_names = [f"{name}_index" for name in categorical_columns] + num_vars

In [0]:
# Temporal split of the reloaded table. Only the training slice is converted to pandas;
# the other two slices stay as Spark DataFrames.
rfe_columns = ident_vars + feature_names + ['target']
safra_col = F.col('safra')

train_data = base_spine.where(safra_col.between(201505, 201604)).select(rfe_columns).toPandas()
test_data = base_spine.where(safra_col.between(201605, 201607)).select(rfe_columns)
validation_data = base_spine.where(safra_col.between(201608, 201611)).select(rfe_columns)

In [0]:
# Feature matrix: the columns listed in `feature_names`.
X_train = train_data[feature_names]
# `target` is a one-element list, so `train_data[target]` would be an (n, 1) DataFrame.
# Select the column by name so scikit-learn receives the 1-D label vector it expects.
y_train = train_data[target[0]]

In [0]:
# `LogisticRegression` is imported at the top of the notebook from both pyspark.ml.classification
# and sklearn.linear_model; which one the bare name refers to depends on import order.
# RFE needs the scikit-learn estimator, so bind it explicitly under an unambiguous alias.
from sklearn.linear_model import LogisticRegression as SkLogisticRegression

model = SkLogisticRegression()
# Recursive feature elimination: drop 2 features per iteration until 100 remain.
rfe = RFE(model, n_features_to_select=100, step=2)
fit = rfe.fit(X_train, y_train)

In [0]:
# Names of the columns kept by RFE: `support_` is the fitted selector's boolean mask,
# applied here to X_train's column index.
selected_features = X_train.columns[fit.support_]
selected_features

In [0]:
# Number of columns kept in `selected_features`.
len(selected_features)