# Description
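Train a LightGBM (GBM) classifier for `default_flag` on Spark: preprocess the features, run a random hyperparameter search, then apply sequential feature elimination.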

# Setup

In [8]:
import pandas as pd
import plotly.express as px
import re
import numpy as np
from IPython.display import Image, clear_output
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
import joblib
import shap

import findspark
import pyspark
from pyspark.sql import SparkSession
import pyspark
from pyspark.sql import DataFrame
import pyspark.sql.functions as sql
from pyspark.sql.window import Window
from pyspark.sql.functions import *
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder, Imputer

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.regression import GeneralizedLinearRegression
from pyspark.ml.classification import GBTClassifier, RandomForestClassifier
from pyspark.ml.functions import vector_to_array
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, TrainValidationSplit
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from Utils import addWoeFromSavedDF, spark, output_path, input_path, pysparkGiniPerGroups
import random
from tqdm import tqdm

# Data

In [2]:
# Load the WoE train/test parquet files from `output_path`.
train_data, test_data = (
    spark.read.parquet(f"{output_path}train_df_woe.parquet"),
    spark.read.parquet(f"{output_path}test_df_woe.parquet"),
)

                                                                                

# Analysis

In [3]:
from synapse.ml.lightgbm import LightGBMClassifier

# Numeric model inputs (mean-imputed below).
numeric_list = [
    'loan_amnt', 'funded_amnt', 'funded_amnt_inv', 'int_rate',
    'installment', 'annual_inc', 'dti', 'delinq_2yrs',
    'fico_range_low', 'fico_range_high', 'inq_last_6mths', 'open_acc',
    'pub_rec', 'revol_bal', 'revol_util', 'total_acc',
]

# Categorical model inputs (string-indexed below).
categ_list = [
    'term', 'grade', 'sub_grade', 'home_ownership', 'verification_status',
    'purpose', 'zip_code', 'addr_state', 'initial_list_status',
]

target_col = "default_flag"

# Derived column names: "<col>_idx" for the indexed categoricals, "<col>_dummy" kept for later use.
categ_idx_list = [f"{name}_idx" for name in categ_list]
categ_dummy_list = [f"{name}_dummy" for name in categ_list]

# Preprocessing stages: mean imputation written back onto the numeric columns,
# then label indexing of the categoricals into the "_idx" columns.
imp = Imputer(
    inputCols=numeric_list,
    outputCols=numeric_list,
    strategy='mean',
)
stridx = StringIndexer(
    inputCols=categ_list,
    outputCols=categ_idx_list,
    handleInvalid="keep",
)
data_pipeline = Pipeline(stages=[imp, stridx])

# Fit the preprocessing on the training data only, then apply it to both splits.
data_transformer = data_pipeline.fit(train_data)
train_data_treated = data_transformer.transform(train_data)
test_data_treated = data_transformer.transform(test_data)

25/06/19 10:03:06 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

## Hyperparameters

In [4]:
# train_data_treated\
# .write\
# .mode("overwrite")\
# .save(f"{output_path}train_data_treated.parquet")

# test_data_treated\
# .write\
# .mode("overwrite")\
# .save(f"{output_path}test_data_treated.parquet")

In [4]:
# Reload the preprocessed splits from disk. This expects the parquet files to
# already exist (the cell that writes them is commented out above).
train_data_treated, test_data_treated = (
    spark.read.parquet(f"{output_path}train_data_treated.parquet"),
    spark.read.parquet(f"{output_path}test_data_treated.parquet"),
)

In [None]:
# def withTrainTestColumn(df: DataFrame, frac: float):
#     w1 = Window.orderBy(F.rand(seed=42))
#     result = df\
#             .withColumn("rand_rank", F.rank().over(w1) / lit(df.count()))\
#             .withColumn("train_test_split", when(F.col("rand_rank") <= F.lit(frac), "train").otherwise("test"))\
#             .drop("rand_rank")
#     return result

# train_split = train_data_treated\
# .sample(0.1, seed=42)\
# .transform(withTrainTestColumn, 0.6)

# def withStringToCateg(df: pd.DataFrame):
#     dtypes_dict = df.dtypes.to_dict()
#     astype_dict = dict()
#     for col_i, type_i in dtypes_dict.items():
#         if type_i == "object":
#             astype_dict[col_i] = "category"
#     return df.astype(astype_dict)

# train_ = train_split.where(F.col("train_test_split")=="train").toPandas().pipe(withStringToCateg)
# test_  = train_split.where(F.col("train_test_split")=="test").toPandas().pipe(withStringToCateg)

# Xtrain = train_[numeric_list + categ_list]
# ytrain = train_[target_col]

# Xtest = test_[numeric_list + categ_list]
# ytest = test_[target_col]


# import lightgbm as lgbm
# from sklearn.model_selection import ParameterSampler

# param_grid = {
#     "boosting" :["gbdt", "rf"],
#     "learning_rate" : [0.01, 0.05, 0.1],
#     "max_depth" : [-1, 10, 15],
#     "num_leaves" : [10, 31, 100],
#     "feature_fraction" : [0.2, 0.5, 1.0],
#     "min_data_in_leaf" : [1, 20, 30, 40],
#     "min_sum_hessian_in_leaf" : [0.001],
#     "min_gain_to_split" : [0.0],
#     "num_iterations" : [100],
#     "lambda_l1" : [0.0, 0.01, 0.05],
#     "lambda_l2" : [0.0, 0.01, 0.05],
#     "bagging_fraction" : [0.6, 0.8, 1.0],
#     "bin_sample_count" : [100, 500, 2000, 200000],
#     "bagging_freq" : [0],
# }
                 
# param_list = ParameterSampler(param_grid, 10, random_state=42)
# results = list()
# for i, param_i in tqdm(enumerate(param_list), total=len(param_list)):
#     model = lgbm.LGBMClassifier(random_state=42, verbosity=-1)
#     model = model.set_params(**param_i)
#     model.fit(Xtrain, ytrain, 
#               eval_metric='auc',
#               eval_set=[(Xtrain, ytrain),
#                         (Xtest, ytest)],
#               eval_names=['train', 'test'])
#     result_dict = {
#         'model_number' : i,
#         'model' : model,
#         'params' : model.get_params(),
#         'auc_train' : model.best_score_['train']['auc'],
#         'auc_test' : model.best_score_['test']['auc'],
#         'gini_train' : model.best_score_['train']['auc']*2-1,
#         'gini_test' : model.best_score_['test']['auc']*2-1
#     }
#     results.append(result_dict)

# result_df = pd.DataFrame(results)
# result_df['overfit'] = np.abs(result_df['gini_train'] - result_df['gini_test'])
# result_df['score'] = result_df['gini_test'] - result_df['overfit']
# result_df

In [16]:
class hyperParametersOpt:
    """Random-search hyperparameter optimisation for a Spark ML classifier.

    ``fit`` samples ``n_samples`` parameter combinations from ``param_grid``
    (scikit-learn ``ParameterSampler``), fits ``VectorAssembler + model`` on the
    "train" part of an internal train/test split, scores every candidate by Gini
    on both parts and keeps the candidate maximising
    ``test_gini - |train_gini - test_gini|``. The winner is refitted on the full
    input frame.

    Parameters
    ----------
    model : Spark ML estimator exposing ``setParams`` and ``getProbabilityCol``.
        Note that ``fit`` calls ``setParams`` on this very object.
    features : list of input column names passed to the ``VectorAssembler``.
    param_grid : dict mapping parameter name -> list of candidate values.
    output_path : prefix under which intermediate parquet files are written.
    n_samples : number of parameter combinations to try.
    train_test_split_frac : share of the sampled rows labelled "train".
    train_data_sample_frac : fraction of the input rows used for the search.
    batch_counter_cache : number of candidates whose predictions are unioned
        before being appended to the predictions parquet.
    random_state : seed for row sampling, the split and the parameter sampler.

    Attributes set by ``fit``: ``train_split``, ``train_``, ``result_models``,
    ``model_pipeline``, ``pred_df``, ``gini_df``, ``best_params``,
    ``best_estimator``.
    """

    def __init__(self,
                 model,
                 features: list,
                 param_grid: dict,
                 output_path: str,
                 n_samples: int = 50,
                 train_test_split_frac: float = 0.5,
                 train_data_sample_frac: float = 1.0,
                 batch_counter_cache: int = 20,
                 random_state: int = 42):
        self.model = model
        self.param_grid = param_grid
        self.n_samples = n_samples
        self.output_path = output_path
        self.train_data_sample_frac = train_data_sample_frac
        self.random_state = random_state
        self.train_test_split_frac = train_test_split_frac
        self.features = features
        self.batch_counter_cache = batch_counter_cache
        # Defined here (and refreshed in fit) so getTrainTestSplit can be used on its own.
        self.output_data_paths = self._buildOutputDataPaths()

    def _buildOutputDataPaths(self) -> dict:
        """Return the parquet locations (prefixed by ``output_path``) used by the search."""
        return {
            'train_data_sampled': f"{self.output_path}train_splited.parquet",
            'pred_data': f"{self.output_path}lgbm_hp_opt_preds.parquet",
            'score_data': f"{self.output_path}lgbm_hp_opt_scores.parquet",
        }

    def withTrainTestColumn(self, df: DataFrame, frac: float):
        """Add a ``train_test_split`` column ("train"/"test") to ``df``.

        Rows are ranked by a seeded random number; rows whose rank divided by
        the row count is <= ``frac`` are labelled "train", the rest "test".
        Calls ``df.count()``, so it triggers a Spark job.
        """
        w1 = Window.orderBy(F.rand(seed=self.random_state))
        is_train = F.col("rand_rank") <= F.lit(frac)

        result = (
            df
            .withColumn("rand_rank", F.rank().over(w1) / F.lit(df.count()))
            .withColumn("train_test_split", F.when(is_train, "train").otherwise("test"))
            .drop("rand_rank")
        )
        return result

    def getTrainTestSplit(self, df: DataFrame):
        """Sample ``df``, label the split, persist it and load it back.

        Sets ``self.train_split`` (all sampled rows, read back from parquet) and
        ``self.train_`` (the "train" rows, repartitioned to 4 and cached).
        """
        split_path = self.output_data_paths['train_data_sampled']

        (
            df
            .sample(self.train_data_sample_frac, seed=self.random_state)
            .transform(self.withTrainTestColumn, self.train_test_split_frac)
            .write
            .mode("overwrite")
            # Explicit parquet format: the file is read back with spark.read.parquet.
            .parquet(split_path)
        )

        self.train_split = spark.read.parquet(split_path)
        self.train_ = (
            self.train_split
            .where(F.col("train_test_split") == "train")
            .repartition(4)
            .cache()
        )
        clear_output()

    def pysparkGiniPerGroups(self, df: DataFrame, group_list: list, pred_col: str, target_col: str, weights: str = None):
        """
        Calculate AUC and Gini (``2 * auc - 1``) per group from a pyspark dataframe.

        The frame is first reduced to one row per (groups, prediction, target)
        with a ``count`` column (row count, or the sum of ``weights`` when
        given); the AUC is then computed per group in pandas with ``count`` as
        the sample weight.

        Returns a Spark DataFrame with the ``group_list`` columns plus ``auc``
        and ``gini``.
        """

        def aucFromDf(pdf: pd.DataFrame, weight_col: str = None):
            """Weighted ROC AUC of one pandas group."""
            from sklearn.metrics import roc_auc_score
            sample_weight = None if weight_col is None else pdf[weight_col]
            return roc_auc_score(pdf[target_col], pdf[pred_col], sample_weight=sample_weight)

        def aucPerGroups(pdf: pd.DataFrame):
            """AUC per group for the pandas frame handed over by applyInPandas."""
            # Selecting the value columns explicitly keeps the grouping columns out of
            # the applied frame without relying on the `include_groups` keyword.
            return (
                pdf.groupby(group_list)[[pred_col, target_col, 'count']]
                .apply(aucFromDf, 'count')
                .rename('auc')
                .reset_index()
            )

        if weights is None:
            agg_column = F.count(F.lit(1))
        else:
            agg_column = F.sum(F.col(weights))
        df_grouped = (
            df
            .groupBy(group_list + [pred_col, target_col])
            .agg(agg_column.alias('count'))
        )

        # Output schema: the group columns with their Spark types, followed by the AUC.
        schema_fields = [
            f'{col_i} {df_grouped.schema[col_i].dataType.simpleString()}'
            for col_i in group_list
        ]
        schema = ', '.join(schema_fields + ['auc double'])

        auc_df = df_grouped.groupBy(group_list).applyInPandas(aucPerGroups, schema=schema)
        return auc_df.withColumn('gini', F.col('auc') * 2 - 1)

    def fit(self, df: DataFrame):
        """Run the random search on ``df`` and refit the best candidate.

        Returns ``self``; see the class docstring for the attributes populated.
        """
        from functools import reduce
        from sklearn.model_selection import ParameterSampler

        # Refresh in case output_path was changed after construction.
        self.output_data_paths = self._buildOutputDataPaths()
        pred_path = self.output_data_paths['pred_data']
        score_path = self.output_data_paths['score_data']

        self.getTrainTestSplit(df)
        param_list = ParameterSampler(self.param_grid, self.n_samples, random_state=self.random_state)
        n_candidates = len(param_list)
        self.result_models = dict()

        vectoriser = VectorAssembler(inputCols=self.features, outputCol="features")
        self.model_pipeline = Pipeline(stages=[vectoriser, self.model])

        # Predictions waiting to be appended to the predictions parquet.
        pending_preds = []
        for i, param_i in tqdm(enumerate(param_list), total=n_candidates):
            self.model_pipeline.getStages()[-1].setParams(**param_i)
            lgbm_model = self.model_pipeline.fit(self.train_)
            self.result_models[i] = {'model': lgbm_model, 'params': param_i}
            prob_col = lgbm_model.stages[-1].getProbabilityCol()
            # Element 1 of the probability vector, rounded to 2 decimals.
            pred_df_temp = (
                lgbm_model.transform(self.train_split)
                .withColumn("pred", F.round(F.get(vector_to_array(F.col(prob_col)), 1), 2))
                .select(
                    "train_test_split",
                    F.lit(i).alias("model"),
                    "default_flag",
                    "pred",
                )
            )

            if i == 0:
                # The first candidate (re)creates the predictions file.
                pred_df_temp.write.mode("overwrite").parquet(pred_path)
            else:
                pending_preds.append(pred_df_temp)
                batch_is_full = len(pending_preds) >= self.batch_counter_cache
                is_last = i >= n_candidates - 1
                if batch_is_full or is_last:
                    (
                        reduce(DataFrame.unionByName, pending_preds)
                        .write.mode("append").parquet(pred_path)
                    )
                    pending_preds = []
            clear_output()

        self.pred_df = spark.read.parquet(pred_path)
        whole_frame = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)
        (
            self.pysparkGiniPerGroups(self.pred_df, ['train_test_split', 'model'], "pred", "default_flag")
            .groupBy("model")
            .pivot("train_test_split")
            .agg(F.sum("gini"))
            # Test Gini penalised by the train/test gap.
            .withColumn("score", F.col("test") - F.abs(F.col("train") - F.col("test")))
            .withColumn("max_score", F.max("score").over(whole_frame))
            .write.mode("overwrite").parquet(score_path)
        )

        self.gini_df = spark.read.parquet(score_path)

        # First model among those reaching the maximum score.
        best_param_i = (
            self.gini_df
            .where(F.col("score") == F.col("max_score"))
            .select(F.collect_list("model"))
            .first()[0][0]
        )

        self.best_params = self.result_models[best_param_i]['params']
        self.model_pipeline.getStages()[-1].setParams(**self.best_params)

        self.best_estimator = self.model_pipeline.fit(df)
        return self

In [17]:
# Base estimator; the search below overrides the parameters listed in `param_grid`.
Slgbm = LightGBMClassifier(
    objective="binary",
    featuresCol="features",
    labelCol=target_col,
    seed=42,
    verbosity=-1,
    deterministic=True
)

# Candidate values per parameter; combinations are sampled at random by hyperParametersOpt.
# (The duplicated "learningRate" entry was removed: a dict literal keeps only the last one.)
param_grid = {
    "boostingType" : ["gbdt", "rf"],
    "learningRate" : [0.01, 0.05, 0.1],
    "maxDepth" : [5, 10, 15],
    "numLeaves" : [10, 31, 100],
    "featureFraction" : [0.2, 0.5, 1.0],
    "minDataInLeaf" : [1, 20, 30, 40],
    "minSumHessianInLeaf" : [0.001],
    "minGainToSplit" : [0.0],
    "numIterations" : [100],
    "lambdaL1" : [0.0, 0.01, 0.05],
    "lambdaL2" : [0.0, 0.01, 0.05],
    "baggingFraction" : [0.6, 0.8, 0.99],
    "binSampleCount" : [100, 500, 2000],
    "baggingFreq" : [1, 5],
    "isUnbalance": [True, False],
}

# 150 sampled candidates, searched on a 10% sample split 60/40 into train/test.
hpopt = hyperParametersOpt(model= Slgbm,
                            features= numeric_list + categ_idx_list,
                            param_grid=param_grid,
                            output_path=output_path,
                            n_samples=150,
                            train_data_sample_frac=0.1,
                            train_test_split_frac=0.6,
                            batch_counter_cache=50,
                            random_state=42)

hpopt = hpopt.fit(train_data_treated)

100%|██████████| 150/150 [24:38<00:00,  9.85s/it]


25/06/19 10:48:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 10:48:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 10:48:11 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 10:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 10:48:14 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 10:48:15 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 

[LightGBM] [Info] Loaded reference dataset: 25 features, 809048 num_data


                                                                                

In [18]:
# Number of prediction rows stored per candidate model, largest first.
(
    hpopt.pred_df
    .groupBy("model")
    .agg(F.count(F.lit(1)).alias("count"))
    .orderBy(F.col("count").desc())
    .show()
)



+-----+-----+
|model|count|
+-----+-----+
|  133|80665|
|  108|80665|
|  122|80665|
|   86|80665|
|    3|80665|
|  139|80665|
|  117|80665|
|  127|80665|
|   72|80665|
|   35|80665|
|  114|80665|
|    4|80665|
|   23|80665|
|  129|80665|
|   97|80665|
|   82|80665|
|   25|80665|
|   73|80665|
|   70|80665|
|   29|80665|
+-----+-----+
only showing top 20 rows



                                                                                

In [19]:
# Candidates ranked by score (test Gini minus the train/test gap), best first.
hpopt.gini_df.orderBy(F.desc("score")).show()

+-----+-------------------+-------------------+-------------------+------------------+
|model|               test|              train|              score|         max_score|
+-----+-------------------+-------------------+-------------------+------------------+
|  121| 0.4122366207910697| 0.4131492041443332| 0.4113240374378062|0.4113240374378062|
|   93|0.41182844630749615|0.41346472793109657|0.41019216468389574|0.4113240374378062|
|   54| 0.4115911521247102| 0.4135748530018071|0.40960745124761333|0.4113240374378062|
|   77| 0.4095775635766059|0.40928614829901955|0.40928614829901955|0.4113240374378062|
|   40|  0.422899993507754| 0.4365706370919127| 0.4092293499235953|0.4113240374378062|
|   13| 0.4124014440971193| 0.4160153169257286|0.40878757126851006|0.4113240374378062|
|   84|0.40976417988884783|0.41090726789638543|0.40862109188131024|0.4113240374378062|
|   65|0.42270489131544764| 0.4368208581316002|0.40858892449929507|0.4113240374378062|
|   68|  0.422636553804969|0.43673070392863

In [20]:
# First `test` Gini collected after sorting by ascending score.
# NOTE(review): collect_list after orderBy presumably has no guaranteed element
# order in Spark, so the value shown may not be the lowest-score row — confirm intent.
hpopt.gini_df.orderBy(F.col("score")).select(collect_list("test")).first()[0][0]

0.4045607039774095

In [23]:
# Fitted pipeline and parameters of candidate 121 (top-scoring row in the table above).
# The index is hard-coded; a different run may rank another candidate first.
hpopt.result_models[121]

{'model': PipelineModel_87b5cb549a7d,
 'params': {'numLeaves': 10,
  'numIterations': 100,
  'minSumHessianInLeaf': 0.001,
  'minGainToSplit': 0.0,
  'minDataInLeaf': 30,
  'maxDepth': 15,
  'learningRate': 0.01,
  'lambdaL2': 0.0,
  'lambdaL1': 0.0,
  'isUnbalance': False,
  'featureFraction': 0.2,
  'boostingType': 'gbdt',
  'binSampleCount': 500,
  'baggingFreq': 1,
  'baggingFraction': 0.6}}

In [22]:
# Parameters of the best-scoring candidate selected by hpopt.fit.
hpopt.best_params

{'numLeaves': 10,
 'numIterations': 100,
 'minSumHessianInLeaf': 0.001,
 'minGainToSplit': 0.0,
 'minDataInLeaf': 30,
 'maxDepth': 15,
 'learningRate': 0.01,
 'lambdaL2': 0.0,
 'lambdaL1': 0.0,
 'isUnbalance': False,
 'featureFraction': 0.2,
 'boostingType': 'gbdt',
 'binSampleCount': 500,
 'baggingFreq': 1,
 'baggingFraction': 0.6}

In [24]:
# Refit a fresh classifier with the selected parameters on the search's train
# split and recompute train/test Gini.
Slgbm = LightGBMClassifier(
    objective="binary",
    featuresCol="features",
    labelCol=target_col,
    seed=42,
    verbosity=-1,
    deterministic=True
)
Slgbm.setParams(**hpopt.best_params)
prob_col = Slgbm.getProbabilityCol()

vectoriser = VectorAssembler(inputCols=numeric_list + categ_idx_list, outputCol="features")
model_pipeline = Pipeline(stages=[vectoriser, Slgbm])
lgbm_models = model_pipeline.fit(hpopt.train_)

# Score the whole sampled split; "pred" is element 1 of the probability vector, rounded to 2 decimals.
pred_df = (
    lgbm_models.transform(hpopt.train_split)
    .withColumn("pred", F.round(F.get(vector_to_array(F.col(prob_col)), 1), 2))
    .select(
        "train_test_split",
        F.lit(1).alias("model"),
        "default_flag",
        "pred",
    )
)

# One row per model with the train and test Gini side by side.
(
    hpopt.pysparkGiniPerGroups(
        pred_df,
        ['train_test_split', 'model'],
        "pred",
        "default_flag",
    )
    .groupBy("model")
    .pivot("train_test_split")
    .agg(F.sum("gini"))
    .show()
)

[LightGBM] [Info] Saving data reference to binary buffer
[LightGBM] [Info] Loaded reference dataset: 25 features, 48399 num_data


[Stage 1257:>                                                       (0 + 1) / 1]

+-----+------------------+------------------+
|model|              test|             train|
+-----+------------------+------------------+
|    1|0.4122366207910697|0.4131492041443332|
+-----+------------------+------------------+



                                                                                

## Sequential Feature Selection

In [77]:
class sequentialFeatureElimination:
    """Backward sequential feature elimination for a Spark ML classifier.

    Round 0 fits the model on all ``features``. Every later round refits the
    model once per remaining feature with that feature left out, and drops the
    feature whose removal gives the highest test Gini. Rounds continue while at
    least two features remain.

    Parameters
    ----------
    model : Spark ML estimator exposing ``copy``, ``setParams`` and
        ``getProbabilityCol``; a copy is configured for every fit.
    features : list of candidate input column names.
    params : dict of model parameters applied via ``setParams``.
    output_path : prefix under which intermediate parquet files are written.
    train_data_sample_frac : fraction of the input rows used.
    train_test_split_frac : share of the sampled rows labelled "train".
    random_state : seed for row sampling and the split.

    Attributes set by ``fit``: ``train_split``, ``train_``, ``gini_df`` (Gini
    per round / excluded column / split) and ``result_df`` (one row per round).

    NOTE(review): the split is written to the same ``train_splited.parquet``
    file name that ``hyperParametersOpt`` uses, so with the same
    ``output_path`` fitting one overwrites the other's split on disk.
    """

    def __init__(self,
                 model,
                 features: list,
                 params: dict,
                 output_path: str = None,
                 train_data_sample_frac: float = 0.1,
                 train_test_split_frac: float = 0.5,
                 random_state: int = 42,
                 ):
        self.model = model
        self.prob_col = model.getProbabilityCol()
        self.features = features
        self.train_data_sample_frac = train_data_sample_frac
        self.train_test_split_frac = train_test_split_frac
        self.random_state = random_state
        self.output_path = output_path
        self.params = params
        # Defined here (and refreshed in fit) so getTrainTestSplit can be used on its own.
        self.output_data_paths = self._buildOutputDataPaths()

    def _buildOutputDataPaths(self) -> dict:
        """Return the parquet locations (prefixed by ``output_path``) used by this class."""
        return {
            'train_data_sampled': f"{self.output_path}train_splited.parquet",
            'pred_data': f"{self.output_path}lgbm_hp_opt_preds.parquet",
            'score_data': f"{self.output_path}lgbm_hp_opt_scores.parquet",
            'gini_data': f"{self.output_path}lgbm_sfe_opt_gini.parquet",
        }

    def withTrainTestColumn(self, df: DataFrame, frac: float):
        """Add a ``train_test_split`` column ("train"/"test") to ``df``.

        Rows are ranked by a seeded random number; rows whose rank divided by
        the row count is <= ``frac`` are labelled "train", the rest "test".
        Calls ``df.count()``, so it triggers a Spark job.
        """
        w1 = Window.orderBy(F.rand(seed=self.random_state))
        is_train = F.col("rand_rank") <= F.lit(frac)

        result = (
            df
            .withColumn("rand_rank", F.rank().over(w1) / F.lit(df.count()))
            .withColumn("train_test_split", F.when(is_train, "train").otherwise("test"))
            .drop("rand_rank")
        )
        return result

    def getTrainTestSplit(self, df: DataFrame):
        """Sample ``df``, label the split, persist it and load it back.

        Sets ``self.train_split`` (all sampled rows, read back from parquet) and
        ``self.train_`` (the "train" rows, repartitioned to 4 and cached).
        """
        split_path = self.output_data_paths['train_data_sampled']

        (
            df
            .sample(self.train_data_sample_frac, seed=self.random_state)
            .transform(self.withTrainTestColumn, self.train_test_split_frac)
            .write
            .mode("overwrite")
            # Explicit parquet format: the file is read back with spark.read.parquet.
            .parquet(split_path)
        )

        self.train_split = spark.read.parquet(split_path)
        self.train_ = (
            self.train_split
            .where(F.col("train_test_split") == "train")
            .repartition(4)
            .cache()
        )
        clear_output()

    def pysparkGiniPerGroups(self, df: DataFrame, group_list: list, pred_col: str, target_col: str, weights: str = None):
        """
        Calculate AUC and Gini (``2 * auc - 1``) per group from a pyspark dataframe.

        The frame is first reduced to one row per (groups, prediction, target)
        with a ``count`` column (row count, or the sum of ``weights`` when
        given); the AUC is then computed per group in pandas with ``count`` as
        the sample weight.

        Returns a Spark DataFrame with the ``group_list`` columns plus ``auc``
        and ``gini``.
        """

        def aucFromDf(pdf: pd.DataFrame, weight_col: str = None):
            """Weighted ROC AUC of one pandas group."""
            from sklearn.metrics import roc_auc_score
            sample_weight = None if weight_col is None else pdf[weight_col]
            return roc_auc_score(pdf[target_col], pdf[pred_col], sample_weight=sample_weight)

        def aucPerGroups(pdf: pd.DataFrame):
            """AUC per group for the pandas frame handed over by applyInPandas."""
            # Selecting the value columns explicitly keeps the grouping columns out of
            # the applied frame without relying on the `include_groups` keyword.
            return (
                pdf.groupby(group_list)[[pred_col, target_col, 'count']]
                .apply(aucFromDf, 'count')
                .rename('auc')
                .reset_index()
            )

        if weights is None:
            agg_column = F.count(F.lit(1))
        else:
            agg_column = F.sum(F.col(weights))
        df_grouped = (
            df
            .groupBy(group_list + [pred_col, target_col])
            .agg(agg_column.alias('count'))
        )

        # Output schema: the group columns with their Spark types, followed by the AUC.
        schema_fields = [
            f'{col_i} {df_grouped.schema[col_i].dataType.simpleString()}'
            for col_i in group_list
        ]
        schema = ', '.join(schema_fields + ['auc double'])

        auc_df = df_grouped.groupBy(group_list).applyInPandas(aucPerGroups, schema=schema)
        return auc_df.withColumn('gini', F.col('auc') * 2 - 1)

    def withPredictions(self, df: DataFrame, fitted_pipe):
        """Score ``df`` with ``fitted_pipe`` and add ``pred``: element 1 of the
        probability vector, rounded to 2 decimals."""
        scored = fitted_pipe.transform(df)
        return scored.withColumn(
            "pred",
            F.round(F.get(vector_to_array(F.col(self.prob_col)), 1), 2),
        )

    def getModelPipeline(self, features):
        """Build ``self.model_pipeline`` (assembler + a configured copy of the model)
        for the given feature columns."""
        self.vectoriser = VectorAssembler(inputCols=features, outputCol="features")
        # Work on a copy so the estimator passed by the caller is left untouched.
        model = self.model.copy()
        model.setParams(**self.params)
        self.model_pipeline = Pipeline(stages=[self.vectoriser, model])

    def getSelectedCols(self):
        """Return the top-Gini row(s) of ``self.gini_df`` per round and split.

        Requires ``fit`` to have populated ``self.gini_df``.
        NOTE(review): this looks unfinished — the "gini_" column recomputes the
        same rank over the already filtered rows; confirm the intended logic.
        """
        w1 = Window.partitionBy("round", "train_test_split").orderBy(F.col("gini").desc())

        return (
            self.gini_df
            .withColumn("rank", F.rank().over(w1))
            .where(F.col("rank") == 1)
            .withColumn("gini_", F.rank().over(w1))
        )

    def fit(self, df: DataFrame):
        """Run the elimination rounds on ``df``.

        Sets ``self.gini_df`` and ``self.result_df`` (columns: round,
        excluded_col, included_cols, auc, gini) and returns ``self``.
        """
        from functools import reduce

        # Refresh in case output_path was changed after construction.
        self.output_data_paths = self._buildOutputDataPaths()
        # Gini results live under this instance's output_path, like the other outputs.
        gini_path = self.output_data_paths['gini_data']
        group_cols = ['train_test_split', 'excluded_col']

        self.getTrainTestSplit(df)

        # Round 0: baseline with every feature included.
        self.getModelPipeline(self.features)
        lgbm_models = self.model_pipeline.fit(self.train_)
        baseline_pred = (
            self.withPredictions(self.train_split, lgbm_models)
            .select("train_test_split",
                    F.lit("None").alias("excluded_col"),
                    "default_flag",
                    "pred")
        )
        (
            self.pysparkGiniPerGroups(baseline_pred, group_cols, "pred", "default_flag")
            .withColumn("round", F.lit(0))
            .write.mode("overwrite").parquet(gini_path)
        )
        self.gini_df = spark.read.parquet(gini_path)

        # The baseline frame holds one row per split; report the test row so round 0
        # is comparable with later rounds (fall back to any row if no test row exists).
        baseline_rows = self.gini_df.collect()
        test_rows = [row for row in baseline_rows if row['train_test_split'] == "test"]
        best_performance = (test_rows or baseline_rows)[0]

        excluded_cols_list = [
            {
                'round' : 0,
                'excluded_col' : "None",
                'included_cols' : self.features,
                'auc' : best_performance['auc'],
                'gini' : best_performance['gini'],
            }
        ]

        round_i = 1
        input_cols_list = self.features.copy()

        while len(input_cols_list) >= 2:
            # One fit per candidate, each with that candidate left out.
            round_preds = []
            for col_i in tqdm(input_cols_list):
                cols_in = [i for i in input_cols_list if i != col_i]
                print(f"round {round_i}")
                self.getModelPipeline(cols_in)
                lgbm_models = self.model_pipeline.fit(self.train_)
                pred_df = (
                    self.withPredictions(self.train_split, lgbm_models)
                    .select("train_test_split",
                            F.lit(col_i).alias("excluded_col"),
                            "default_flag",
                            "pred")
                )
                round_preds.append(pred_df)
                clear_output()

            pred_df_result = reduce(DataFrame.unionByName, round_preds)

            (
                self.pysparkGiniPerGroups(pred_df_result, group_cols, "pred", "default_flag")
                .withColumn("round", F.lit(round_i))
                .write.mode("append").parquet(gini_path)
            )
            self.gini_df = spark.read.parquet(gini_path)

            # Drop the feature whose exclusion gives the highest test Gini this round.
            best_performance = (
                self.gini_df
                .where((F.col("round") == round_i)
                       & (F.col("train_test_split") == "test"))
                .orderBy(F.col("gini").desc())
                .collect()[0]
            )

            excluded_col = best_performance['excluded_col']
            input_cols_list = [i for i in input_cols_list if i != excluded_col]

            print(f"removed: {excluded_col}")
            excluded_cols_list.append(
                {
                    'round' : round_i,
                    'excluded_col' : excluded_col,
                    'included_cols' : input_cols_list,
                    'auc' : best_performance['auc'],
                    'gini' : best_performance['gini'],
                }
            )
            round_i = round_i + 1
        self.result_df = spark.createDataFrame(excluded_cols_list)
        return self


In [None]:
# Base estimator for the elimination; sequentialFeatureElimination applies
# `params` to a copy of it for every fit.
Slgbm = LightGBMClassifier(
    objective="binary",
    featuresCol="features",
    labelCol=target_col,
    seed=42,
    verbosity=-1,
    deterministic=True
)

# Fixed parameters (same values as the `hpopt.best_params` output shown earlier).
params = {
    'numLeaves': 10,
    'numIterations': 100,
    'minSumHessianInLeaf': 0.001,
    'minGainToSplit': 0.0,
    'minDataInLeaf': 30,
    'maxDepth': 15,
    'learningRate': 0.01,
    'lambdaL2': 0.0,
    'lambdaL1': 0.0,
    'isUnbalance': False,
    'featureFraction': 0.2,
    'boostingType': 'gbdt',
    'binSampleCount': 500,
    'baggingFreq': 1,
    'baggingFraction': 0.6,
}

# Start from every numeric and indexed categorical column.
features = numeric_list + categ_idx_list

sfe = sequentialFeatureElimination(
    model=Slgbm,
    features=features,
    params=params,
    output_path=output_path,
    train_data_sample_frac=0.1,
    train_test_split_frac=0.5,
    random_state=42,
)

sfe.fit(train_data_treated)

100%|██████████| 24/24 [03:19<00:00,  8.32s/it]
25/06/19 20:29:02 WARN DAGScheduler: Broadcasting large task binary with size 3.9 MiB
ERROR:root:KeyboardInterrupt while sending command.               (43 + 4) / 72]
Traceback (most recent call last):
  File "/home/neon/Documents/lending-club-analysis/lending-club-analysis/.venv/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/neon/Documents/lending-club-analysis/lending-club-analysis/.venv/lib/python3.10/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 



In [84]:
# Window specs used below (w1 is defined but not used in this cell).
w1 = Window.partitionBy("round", "train_test_split").orderBy(F.desc("gini"))
w2 = Window.orderBy(F.desc("round"))                       # rounds from last to first
w3 = Window.orderBy(F.desc("round")).rowsBetween(-4, 0)    # current row plus the 4 rows before it
w4 = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)  # whole frame

threshold = 0.0001

# gini_change: Gini of a round minus the Gini of the row before it in descending
#   round order (the round's own Gini when there is no such row).
# gini_change_next: sum of gini_change over the w3 frame.
# perf_aux: highest round whose gini_change_next is <= threshold.
previous_gini = F.lag(F.col("gini"), 1).over(w2)
within_threshold = F.col("gini_change_next") <= threshold

df_result = (
    sfe.result_df
    .withColumn("gini_change", F.coalesce(F.col("gini") - previous_gini, F.col("gini")))
    .withColumn("gini_change_next", F.sum("gini_change").over(w3))
    .withColumn("perf_aux", F.max(F.when(within_threshold, F.col("round"))).over(w4))
    .toPandas()
)

# Features still included after round 10 (round number hard-coded).
df_result.loc[df_result['round'] == 10, 'included_cols'].iloc[0]

25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 20:14:23 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
25/06/19 2

array(['int_rate', 'installment', 'dti', 'inq_last_6mths', 'revol_bal',
       'total_acc'], dtype=object)

In [89]:
# Gini recorded for each elimination round.
scatter_kwargs = dict(x='round', y='gini', width=700, template='none')
fig = px.scatter(df_result, **scatter_kwargs)
fig