In [1]:
import os
import sys
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))
%load_ext autoreload
%autoreload 2
from src.config import get_spark
from src.utils import spark_utils
from pyspark.ml.feature import VectorAssembler, StringIndexer
import pyspark.sql.functions as F
from pyspark.ml import Pipeline
from sklearn.model_selection import TimeSeriesSplit
from pyspark.ml.classification import GBTClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
import random
import time

In [2]:
spark = get_spark('FraudDetectionModeling')

In [3]:
train = spark.read.parquet("../data_processed/train_cleaned.parquet").orderBy('index')
test = spark.read.parquet("../data_processed/test_cleaned.parquet").orderBy('index')

In [4]:
train.show(5)

+----+--------+-------+--------------+---------------------+-----------------+------------------+------------------+------------------+---------------------+------------------------------+-----+
|step|    type|isFraud|isFlaggedFraud|newbalanceOrig_isZero|       amount_log| oldbalanceOrg_log|oldbalanceDest_log|newbalanceDest_log|nameOrig__X__nameDest|nameOrig__X__nameDest__X__type|index|
+----+--------+-------+--------------+---------------------+-----------------+------------------+------------------+------------------+---------------------+------------------------------+-----+
|   1| PAYMENT|      0|             0|                    0|9.194276028581655| 12.04435927383651|               0.0|               0.0|                  C_M|                   C_M_PAYMENT|    0|
|   1| PAYMENT|      0|             0|                    0|7.531166454857185| 9.964112174352563|               0.0|               0.0|                  C_M|                   C_M_PAYMENT|    1|
|   1|TRANSFER|      1|  

In [5]:
feature_cols = spark_utils.get_except_cols(train, ['isFraud', 'isFlaggedFraud', 'step', 'index'])
cat_cols = spark_utils.get_cat_cols(train, feature_cols)
num_cols = [c for c in feature_cols if c not in cat_cols]
train = train.withColumn('label', F.col('isFraud').cast('double'))
test = test.withColumn('label', F.col('isFraud').cast('double'))
# weight
pos_count = train.filter(F.col('isFraud') == 1).count()
neg_count = train.filter(F.col('isFraud') == 0).count()
train = (
    train
    .withColumn(
        'weight',
        F.when(
            F.col('isFraud')==1,
            neg_count/pos_count
        )
        .otherwise(1)
    )
)
train.cache()

DataFrame[step: int, type: string, isFraud: int, isFlaggedFraud: int, newbalanceOrig_isZero: int, amount_log: double, oldbalanceOrg_log: double, oldbalanceDest_log: double, newbalanceDest_log: double, nameOrig__X__nameDest: string, nameOrig__X__nameDest__X__type: string, index: int, label: double, weight: double]

In [6]:
tscv = TimeSeriesSplit(n_splits=5)

In [7]:
indexer = [
    StringIndexer(
        inputCol=c,
        outputCol=f'{c}_idx',
        handleInvalid='keep'
    )
    for  c in cat_cols
]
feature_cols_idx = [f'{c}_idx' for c in cat_cols] + num_cols

In [8]:
assembler = VectorAssembler(
    inputCols=feature_cols_idx,
    outputCol="features"
)

In [9]:
# 基准模型
gbt = GBTClassifier(
    featuresCol='features',
    labelCol='label',
    weightCol='weight',
    maxDepth=5,
    maxIter=50,
    stepSize=0.05,
    subsamplingRate=0.8,
    maxBins=64,
    minInstancesPerNode=10,
    seed=42
)

In [10]:
evaluator = BinaryClassificationEvaluator(
    labelCol='label',
    rawPredictionCol='rawPrediction',
    metricName='areaUnderPR'
)

In [11]:
pipe = Pipeline(stages = indexer + [assembler, gbt])

In [12]:
# Hold_out方法测试基准模型的泛化能力（训练集中的80%数据用于训练，20%数据用于验证）
score, model = spark_utils.hold_out(train, pipe, evaluator)
print(f'PR-AUC score: {score:.2f}%')

PR-AUC score: 87.14%


In [None]:
# 随机搜索
n_iter = 5
random.seed(42)

maxDepth = [3, 4, 5, 6, 8]
maxIter = [40, 60, 80, 100]
stepSize = [0.03, 0.05, 0.07, 0.1]
subsamplingRate = [0.6, 0.7, 0.8, 0.9]

best_result_1 = None

for i in range(1, n_iter+1):
    
    t0 = time.time()
    
    param_map = {
        gbt.maxDepth: random.choice(maxDepth),
        gbt.maxIter: random.choice(maxIter),
        gbt.stepSize: random.choice(stepSize),
        gbt.subsamplingRate: random.choice(subsamplingRate)
    }
    readable_map = {k.name: v for k, v in param_map.items()}
    print(f'iteration {i}: {readable_map}')
    
    pipe_copy = pipe.copy(param_map)
    
    score, model = spark_utils.hold_out(train, pipe_copy, evaluator)
    print(f'score: {score:.2f}%')
    
    if best_result_1 is None or score > best_result_1['score']:
        best_result_1 = {
            'model': model,
            'params': param_map,
            'score': score,
            'iteration': i,
            'readable_map': readable_map
        }
    
    t1 = time.time()
    elapsed = t1 - t0
    print(f'  Time used: {int(elapsed//60)} min {elapsed%60:.2f} s')


print(f"\nBest score: {best_result_1['score']:.2f}%")
print(f"Best model: iteration {best_result_1['iteration']}")
print(f"Best params: {best_result_1['readable_map']}")

iteration 1: {'maxDepth': 3, 'maxIter': 40, 'stepSize': 0.07, 'subsamplingRate': 0.7}
score: 83.25
  Time used: 2 min 30.44 s
iteration 2: {'maxDepth': 4, 'maxIter': 60, 'stepSize': 0.03, 'subsamplingRate': 0.6}
score: 83.66
  Time used: 4 min 14.77 s
iteration 3: {'maxDepth': 8, 'maxIter': 100, 'stepSize': 0.03, 'subsamplingRate': 0.6}
score: 89.9
  Time used: 16 min 39.55 s
iteration 4: {'maxDepth': 3, 'maxIter': 60, 'stepSize': 0.05, 'subsamplingRate': 0.6}
score: 82.02000000000001
  Time used: 3 min 4.17 s
iteration 5: {'maxDepth': 8, 'maxIter': 60, 'stepSize': 0.1, 'subsamplingRate': 0.7}
score: 90.24
  Time used: 7 min 57.91 s

Best score: 90.24%
Best model: iteration 5
Best params: {'maxDepth': 8, 'maxIter': 60, 'stepSize': 0.1, 'subsamplingRate': 0.7}


In [None]:
# 局部随机搜索
n_iter = 5
random.seed(42)

maxDepth = [6, 7, 8, 9]
maxIter = [50, 60, 70, 80]
stepSize = [0.08, 0.1, 0.12]
subsamplingRate = [0.6, 0.7, 0.8]

best_result_2 = None

for i in range(1, n_iter+1):
    
    t0 = time.time()
    
    
    param_map = {
        gbt.maxDepth: random.choice(maxDepth),
        gbt.maxIter: random.choice(maxIter),
        gbt.stepSize: random.choice(stepSize),
        gbt.subsamplingRate: random.choice(subsamplingRate)
    }
    readable_map = {k.name: v for k, v in param_map.items()}
    print(f'iteration {i}: {readable_map}')
    
    pipe_copy = pipe.copy(param_map)
    
    score, model = spark_utils.hold_out(train, pipe_copy, evaluator)
    print(f'score: {score:.2f}%')
    
    if best_result_2 is None or score > best_result_2['score']:
        best_result_2 = {
            'model': model,
            'params': param_map,
            'score': score,
            'iteration': i,
            'readable_map': readable_map
        }
    
    t1 = time.time()
    elapsed = t1 - t0
    print(f'  Time used: {int(elapsed//60)} min {elapsed%60:.2f} s')

print(f"\nBest score: {best_result_2['score']:.2f}%")
print(f"Best model: iteration {best_result_2['iteration']}")
print(f"Best params: {best_result_2['readable_map']}")

iteration 1: {'maxDepth': 6, 'maxIter': 50, 'stepSize': 0.12, 'subsamplingRate': 0.7}
score: 89.32
  Time used: 4 min 24.19 s
iteration 2: {'maxDepth': 7, 'maxIter': 60, 'stepSize': 0.08, 'subsamplingRate': 0.8}
score: 89.5
  Time used: 6 min 15.00 s
iteration 3: {'maxDepth': 6, 'maxIter': 50, 'stepSize': 0.12, 'subsamplingRate': 0.7}
score: 89.32
  Time used: 4 min 26.15 s
iteration 4: {'maxDepth': 6, 'maxIter': 50, 'stepSize': 0.08, 'subsamplingRate': 0.6}
score: 88.77
  Time used: 4 min 24.89 s
iteration 5: {'maxDepth': 7, 'maxIter': 50, 'stepSize': 0.12, 'subsamplingRate': 0.6}
score: 89.83
  Time used: 5 min 10.98 s

Best score: 89.83%
Best model: iteration 5
Best params: {'maxDepth': 7, 'maxIter': 50, 'stepSize': 0.12, 'subsamplingRate': 0.6}


经过两轮调参，最佳的参数组合为第一次随机搜索的第五次结果：**'maxDepth': 8, 'maxIter': 60, 'stepSize': 0.1, 'subsamplingRate': 0.7**  
后面将会使用这个参数组合进行模型训练。

In [20]:
final_pipe = pipe.copy(best_result_1['params'])

In [21]:
model_final = final_pipe.fit(train)

In [22]:
# 导出最终数据与模型
train.write.mode('overwrite').parquet('../data_processed/train_final.parquet')
test.write.mode('overwrite').parquet('../data_processed/test_final.parquet')
best_result_2['model'].write().overwrite().save('../models/model_cv')
model_final.write().overwrite().save('../models/model_final')