In [None]:
import os
import csv

import warnings
warnings.filterwarnings('ignore')

from sklearn.experimental import enable_iterative_imputer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix, auc, get_scorer
from sklearn.model_selection import cross_validate
from sklearn.impute import SimpleImputer, MissingIndicator, IterativeImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import average_precision_score, roc_auc_score

from xgboost import XGBClassifier

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

from scipy.stats import pearsonr, spearmanr, kendalltau, ttest_ind, chisquare

from statistics import mean

from plotnine import ggplot, aes, geom_histogram, \
                     theme_tufte, labs, element_text, \
                     theme, element_line, scale_y_continuous, \
                     facet_wrap, after_stat, labeller

from collections import Counter

from ast import literal_eval

from itertools import product

from tqdm.notebook import tqdm

random_state = 102

In [None]:
# Mount into drive
from google.colab import drive
drive.mount("/content/drive")

如果在本地机器上运行，请将 `data_directory` 变量修改为 IEEE-CIS 欺诈检测数据集所在的位置。

In [None]:
# Folder that holds the competition CSV files; change this when running locally.
data_directory = "./data/"

# Identity rows and transaction rows are stored in separate files.
train_identity = pd.read_csv(data_directory + "train_identity.csv")
train_transaction = pd.read_csv(data_directory + "train_transaction.csv")

# EDA

# 数据说明
来自`train_transaction.csv`：

`TransactionDT`：距离某个参考时间点的时间差（不是实际时间戳）
`TransactionAmt`：交易金额（美元）
`ProductCD`：产品代码，每笔交易的产品类型
`card1 - card6`：支付卡信息，如卡类型、卡类别、发卡银行、国家等
`addr`：地址
`dist`：距离
`P_` 和 `(R_)` `emaildomain`：购买者和收件人的邮箱域名
`C1 - C14`：计数特征，如与支付卡关联的地址数量等，具体含义已被隐藏
`D1 - D15`：时间差特征，如与前一笔交易的天数等
`M1 - M9`：匹配特征，如卡和地址上的姓名是否一致等
`Vxxx`：Vesta 工程化的丰富特征，包括排名、计数及其他实体关系


来自 `train_identity.csv`：

`id_01 - id_11`：身份相关的数值特征，由 Vesta 及其安全合作伙伴收集，如设备评分、IP 域评分、代理评分等，也记录了行为指纹（如账户登录次数/失败次数、停留时长等），由于合作条款无法详细说明
`id_30`：设备操作系统
`id_31`：设备使用的网页浏览器
`id_33`：设备屏幕分辨率
`DeviceType`：设备类型，可能为 mobile 或 desktop
`DeviceInfo`：设备及其操作系统的综合信息
本表中的变量为身份信息——与交易相关的网络连接信息（IP、ISP、代理等）和数字签名（UA/浏览器/操作系统/版本等）。
这些信息由 Vesta 的反欺诈系统和数字安全合作伙伴收集。
（字段名称已被隐藏，且不会提供一一对应的字典，以保护隐私和遵守合同协议）


## 特征工程

由于本数据集包含 400 多个特征，无法全部进行分析，因此使用如下参数（n_estimators=1000, max_depth=10, class_weight={0: 5, 1: 95}, min_samples_split=32, min_samples_leaf=32）训练了一个随机森林模型，并选取了对模型最重要的前 8 个特征。按重要性排序，这些特征分别是：C14、C13、V264、V317、V258、V294、V257 和 C4。

In [None]:
pd.read_csv("./drive/My Drive/ieee-fraud-detection/output/feature_importances.csv", index_col = 0)[:20]

## 无监督特征分析

In [None]:
def unsupervised_hist(feature, bins=30):
  """Build a histogram of one column of the global ``train_transaction`` frame.

  Args:
    feature: Name of the column to plot.
    bins: Number of histogram bins.

  Returns:
    The plotnine plot object; the y axis uses a log10 transform.
  """
  plot = ggplot(train_transaction, aes(x=feature))
  plot = plot + geom_histogram(bins=bins)
  plot = plot + labs(title=f"Distribution of Feature {feature}",
                     x=f"Value of {feature}",
                     y="Frequency")
  plot = plot + theme_tufte()
  # Light horizontal grid lines on both major and minor y ticks.
  plot = plot + theme(panel_grid_major_y=element_line(color=".9"),
                      panel_grid_minor_y=element_line(color=".9"))
  return plot + scale_y_continuous(trans='log10')

### C14

`C14`基本上是双峰分布。在数值为 `1.0` 处有一个巨大的峰值，随后频率迅速下降，直到在数值为 `110` 处又出现一个较小的次峰。从直方图来看，数据中可能还存在其他局部峰值，但这两个峰最为显著。

In [None]:
train_transaction.C14.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('C14', bins=100)

### C13

`C13` 是单峰分布。其数值在 `1.0` 处达到峰值，之后逐渐下降。

In [None]:
# Deciles (and the maximum) of C13, the feature this section describes.
train_transaction.C13.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('C13', bins=100)

### V264

`V264` 是多峰分布。最大峰值出现在 `0.0`，但在看似规律的区间（如 50、100、150、200 等）还会出现许多其他局部峰值。

In [None]:
train_transaction.V264.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V264', bins=100)

### V317

`V317`是多峰分布，其中大多数数据（如下面分位数所示，超过 80%）集中在`0.0`。其他峰值分散在不同的数值区间，如直方图所示。

In [None]:
train_transaction.V317.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V317', bins=100)

### V258

`V258` 是单峰且右偏分布。绝大多数取值为 `1.0`


In [None]:
train_transaction.V258.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V258', bins=70)

### V294

`V294` 在 `0.0` 处有一个陡峭的峰值，随后迅速下降。大多数取值为 `0.0`。不过，从直方图来看，在大约 `700-800` 区间还有一个较小的次峰

In [None]:
train_transaction.V294.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V294', bins=100)

### C8

另一个严重右偏的特征。从分位数可以看出，`90%`以上的数据取值小于或等于 `1.0`

In [None]:
train_transaction.C8.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('C8', bins=100)

### V318

超过 90% 的数据取值为 `0.0`，但也存在一些离群值。该分布是单峰且右偏的，这与我们这里的大多数特征类似

In [None]:
train_transaction.V318.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V318', bins=100)

### V257

从分位数可以看出，超过 80% 的取值小于或等于 `1.0`。该分布同样是单峰且右偏的

In [None]:
train_transaction.V257.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('V257', bins=40)

### C4

从分位数可以看出，超过 90% 的取值小于或等于 `1.0`。该分布同样是右偏的，但在较大取值处存在一些小的局部峰值

In [None]:
train_transaction.C4.quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1])

In [None]:
unsupervised_hist('C4', bins=100)

## 有监督的特征分析

In [None]:
def supervised_hist(feature, bins=30):
  """Build density histograms of one column, one panel per ``isFraud`` value.

  Reads the global ``train_transaction`` frame.

  Args:
    feature: Name of the column to plot.
    bins: Number of histogram bins.

  Returns:
    The plotnine plot object; the y axis uses a log10 transform.
  """
  def _panel_title(value):
    # Facet values arrive as strings; '1' marks the fraud panel.
    if value == '1':
      return "Fraud"
    return "Not Fraud"

  plot = ggplot(train_transaction, aes(x=feature, y=after_stat("density")))
  plot = plot + geom_histogram(bins=bins)
  plot = plot + labs(title=f"Distribution of Feature {feature}",
                     x=f"Value of {feature}",
                     y="Density")
  plot = plot + theme_tufte()
  plot = plot + theme(panel_grid_major_y=element_line(color=".9"),
                      panel_grid_minor_y=element_line(color=".9"))
  plot = plot + scale_y_continuous(trans='log10')
  return plot + facet_wrap("isFraud", labeller=labeller(cols=_panel_title))


def supervised_analysis(dataframe, features, y_column):
  """Measure how strongly each feature is associated with a binary target.

  For every feature, rows where the feature or the target is missing are
  dropped, then:

  * numeric features get Pearson, Spearman and Kendall-Tau correlations with
    the target plus a two-sample t-test (target == 0 rows vs. target == 1
    rows);
  * all other features get NaN correlations and a chi-squared test that
    compares the category counts of the target == 1 rows (rescaled to the
    same total) with the category counts of the target == 0 rows.

  If either target class has no rows left for a feature, every statistic for
  that feature is NaN.

  Args:
    dataframe: Frame containing ``y_column`` and every column in ``features``.
    features: Iterable of column names; ``y_column`` is skipped if present.
    y_column: Name of the 0/1 target column.

  Returns:
    A DataFrame with one row per analysed feature and the columns
    "Top Features", "Pearson Correlation", "Spearman Ranked", "Kendall-Tau",
    "Test Type", "Statistic" and "P-Value".
  """
  pearson = []
  spearman = []
  kendall_tau = []
  test_type = []
  statistic = []
  p_value = []

  features = list(features)

  if y_column in features:
    features.remove(y_column)

  for feature in tqdm(features):

    # Select by y_column (not a hard-coded name) so any target column works.
    temp_df = dataframe[[y_column, feature]].dropna()
    target = temp_df[y_column]

    # A dtype check instead of `== "object"` so that string/category columns
    # that are not stored as object are still treated as categorical.
    numeric = pd.api.types.is_numeric_dtype(temp_df[feature])

    normal = temp_df.loc[target == 0, feature]
    fraud = temp_df.loc[target == 1, feature]

    test_type.append("Student T-Test" if numeric else "Chi-Squared")

    if len(normal) == 0 or len(fraud) == 0:
      # Nothing to compare against: report NaN instead of failing.
      for column in (pearson, spearman, kendall_tau, statistic, p_value):
        column.append(np.nan)
      continue

    if numeric:
      feature_as_float = temp_df[feature].astype(float)
      # Various Correlations
      pearson.append(pearsonr(target, feature_as_float)[0])
      spearman.append(spearmanr(target, feature_as_float)[0])
      kendall_tau.append(kendalltau(target, feature_as_float)[0])

      # Student T-Test
      t = ttest_ind(normal.astype(float), fraud.astype(float))
      p_value.append(t.pvalue)
      statistic.append(t.statistic)
    else:
      pearson.append(np.nan)
      spearman.append(np.nan)
      kendall_tau.append(np.nan)

      observed_counter = Counter(fraud)
      expected_counter = Counter(normal)

      keys = sorted(set(observed_counter) | set(expected_counter))

      # Float arrays are required: on an integer array the 1e-10 floor below
      # would be truncated back to 0 and the test would divide by zero.
      observed = np.array([observed_counter[key] for key in keys], dtype=float)
      expected = np.array([expected_counter[key] for key in keys], dtype=float)

      # Rescale the fraud counts so both vectors have the same total.
      scale_factor = np.sum(expected) / np.sum(observed)
      observed_adjusted = observed * scale_factor

      # Categories never seen among the target == 0 rows get a tiny expected
      # count so the statistic stays finite.
      expected[expected == 0] = 1e-10

      # Chi-Squared test
      cs = chisquare(f_obs=observed_adjusted, f_exp=expected)
      p_value.append(cs.pvalue)
      statistic.append(cs.statistic)

  results = {
      "Top Features" : features,
      "Pearson Correlation" : pearson,
      "Spearman Ranked" : spearman,
      "Kendall-Tau" : kendall_tau,
      "Test Type": test_type,
      "Statistic": statistic,
      "P-Value" : p_value
      }

  return pd.DataFrame(results)

# Reuse the saved full-dataset analysis when it exists; otherwise compute and
# save it.
analysis_path = "./drive/My Drive/ieee-fraud-detection/output/supervised_feature_analysis.csv"

if not os.path.isfile(analysis_path):
  whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
  analysis = supervised_analysis(whole, whole.columns, "isFraud")
  analysis.to_csv(analysis_path)
else:
  # to_csv stored the row index as the first column; read it back as the
  # index so the cached table has the same columns as a freshly computed one.
  analysis = pd.read_csv(analysis_path, index_col=0)

analysis

In [None]:
# Features singled out for a closer look against the target.
top_features = ["C14", "C13", "V264", "V317", "V258", "V294", "C8", "V318"]

# Correlations and significance tests restricted to those features.
supervised_analysis(train_transaction, top_features, "isFraud")

### C14

In [None]:
supervised_hist("C14", bins=30)

### C13

In [None]:
supervised_hist("C13", bins=30)

### V264

In [None]:
supervised_hist("V264", bins=30)

### V317

In [None]:
supervised_hist("V317", bins=30)

### V258

In [None]:
supervised_hist("V258", bins=30)

### V294

In [None]:
supervised_hist("V294", bins=30)

### C8

In [None]:
supervised_hist("C8", bins=30)

### V318

In [None]:
supervised_hist("V318", bins=30)

### V257

In [None]:
supervised_hist("V257", bins=30)

### C4

In [None]:
supervised_hist("C4", bins=30)

# 数据预处理

我们首先将训练数据集中的各个特征合并为一个单一的 DataFrame，然后将其划分为训练集、测试集和验证集

In [None]:
# Left join: every transaction is kept, identity columns are added where a
# matching TransactionID exists.
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')

# Order the rows by TransactionDT so the split below is positional in time.

whole = whole.sort_values(by="TransactionDT")

# Features are all columns except the target; object columns are one-hot
# encoded (first level dropped).
X = whole[whole.columns.difference(['isFraud'])]
X = pd.get_dummies(X, drop_first=True)
y = whole['isFraud']

n = len(whole)

del whole

# The last int(n * 0.2) rows form the test set, everything before is training.
X_train = X[:-int(n * 0.2)]
X_test = X[-int(n * 0.2):]

y_train = y[:-int(n * 0.2)]
y_test = y[-int(n * 0.2):]

# Drop the unsplit copies.
del X
del y

## 数据填充

### 均值/中位数/众数

In [None]:
def basic_impute(X, impute_method = "mean"):
  """Fill missing values with simple statistics and append missing-value flags.

  Non-object columns are imputed with ``impute_method`` (a ``SimpleImputer``
  strategy such as "mean" or "median"); object columns are imputed with
  their most frequent value.

  Args:
    X: Feature DataFrame that may contain missing values.
    impute_method: ``SimpleImputer`` strategy used for non-object columns.

  Returns:
    A DataFrame holding the imputed columns of ``X`` in their original order,
    followed by one ``<column>_ind`` column per original column that flags
    where the value was missing. The result carries a fresh RangeIndex.
  """
  numeric_selector = selector(dtype_exclude="object")
  categorical_selector = selector(dtype_include="object")

  preprocessor = ColumnTransformer(
      transformers=[
          ("num", SimpleImputer(strategy=impute_method), numeric_selector),
          ("cat", SimpleImputer(strategy="most_frequent"), categorical_selector),
      ]
  )

  # ColumnTransformer concatenates its outputs transformer by transformer:
  # all "num" columns first, then all "cat" columns. Label the result in
  # that order; labelling it with X.columns would attach names to the wrong
  # data whenever numeric and object columns are interleaved.
  # NOTE(review): SimpleImputer drops columns that are entirely missing, which
  # would make the column count differ — confirm no such column is passed in.
  output_columns = list(numeric_selector(X)) + list(categorical_selector(X))

  imputed_values = pd.DataFrame(preprocessor.fit_transform(X),
                                columns=output_columns)

  # Restore the caller's column order, and turn the object-typed numeric
  # columns (a side effect of mixing numeric and object blocks) back into
  # numeric dtypes.
  imputed_values = imputed_values[list(X.columns)].infer_objects()

  ind_names = [name + "_ind" for name in X.columns]
  indicators = pd.DataFrame(MissingIndicator(features="all").fit_transform(X),
                            columns=ind_names)

  return pd.concat([imputed_values, indicators], axis = 1)

In [None]:
# Rebuild the joined frame, ordered by TransactionDT, as input for imputation.
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
whole = whole.sort_values(by="TransactionDT")

# Unlike the split cell, X is NOT one-hot encoded here: object columns are
# kept so the imputers can treat them separately.
X = whole[whole.columns.difference(['isFraud'])]
y = whole['isFraud']

del whole

In [None]:
# Mean-imputed features are computed once and stored; the cell is a no-op
# when the file already exists (delete the file to force a recomputation).
save_location = "./drive/My Drive/ieee-fraud-detection/interpolated/mean_interpolated.csv"
if not os.path.isfile(save_location):
  X_mean = basic_impute(X)
  X_mean.to_csv(save_location)
  del X_mean

In [None]:
# Median-imputed features are computed once and stored; the cell is a no-op
# when the file already exists (delete the file to force a recomputation).
save_location = "./drive/My Drive/ieee-fraud-detection/interpolated/median_interpolated.csv"
if not os.path.isfile(save_location):
  X_median = basic_impute(X, impute_method = "median")
  X_median.to_csv(save_location)
  del X_median

### 多重插补法（MICE，多变量链式插补）

In [None]:
def mice_impute(X, iterations=10):
  """Fill missing values with iterative (MICE-style) imputation and append
  missing-value flags.

  Non-object columns are imputed with ``IterativeImputer`` (seeded with the
  module-level ``random_state``); object columns are imputed with their most
  frequent value.

  Args:
    X: Feature DataFrame that may contain missing values.
    iterations: ``max_iter`` passed to ``IterativeImputer``.

  Returns:
    A DataFrame holding the imputed columns of ``X`` in their original order,
    followed by one ``<column>_ind`` column per original column that flags
    where the value was missing. The result carries a fresh RangeIndex.
  """
  numeric_selector = selector(dtype_exclude="object")
  categorical_selector = selector(dtype_include="object")

  preprocessor = ColumnTransformer(
      transformers=[
          ("num", IterativeImputer(max_iter=iterations,
                                   random_state=random_state),
           numeric_selector),
          ("cat", SimpleImputer(strategy="most_frequent"),
           categorical_selector),
      ]
  )

  # ColumnTransformer concatenates its outputs transformer by transformer:
  # all "num" columns first, then all "cat" columns. Label the result in
  # that order; labelling it with X.columns would attach names to the wrong
  # data whenever numeric and object columns are interleaved.
  output_columns = list(numeric_selector(X)) + list(categorical_selector(X))

  imputed_values = pd.DataFrame(preprocessor.fit_transform(X),
                                columns=output_columns)

  # Restore the caller's column order, and turn the object-typed numeric
  # columns (a side effect of mixing numeric and object blocks) back into
  # numeric dtypes.
  imputed_values = imputed_values[list(X.columns)].infer_objects()

  ind_names = [name + "_ind" for name in X.columns]
  indicators = pd.DataFrame(MissingIndicator(features="all").fit_transform(X),
                            columns=ind_names)

  return pd.concat([imputed_values, indicators], axis = 1)


# MICE-imputed features are computed once and stored; the block is a no-op
# when the file already exists (delete the file to force a recomputation).
save_location = "./drive/My Drive/ieee-fraud-detection/interpolated/mice_interpolated.csv"
if not os.path.isfile(save_location):
  X_mice = mice_impute(X)
  X_mice.to_csv(save_location)
  del X_mice

# 模型训练

## Helper Func

In [None]:
def find_untested_combos(parameters, csv_location = None):
  """find all the untested combos of parameters for a grid search

  Args:
    parameters: Mapping of parameter name -> list of candidate values.
    csv_location: Path of the results CSV written by earlier runs. When it
      is falsy or the file does not exist, every combination is returned.

  Returns:
    A list of dicts, one per parameter combination that has no matching row
    in the CSV. Only the CSV columns named in ``parameters`` are compared;
    a row that lacks one of those columns never matches.
  """
  names = list(parameters.keys())
  parameter_combos = [dict(zip(names, values))
                      for values in product(*parameters.values())]

  if not csv_location or not os.path.isfile(csv_location):
    print(f"Untested combos: No csv found...")
    return parameter_combos

  # newline='' is what the csv module expects for files it reads.
  with open(csv_location, newline='') as csvfile:
    existing_output = list(csv.DictReader(csvfile))

  def process(x):
    """Turn a CSV cell back into the Python value it was written from."""
    if x is None:  # short row: DictReader fills missing cells with None
      return None
    try:
      # Covers ints, floats (including negatives and exponents), booleans,
      # dicts and tuples as written by str().
      return literal_eval(x)
    except (ValueError, SyntaxError):
      # Plain words such as 'mean' or 'gini' are not literals; keep the text.
      return x

  exclude = [{k : process(v) for k, v in row.items() if k in parameters}
             for row in existing_output]

  print(exclude)

  # A combo is already tested when some CSV row agrees on every parameter,
  # i.e. when the reduced row dict equals the combo dict.
  untested_combos = [combo for combo in parameter_combos
                     if combo not in exclude]

  print(untested_combos)

  return untested_combos

def run_grid_search(classifier, X, y, combos, scoring_metric, csv_location,
                    load_csv_stub, cv = 5):
  """Score parameter combinations with a rolling, order-preserving validation
  and append one result row per combination to ``csv_location``.

  The rows are cut into ``cv`` consecutive chunks of equal size. For each of
  the first ``cv - 1`` chunks the classifier is fitted on that chunk and
  scored on it and on the chunk that follows. The mean training and
  validation scores are written together with the parameters.

  Args:
    classifier: Estimator class (not an instance); instantiated per combo.
    X: Training features; one-hot encoded here with ``pd.get_dummies``.
    y: Training target, row-aligned with ``X``.
    combos: Iterable of parameter dicts (e.g. from ``find_untested_combos``).
      An optional "impute_strat" entry is not passed to the classifier;
      it selects ``<load_csv_stub><impute_strat>_interpolated.csv`` as the
      feature source instead. The dicts are not modified.
    scoring_metric: Scorer name understood by ``sklearn.metrics.get_scorer``.
    csv_location: Results CSV; created with a header when missing, otherwise
      appended to.
    load_csv_stub: Path prefix of the pre-imputed feature CSV files.
    cv: Number of consecutive chunks; must be at least 2.

  Raises:
    ValueError: If ``cv`` is smaller than 2 (there would be no validation
      chunk).
  """
  if cv < 2:
    raise ValueError("cv must be at least 2")

  X = pd.get_dummies(X, drop_first=True)
  y_values = np.asarray(y)

  # Use the scorer object itself rather than its private _score_func: it
  # feeds the metric the estimator output it needs (probabilities/decision
  # values for ranking metrics such as 'average_precision', labels otherwise).
  scorer = get_scorer(scoring_metric)

  current_impute = None
  for combo in combos:
    # Copy so the caller's combo dicts are left untouched.
    parameter_dict = dict(combo)

    impute_strat = parameter_dict.pop("impute_strat", None)
    if impute_strat is not None and current_impute != impute_strat:
      csv_file = load_csv_stub + impute_strat + "_interpolated.csv"
      current_impute = impute_strat
      del X
      # The imputed file covers all rows; the training rows are the leading
      # len(y) of them. (The previous slice used `n`, which is a local of
      # this function and not yet assigned at this point.)
      # index_col=0 keeps the row index that to_csv saved from becoming a
      # feature.
      X = pd.get_dummies(pd.read_csv(csv_file, index_col=0)[:len(y_values)],
                         drop_first=True)

    class_weight = parameter_dict.get("class_weight")
    init_kwargs = dict(parameter_dict)
    if classifier is XGBClassifier:
      # XGBClassifier has no class_weight argument; the weights are applied
      # per sample in fit() below.
      init_kwargs.pop("class_weight", None)

    clf = classifier(**init_kwargs)

    n = len(X)
    unit = int(n / cv)

    scores = {
          "training_score" : [],
          "validation_score" : []
      }

    for i in range(cv - 1):
      start = i * unit
      middle = start + unit
      X_train = X.iloc[start:middle]
      y_train = y_values[start:middle]
      X_val = X.iloc[middle:middle + unit]
      y_val = y_values[middle:middle + unit]

      if classifier is XGBClassifier:
        if class_weight is not None:
          weights = [class_weight[label] for label in y_train]
        else:
          weights = [1] * len(y_train)
        clf.fit(X_train, y_train, sample_weight=weights)
      else:
        clf.fit(X_train, y_train)

      scores["training_score"].append(scorer(clf, X_train, y_train))
      scores["validation_score"].append(scorer(clf, X_val, y_val))

    # Column order: classifier parameters, impute_strat, scoring_metric,
    # training_score, validation_score.
    results = dict(parameter_dict)
    if current_impute:
      results["impute_strat"] = current_impute
    results["scoring_metric"] = scoring_metric
    results.update({key : mean(value) for key, value in scores.items()})

    file_exists = os.path.isfile(csv_location)
    if file_exists:
      print(f"Appending results to {csv_location}...")
    else:
      print(f"File {csv_location} does not exist, creating...")
    print(f"results: {results}")

    with open(csv_location, 'a' if file_exists else 'w', newline='') as csvfile:
      writer = csv.DictWriter(csvfile, fieldnames=results.keys())
      if not file_exists:
        writer.writeheader()
      writer.writerow(results)



## Logistic Regression


In [None]:
# Logistic-regression grid: imputation strategy x regularisation strength C
# x class weights.
parameters = {
    'impute_strat' : ['mean', 'median', 'mice'],
    'C' : [100, 10, 1, 0.1, .01, .001],
    'class_weight' : [{0: 1, 1: 1},  {0: 1, 1: 10}, {0: 1, 1: 20},
                      {0: 1, 1: 30}, {0: 1, 1: 40}, {0: 1, 1: 50},
                      {0: 1, 1: 60}, {0: 1, 1: 70}, {0: 1, 1: 80},
                      {0: 1, 1: 90}]
}
scoring_metric = 'average_precision'

# Only combinations without a row in lr_results.csv are evaluated, so the
# cell can be re-run after an interruption.
untested_combos = find_untested_combos(parameters, "./drive/My Drive/ieee-fraud-detection/output/lr_results.csv")
run_grid_search(LogisticRegression, X_train, y_train, untested_combos, scoring_metric,
                csv_location="./drive/My Drive/ieee-fraud-detection/output/lr_results.csv",
                load_csv_stub="./drive/My Drive/ieee-fraud-detection/interpolated/")

In [None]:
pd.read_csv("./drive/My Drive/ieee-fraud-detection/output/lr_results.csv")\
.sort_values(by="validation_score", ascending=False).head()

In [None]:
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
whole = whole.sort_values(by="TransactionDT")

csv_file = "./drive/My Drive/ieee-fraud-detection/interpolated/median_interpolated.csv"
# index_col=0: the file was written with DataFrame.to_csv, which stores the
# row index as an unnamed first column; it must not become a feature.
# kind="stable": rows that are already in TransactionDT order keep their
# exact order (ties included), so X stays row-aligned with y.
X = pd.read_csv(csv_file, index_col=0).sort_values(by="TransactionDT", kind="stable")
X = pd.get_dummies(X, drop_first=True)
y = whole['isFraud']

n = len(whole)

del whole

# Features and labels are paired by position, so their lengths must agree.
assert len(X) == n, "imputed features and labels have different row counts"

# The last int(n * 0.2) rows form the test set, everything before is training.
X_train = X[:-int(n * 0.2)]
X_test = X[-int(n * 0.2):]

y_train = y[:-int(n * 0.2)]
y_test = y[-int(n * 0.2):]

del X
del y

In [None]:
clf = LogisticRegression(C = 100, class_weight={0: 1, 1: 10})
clf.fit(X_train, y_train)

In [None]:
y_pred_train = clf.predict(X_train)
y_prob_train = clf.predict_proba(X_train)[:,1]
y_pred_test = clf.predict(X_test)
y_prob_test = clf.predict_proba(X_test)[:,1]

In [None]:
prob_prediction = pd.DataFrame({
    "actual" : y_test,
    "probability" : y_prob_test
}).sort_values(by="probability", ascending=False)

In [None]:
metrics_lr = {
    "Model" : "Logistic Regression",
    "Train AUC" : roc_auc_score(y_train, y_prob_train),
    "Test AUC" : roc_auc_score(y_test, y_prob_test),
    "Train PR AUC" : average_precision_score(y_train, y_prob_train),
    "Test PR AUC" : average_precision_score(y_test, y_prob_test),
    "Precision K5": np.mean(prob_prediction.actual[:5]),
    "Precision K10": np.mean(prob_prediction.actual[:10]),
}

metrics_lr

## Multi-Layer Perceptron

In [None]:
# Candidate layer widths from 512 down to 16; the trailing 0 stands for
# "no further layer".
options = [2 ** exponent for exponent in range(9, 3, -1)] + [0]

# Every architecture with one to three hidden layers whose widths never
# increase from one layer to the next.
layer_combos = []
for i, layer_1 in enumerate(options[:-1]):
  for j, layer_2 in enumerate(options[i:], start=i):
    if layer_2 == 0:
      layer_combos.append((layer_1,))
    else:
      for layer_3 in options[j:]:
        layer_combos.append(
            (layer_1, layer_2, layer_3) if layer_3 else (layer_1, layer_2))

In [None]:
# MLP grid: imputation strategy x hidden-layer layout, with a fixed
# iteration cap.
parameters = {
    'impute_strat' : ['mean', 'median', 'mice'],
    'hidden_layer_sizes' : layer_combos,
    'max_iter' : [50]
}
scoring_metric = 'average_precision'

# Only combinations without a row in mlp_results.csv are evaluated.
untested_combos = find_untested_combos(parameters, "./drive/My Drive/ieee-fraud-detection/output/mlp_results.csv")
run_grid_search(MLPClassifier, X_train, y_train, untested_combos, scoring_metric,
                csv_location="./drive/My Drive/ieee-fraud-detection/output/mlp_results.csv",
                load_csv_stub="./drive/My Drive/ieee-fraud-detection/interpolated/")

In [None]:
pd.read_csv("./drive/My Drive/ieee-fraud-detection/output/mlp_results.csv")\
.sort_values(by="validation_score", ascending=False).head()

In [None]:
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
whole = whole.sort_values(by="TransactionDT")

csv_file = "./drive/My Drive/ieee-fraud-detection/interpolated/mice_interpolated.csv"
# index_col=0: the file was written with DataFrame.to_csv, which stores the
# row index as an unnamed first column; it must not become a feature.
# kind="stable": rows that are already in TransactionDT order keep their
# exact order (ties included), so X stays row-aligned with y.
X = pd.read_csv(csv_file, index_col=0).sort_values(by="TransactionDT", kind="stable")
X = pd.get_dummies(X, drop_first=True)
y = whole['isFraud']

n = len(whole)

del whole

# Features and labels are paired by position, so their lengths must agree.
assert len(X) == n, "imputed features and labels have different row counts"

# The last int(n * 0.2) rows form the test set, everything before is training.
X_train = X[:-int(n * 0.2)]
X_test = X[-int(n * 0.2):]

y_train = y[:-int(n * 0.2)]
y_test = y[-int(n * 0.2):]

del X
del y

In [None]:
# Seeded with the notebook-wide random_state so the weight initialisation and
# shuffling, and therefore the reported metrics, are reproducible.
clf = MLPClassifier(hidden_layer_sizes=(256, 256), max_iter=200,
                    random_state=random_state)
clf.fit(X_train, y_train)

In [None]:
y_pred_train = clf.predict(X_train)
y_prob_train = clf.predict_proba(X_train)[:,1]
y_pred_test = clf.predict(X_test)
y_prob_test = clf.predict_proba(X_test)[:,1]

In [None]:
prob_prediction = pd.DataFrame({
    "actual" : y_test,
    "probability" : y_prob_test
}).sort_values(by="probability", ascending=False)

In [None]:
metrics_mlp = {
    "Model" : "Multi-Layer Perceptron",
    "Train AUC" : roc_auc_score(y_train, y_prob_train),
    "Test AUC" : roc_auc_score(y_test, y_prob_test),
    "Train PR AUC" : average_precision_score(y_train, y_prob_train),
    "Test PR AUC" : average_precision_score(y_test, y_prob_test),
    "Precision K5": np.mean(prob_prediction.actual[:5]),
    "Precision K10": np.mean(prob_prediction.actual[:10]),
}

metrics_mlp

## Random Forest

In [None]:
# Random-forest grid. There is no 'impute_strat' entry, so run_grid_search
# uses the X_train passed in below rather than loading an imputed CSV.
parameters = {
    'n_estimators' : [100, 1000],
    'criterion' : ['gini'],
    'max_depth' : [10],
    'class_weight' : [{0: 1, 1: 10}, {0: 1, 1: 20}, {0: 1, 1: 30},
                      {0: 1, 1: 40}, {0: 1, 1: 50}, {0: 1, 1: 60},
                      {0: 1, 1: 70}, {0: 1, 1: 80}, {0: 1, 1: 90}],
    'min_samples_split' : [32],
    'min_samples_leaf' : [32]
}

scoring_metric = 'average_precision'

# Only combinations without a row in rf_results.csv are evaluated.
untested_combos = find_untested_combos(parameters, "./drive/My Drive/ieee-fraud-detection/output/rf_results.csv")
run_grid_search(RandomForestClassifier, X_train, y_train, untested_combos,
                scoring_metric, csv_location="./drive/My Drive/ieee-fraud-detection/output/rf_results.csv",
                load_csv_stub="./drive/My Drive/ieee-fraud-detection/interpolated/")

In [None]:
pd.read_csv("./drive/My Drive/ieee-fraud-detection/output/rf_results.csv")\
.sort_values(by="validation_score", ascending=False).head()

In [None]:
# Rebuild the joined frame, ordered by TransactionDT.
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
whole = whole.sort_values(by="TransactionDT")

# Raw (non-imputed) features, one-hot encoded; missing values are left in.
X = whole[whole.columns.difference(['isFraud'])]
X = pd.get_dummies(X, drop_first=True)
y = whole['isFraud']

n = len(whole)

del whole

# The last int(n * 0.2) rows form the test set, everything before is training.
X_train = X[:-int(n * 0.2)]
X_test = X[-int(n * 0.2):]

y_train = y[:-int(n * 0.2)]
y_test = y[-int(n * 0.2):]

del X
del y

In [None]:
# Seeded with the notebook-wide random_state so the bootstrap samples and
# feature draws, and therefore the reported metrics, are reproducible.
clf = RandomForestClassifier(n_estimators=100, criterion="gini", max_depth=10,
                             class_weight={0: 1, 1: 10}, min_samples_split=32,
                             min_samples_leaf=32, random_state=random_state)
clf.fit(X_train, y_train)

In [None]:
y_pred_train = clf.predict(X_train)
y_prob_train = clf.predict_proba(X_train)[:,1]
y_pred_test = clf.predict(X_test)
y_prob_test = clf.predict_proba(X_test)[:,1]

In [None]:
prob_prediction = pd.DataFrame({
    "actual" : y_test,
    "probability" : y_prob_test
}).sort_values(by="probability", ascending=False)

In [None]:
metrics_rf = {
    "Model" : "Random Forest",
    "Train AUC" : roc_auc_score(y_train, y_prob_train),
    "Test AUC" : roc_auc_score(y_test, y_prob_test),
    "Train PR AUC" : average_precision_score(y_train, y_prob_train),
    "Test PR AUC" : average_precision_score(y_test, y_prob_test),
    "Precision K5": np.mean(prob_prediction.actual[:5]),
    "Precision K10": np.mean(prob_prediction.actual[:10]),
}

metrics_rf

## Extreme Gradient Boosting

In [None]:
# Gradient-boosting grid. There is no 'impute_strat' entry, so
# run_grid_search uses the X_train passed in below.
# NOTE(review): 'max_iter' and 'class_weight' are not listed among
# XGBClassifier's parameters (its boosting-round count is 'n_estimators') —
# confirm that this grid really varies the number of rounds. The key is left
# as is here because it is also a column name in the existing xgb_results.csv.
parameters = {
    'learning_rate' : [0.1, 1],
    'max_iter' : [50, 100, 200],
    'max_depth' : [1, 2, 4, 6, 8, 10],
    'class_weight' : [{0: 1, 1: 1},  {0: 1, 1: 10}, {0: 1, 1: 20},
                      {0: 1, 1: 30}, {0: 1, 1: 40}, {0: 1, 1: 50},
                      {0: 1, 1: 60}, {0: 1, 1: 70}, {0: 1, 1: 80},
                      {0: 1, 1: 90}]
}

# Only combinations without a row in xgb_results.csv are evaluated.
untested_combos = find_untested_combos(parameters, "./drive/My Drive/ieee-fraud-detection/output/xgb_results.csv")
run_grid_search(XGBClassifier, X_train, y_train, untested_combos,
                scoring_metric="average_precision",
                csv_location="./drive/My Drive/ieee-fraud-detection/output/xgb_results.csv",
                load_csv_stub="./drive/My Drive/ieee-fraud-detection/interpolated/")

In [None]:
pd.read_csv("./drive/My Drive/ieee-fraud-detection/output/xgb_results.csv")\
.sort_values(by="validation_score", ascending=False).head()

In [None]:
# Rebuild the joined frame, ordered by TransactionDT.
whole = train_transaction.merge(train_identity, on='TransactionID', how='left')
whole = whole.sort_values(by="TransactionDT")

# Raw (non-imputed) features, one-hot encoded; missing values are left in.
X = whole[whole.columns.difference(['isFraud'])]
X = pd.get_dummies(X, drop_first=True)
y = whole['isFraud']

n = len(whole)

del whole

# The last int(n * 0.2) rows form the test set, everything before is training.
X_train = X[:-int(n * 0.2)]
X_test = X[-int(n * 0.2):]

y_train = y[:-int(n * 0.2)]
y_test = y[-int(n * 0.2):]

del X
del y

In [None]:
# XGBoost names the number of boosting rounds n_estimators; max_iter is not
# one of its parameters, so the intended 200 rounds were never applied.
# Seeded with the notebook-wide random_state for reproducible metrics.
clf = XGBClassifier(learning_rate=0.1, n_estimators=200, max_depth=10,
                    random_state=random_state)
clf.fit(X_train, y_train)

In [None]:
y_pred_train = clf.predict(X_train)
y_prob_train = clf.predict_proba(X_train)[:,1]
y_pred_test = clf.predict(X_test)
y_prob_test = clf.predict_proba(X_test)[:,1]

In [None]:
prob_prediction = pd.DataFrame({
    "actual" : y_test,
    "probability" : y_prob_test
}).sort_values(by="probability", ascending=False)

In [None]:
metrics_xgb = {
    "Model" : "XGBoost",
    "Train AUC" : roc_auc_score(y_train, y_prob_train),
    "Test AUC" : roc_auc_score(y_test, y_prob_test),
    "Train PR AUC" : average_precision_score(y_train, y_prob_train),
    "Test PR AUC" : average_precision_score(y_test, y_prob_test),
    "Precision K5": np.mean(prob_prediction.actual[:5]),
    "Precision K10": np.mean(prob_prediction.actual[:10]),
}

metrics_xgb

# Results

In [None]:
pd.DataFrame([metrics_lr, metrics_mlp, metrics_rf, metrics_xgb])