# setting

In [1]:
%matplotlib inline 

import gc
import json
import requests
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import category_encoders as ce

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit
import lightgbm as lgb
import optuna

from sklearn.metrics import roc_curve
from sklearn.metrics import auc
from sklearn.model_selection import learning_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import confusion_matrix

from sklearn.inspection import partial_dependence
import shap

# dataset

In [None]:
# Training set: every transaction, left-joined with its identity record when one exists.
df_transaction_train = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")
df_identity_train = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv")

# Marker column: it is 1 on every identity row, so after the left join it is NaN
# exactly for the transactions that have no identity record.
df_identity_train["specifyIdentity"] = 1
df_train = df_transaction_train.merge(df_identity_train, on="TransactionID", how="left")
df_train["isTest"] = 0  # these rows come from the training files
df_train["specifyIdentity"] = df_train["specifyIdentity"].fillna(value=0)
df_train.describe()

In [None]:
# Test set: same construction as the training frame, but flagged with isTest = 1.
df_transaction_test = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")
df_identity_test = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv")

# 1 on every identity row -> NaN after the left join where no identity record matched.
df_identity_test["specifyIdentity"] = 1
df_test = df_transaction_test.merge(df_identity_test, on="TransactionID", how="left")
df_test["isTest"] = 1  # these rows come from the test files
df_test["specifyIdentity"] = df_test["specifyIdentity"].fillna(value=0)
df_test.describe()

In [None]:
# The merged frames are all that is needed from here on; drop the raw inputs
# and ask the garbage collector to run.
del df_transaction_train, df_identity_train, df_transaction_test, df_identity_test

gc.collect()

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so that both parts
# go through the same preprocessing; the isTest column tells them apart.
df = pd.concat([df_train, df_test], ignore_index=True)
df.describe()

In [None]:
# Columns of the training frame that hold one single distinct value (NaN counts as a value).
list_col_constant_val_train = [
    col for col in df_train.columns
    if df_train[col].nunique(dropna=False) == 1
]
list_col_constant_val_train

In [None]:
# Only the combined frame `df` is used from here on.
del df_train, df_test

gc.collect()

# preprocessing

In [None]:
# Columns of the combined frame that hold one single distinct value (NaN counts as a value).
list_col_constant_val = [
    col for col in df.columns
    if df[col].nunique(dropna=False) == 1
]
list_col_constant_val

In [None]:
# A column with a single value cannot separate any rows, so remove those columns.
df = df.drop(list_col_constant_val, axis=1)

In [None]:
# Share of populated fields per row.
# Non-feature columns are left out of both the missing count and the denominator:
# - isFraud is the target; test rows presumably carry no label, so counting it
#   would make the feature systematically lower on test than on train.
# - TransactionID / isTest are bookkeeping columns.
# - input_rate itself is excluded so that re-running this cell gives the same result.
non_feature_cols = [
    col for col in ("TransactionID", "isFraud", "isTest", "input_rate")
    if col in df.columns
]
# Subtracting the counts avoids copying the (large) frame just to drop a few columns.
n_missing = df.isna().sum(axis=1) - df[non_feature_cols].isna().sum(axis=1)
df["input_rate"] = 1 - n_missing / (len(df.columns) - len(non_feature_cols))
df["input_rate"].describe()

In [None]:
# https://www.kaggle.com/danofer/ieee-fraud-features-xgboost-0-934-lb

# TransactionDT is treated as an offset in seconds from START_DATE
# (reference date taken from the kernel linked above).
START_DATE = "2017-12-01"
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')
# Vectorised conversion: one column-wide timedelta instead of one Python call per row.
df["TransactionDT"] = pd.Timestamp(startdate) + pd.to_timedelta(df["TransactionDT"], unit="s")

# df["month"] = df["TransactionDT"].dt.month
df["TransactionDT_dow"] = df["TransactionDT"].dt.dayofweek
df["TransactionDT_hour"] = df["TransactionDT"].dt.hour
df["TransactionDT_day"] = df["TransactionDT"].dt.day
# Coarse position within the month: day < 10 -> "head", day < 20 -> "middle", else "tail".
# np.select picks the first matching condition, mirroring the chained conditional.
df["TransactionDT_part_of_month"] = np.select(
    [df["TransactionDT_day"] < 10, df["TransactionDT_day"] < 20],
    ["head", "middle"],
    default="tail",
)

#  df.drop(["TransactionDT"],axis=1,inplace=True)

In [None]:
# #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
# Lookup from raw e-mail domain to a provider group.
dict_emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
# Suffixes that are collapsed into the single value 'us'.
list_us_emails = ['gmail', 'net', 'edu']

for col in ('P_emaildomain', 'R_emaildomain'):
    # Provider group; domains missing from the lookup become NaN.
    df[col + '_bin'] = df[col].map(dict_emails)
    # Text after the last dot of the stringified value (a missing domain therefore yields 'nan').
    suffix = df[col].map(lambda domain: str(domain).rsplit('.', 1)[-1])
    df[col + '_suffix'] = suffix.where(~suffix.isin(list_us_emails), 'us')

In [None]:
# Raw columns treated as categorical; numbered families are generated
# instead of being spelled out (card1-6, M1-9, id_12-38).
list_categorical_feature = (
    ["ProductCD"]
    + ["card%d" % i for i in range(1, 7)]
    + ["addr1", "addr2", "P_emaildomain", "R_emaildomain"]
    + ["M%d" % i for i in range(1, 10)]
    + ["DeviceType", "DeviceInfo"]
    + ["id_%d" % i for i in range(12, 39)]
)

In [None]:
# Register the engineered date / e-mail columns as categorical as well.
# Only names that are not in the list yet are appended, so re-running this
# cell does not add duplicate entries.
for engineered_feature in ["TransactionDT_dow", "TransactionDT_hour", "TransactionDT_day", "TransactionDT_part_of_month", "P_emaildomain_bin", "P_emaildomain_suffix", "R_emaildomain_bin", "R_emaildomain_suffix"]:
    if engineered_feature not in list_categorical_feature:
        list_categorical_feature.append(engineered_feature)

In [None]:
# Replace every categorical column by its integer category code
# (pandas assigns -1 to missing values).
for col in list_categorical_feature:
    df[col] = df[col].astype('category').cat.codes

# modeling

In [None]:
# Training rows only: features without the timestamp and the target, plus the target itself.
is_train_row = df["isTest"] == 0
X_train = df.loc[is_train_row].drop(["TransactionDT", "isFraud"], axis=1)
y_train = df.loc[is_train_row, "isFraud"]

In [None]:
# 80 % train / 20 % validation, shuffled, stratified on the target, fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train,
    y_train,
    train_size=0.8,
    random_state=175,
    shuffle=True,
    stratify=y_train,
)

In [None]:
# Keep TransactionID aside as a row key and remove it from the model inputs.
key_train = X_train["TransactionID"]
X_train = X_train.drop("TransactionID", axis=1)
key_valid = X_valid["TransactionID"]
X_valid = X_valid.drop("TransactionID", axis=1)

# Test rows get the same column removal as the training rows.
X_test = df.loc[df["isTest"] == 1].drop(["TransactionDT", "isFraud"], axis=1)
key_test = X_test["TransactionID"]
X_test = X_test.drop("TransactionID", axis=1)

In [None]:
# isTest only tells train and test rows apart; it is not a model input.
X_train, X_valid, X_test = (
    frame.drop(columns=["isTest"]) for frame in (X_train, X_valid, X_test)
)

In [None]:
# Options shared by both LightGBM datasets.
dataset_options = dict(categorical_feature=list_categorical_feature, free_raw_data=False)

train_data = lgb.Dataset(X_train, label=y_train, **dataset_options)
# The validation set is built with the training set as its reference.
eval_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data, **dataset_options)


In [None]:
# Number of training rows per target value.
y_train.value_counts()

In [None]:
# Number of training rows whose target is 0.
y_train.value_counts()[0]

In [None]:
# Ratio of target-0 rows to target-1 rows in the training split.
class_counts = y_train.value_counts()
neg_pos_ratio = class_counts[0] / class_counts[1]
neg_pos_ratio

In [None]:
# def objective(trial):
#     params = {
#         'boosting': 'gbdt', 
#         'objective': 'binary', 
#         'metric':'auc', 
#         'seed': 175, 
# #         'max_depth': trial.suggest_int('max_depth', 3, 10), 
#         'num_leaves': 127, 
#         'learning_rate': trial.suggest_loguniform('learning_rate', 1e-8, 1.0), 
# #         'bagging_fraction': trial.suggest_uniform('bagging_fraction', 0.5, 1.0), 
#         'pos_bagging_fraction': 1,
#         'neg_bagging_fraction': 1 / neg_pos_ratio, 
#         'bagging_freq': 1, 
# #         'feature_fraction': trial.suggest_uniform('feature_fraction', 0.5, 1.0)
#         'lambda_l1': trial.suggest_loguniform('lambda_l1', 1e-3, 1e3), 
#         'lambda_l2': trial.suggest_loguniform('lambda_l2', 1e-3, 1e3)
#     }
    
#     gbm = lgb.train(
#         params,
#         train_data,
#         valid_sets=[train_data, eval_data],
#         valid_names=['Train', 'Eval'],
#         num_boost_round=10000,
#         early_stopping_rounds=100, 
#         verbose_eval=20
#     )
#     y_pred_prob = gbm.predict(X_valid)
#     fpr, tpr, thresholds = roc_curve(y_valid, y_pred_prob, pos_label=1)
#     score = auc(fpr, tpr)
#     return score * -1

# study = optuna.create_study(direction='maximize')
# study.optimize(objective, n_trials=30)

# print('Number of finished trials: {}'.format(len(study.trials)))

# print('Best trial:')
# trial = study.best_trial

# print('  Value: {}'.format(trial.value))

# print('  Params: ')
# for key, value in trial.params.items():
#     print('    {}: {}'.format(key, value))

In [None]:
# # best_max_depth = trial.params['max_depth']
# best_learning_rate = trial.params['learning_rate']
# # best_bagging_fraction = trial.params['bagging_fraction']
# # best_feature_fraction = trial.params['feature_fraction']
# best_l1 = trial.params['lambda_l1']
# best_l2 = trial.params['lambda_l2']

In [None]:
# params = {
#     'boosting': 'gbdt', 
#     'objective': 'binary', 
#     'metric':'auc', 
#     'seed': 175, 
#     'num_leaves': 127, 
# #     'max_depth': best_max_depth, 
#     'learning_rate':best_learning_rate, 
# #     'bagging_fraction': best_bagging_fraction, 
#     'pos_bagging_fraction': 1,
#     'neg_bagging_fraction': 1 / neg_pos_ratio, 
#     'bagging_freq': 1, 
# #     'feature_fraction': best_feature_fraction, 
#     'lambda_l1': best_l1, 
#     'lambda_l2': best_l2, 
#     'verbosity': -1
# }
# params

In [None]:
# params = {
#     'boosting': 'gbdt', 
#     'objective': 'binary', 
#     'metric':'auc', 
#     'seed': 175, 
#     'num_leaves': 127, 
#     'max_depth': -1, 
#     'learning_rate': 0.02, 
# #     'bagging_fraction': 0.8, 
#     'pos_bagging_fraction': 1,
#     'neg_bagging_fraction': 1 / neg_pos_ratio, 
#     'bagging_freq': 1, 
#     'feature_fraction': 0.8, 
# #     'scale_pos_weight': neg_pos_ratio, 
# #     'is_unbalance': True, 
#     'verbosity': -1
# }

In [None]:
# params = {
#     'learning_rate': 0.009, 
#     'max_depth': 10, 
#     'boosting': 'gbdt', 
#     'objective': 'binary', 
#     'metric':'auc', 'seed': 4, 
#     'num_iterations': 10000, 
#     'early_stopping_round': 100, 
#     'verbose_eval': 200, 
#     'num_leaves': 64, 
#     'feature_fraction': 0.8, 
#     'bagging_fraction': 0.8, 
#     'bagging_freq': 5
# }

In [None]:
# LightGBM parameters for the run below (binary objective, AUC metric).
params = dict(
    objective='binary',
    boosting_type='gbdt',
    metric='auc',
    n_jobs=-1,
    max_depth=-1,
    tree_learner='serial',
    min_data_in_leaf=30,
    # n_estimators=1800,
    n_estimators=10000,
    max_bin=255,
    verbose=-1,
    seed=1229,
    learning_rate=0.01,
    early_stopping_rounds=200,
    colsample_bytree=0.5,
    num_leaves=256,
    reg_alpha=0.35,
)

In [None]:
# Filled by LightGBM with the per-round metric values of every entry in valid_sets.
evaluation_results = {}
# NOTE: num_boost_round and early_stopping_rounds are deliberately not passed here.
# `params` already defines 'n_estimators' and 'early_stopping_rounds', and LightGBM
# uses the values found in params instead of the keyword arguments (with a warning),
# so the former early_stopping_rounds=100 argument was never in effect.
# If those keys are ever removed from `params`, pass the arguments here again.
gbm = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, eval_data],
    valid_names=['Train', 'Eval'],
    evals_result=evaluation_results,
    verbose_eval=20
)

In [None]:
# Top 30 features ranked by importance_type='gain'.
lgb.plot_importance(gbm, importance_type='gain', max_num_features=30, figsize=(12, 6))

In [None]:
# Top 30 features ranked by importance_type='split'.
lgb.plot_importance(gbm, importance_type='split', max_num_features=30, figsize=(12, 6))

In [None]:
# lgb.plot_split_value_histogram(gbm, "TransactionAmt")

In [None]:
# lgb.plot_metric(gbm)

In [None]:
# Model output for every validation row (used below as the predicted fraud score).
y_pred_prob = gbm.predict(X_valid)
y_pred_prob

In [None]:
# Validation features together with the true label and the predicted score, for inspection.
df_valid = X_valid.assign(isFraud=y_valid, predict_prob=y_pred_prob)

In [None]:
# sns.catplot(x="predict_prob", y="isFraud", data=df_valid, kind='violin')

In [None]:
# Predicted-score distribution of the validation set, split by true label.
# Top panel: normalised histograms; bottom panel: kernel density estimates.
fig = plt.figure()
ax1 = fig.add_subplot(211)
sns.distplot(df_valid.loc[df_valid["isFraud"]==0, "predict_prob"], bins=50, hist=True, kde=False, norm_hist=True, color="steelblue", label="not Fraud", ax=ax1)
sns.distplot(df_valid.loc[df_valid["isFraud"]==1, "predict_prob"], bins=50, hist=True, kde=False, norm_hist=True, color="darkred", label="Fraud", ax=ax1)
# The labels are only visible once a legend is drawn.
ax1.legend()

ax2 = fig.add_subplot(212, sharex=ax1)
sns.distplot(df_valid.loc[df_valid["isFraud"]==0, "predict_prob"], bins=50, hist=False, kde=True, norm_hist=False, color="steelblue", label="not Fraud", ax=ax2)
sns.distplot(df_valid.loc[df_valid["isFraud"]==1, "predict_prob"], bins=50, hist=False, kde=True, norm_hist=False, color="darkred", label="Fraud", ax=ax2)
ax2.legend()

# ax2 shares its x axis with ax1, so this limit applies to both panels.
plt.xlim([0, 1])

In [None]:
# Raw-count histogram of the predicted score for the true fraud rows only.
sns.distplot(df_valid.loc[df_valid["isFraud"]==1, "predict_prob"], bins=50, hist=True, kde=False, norm_hist=False, color="darkred", label="Fraud")
# Draw the legend so the label is actually shown.
plt.legend()

In [None]:
# True fraud rows with the lowest predicted scores.
df_valid.query("isFraud == 1").sort_values(by="predict_prob").head()

In [None]:
# True fraud rows with the highest predicted scores.
df_valid.query("isFraud == 1").sort_values(by="predict_prob").tail()

In [None]:
# df_valid["input_rate"] = 1 - df_valid.isna().sum(axis=1) / len(df_valid.columns)
# df_valid["input_rate"].describe()

In [None]:
# sns.lmplot(x="predict_prob", y="input_rate", data=df_valid.query('isFraud == 1'))

In [None]:
# sns.relplot(x="predict_prob", y="input_rate", col="isFraud", data=df_valid)

In [None]:
# grid = sns.FacetGrid(df_valid, col="specifyIdentity", row="isFraud")
# grid.map(sns.distplot, "predict_prob", hist=False, kde=True)

In [None]:
# def plot_learning_curve(estimator, X, y, scoring_metrix, cv, 
#                         train_sizes=np.linspace(0.1, 1.0, 10), 
#                         title="learning curve", ylim=(0.0, 1.01)):
#     plt.figure()
#     train_sizes, train_scores, test_scores = learning_curve(estimator, X, y, 
#         groups=None, train_sizes=train_sizes, cv=cv, scoring=scoring_metrix, 
#         exploit_incremental_learning=False, n_jobs=-1, pre_dispatch='all', 
#         verbose=0, shuffle=False, random_state=0, error_score='raise-deprecation')
#     train_scores_mean = np.mean(train_scores, axis=1)
#     train_scores_std = np.std(train_scores, axis=1)
#     test_scores_mean = np.mean(test_scores, axis=1)
#     test_scores_std = np.std(test_scores, axis=1)

#     plt.fill_between(train_sizes, train_scores_mean - train_scores_std, 
#         train_scores_mean + train_scores_std, alpha=0.1, color='royalblue')
#     plt.fill_between(train_sizes, test_scores_mean - test_scores_std, 
#         test_scores_mean + test_scores_std, alpha=0.1, color='orange')
#     plt.plot(train_sizes, train_scores_mean, 'o-', color='royalblue', label="training score")
#     plt.plot(test_sizes, test_scores_mean, 'o-', color='royalblue', label="cross-validation score")

#     plt.set_title(title)
#     plt.grid()
#     plt.xlabel("training sample")
#     plt.ylabel("score")
#     plt.ylim(ylim)
#     plt.legend(loc='lower right')

#     return plt

In [None]:
# plot_learning_curve(gbm, X_valid, y_valid, scoring_metrix='roc_auc', cv=StratifiedShuffleSplit(n_splits=5, test_size=0.2, random_state=0))

In [None]:
# AUC per boosting round for both evaluation sets recorded during training.
for set_name in ('Train', 'Eval'):
    plt.plot(evaluation_results[set_name]['auc'], label=set_name)
plt.title('Training performance')
plt.xlabel('Boosting round')
plt.ylabel('auc')
plt.legend()

In [None]:
def plot_roc_curve(y, y_pred_prob, title="ROC curve"):
    """Draw the ROC curve of `y_pred_prob` against the true labels `y`.

    The positive class is label 1. The area under the curve is shown in the
    legend with four decimals.

    Args:
        y: true labels.
        y_pred_prob: predicted scores for the positive class.
        title: title of the plot.

    Returns:
        The `matplotlib.pyplot` module, after drawing on the current figure.
    """
    fpr, tpr, _ = roc_curve(y, y_pred_prob, pos_label=1)
    curve_label = "ROC curve (area = %.4f)" % auc(fpr, tpr)

    plt.plot(fpr, tpr, label=curve_label)
    plt.title(title)
    plt.grid()
    plt.xlabel("false positive rate")
    plt.ylabel("true positive rate")
    plt.legend()

    return plt

In [None]:
# ROC curve of the model on the validation split.
plot_roc_curve(y_valid, y_pred_prob)

In [None]:
def plot_score_changed_thresholds(y, y_pred_prob, title="model-score changed thresholds"):
    """Plot precision, recall and F-measure as functions of the decision threshold.

    Args:
        y: true labels (positive class is label 1).
        y_pred_prob: predicted scores for the positive class.
        title: title of the plot.

    Returns:
        The `matplotlib.pyplot` module, after drawing on the current figure.
    """
    precision, recall, thresholds = precision_recall_curve(y, y_pred_prob, pos_label=1)
    # Pad with a final threshold of 1 so that thresholds can be plotted
    # against the precision / recall arrays.
    thresholds = np.append(thresholds, 1)
    # F-measure; defined as 0 where precision + recall == 0 instead of
    # producing NaN and a divide-by-zero warning.
    denominator = precision + recall
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator, dtype=float),
        where=denominator > 0,
    )

    palette = sns.color_palette('tab10', 4)
    plt.plot(thresholds, precision, linestyle='-', color=palette[0])
    plt.plot(thresholds, recall, linestyle='--', color=palette[1])
    plt.plot(thresholds, fscore, linestyle='-.', color=palette[2])

    leg = plt.legend(("precision", "recall", "f-measure"), loc='upper right', frameon=True)
    leg.get_frame().set_edgecolor('k')
    plt.grid()
    # Honour the caller-supplied title (it used to be ignored).
    plt.title(title)
    plt.xlim([0, 1])
    plt.xlabel("thresholds")
    plt.ylim([0, 1])
    plt.ylabel("score")

    return plt


In [None]:
# Precision / recall / F-measure against the threshold, on the validation split.
plot_score_changed_thresholds(y_valid, y_pred_prob)

In [None]:
# y_pred = np.where(y_pred_prob > 0.1, 1, 0)
# confusion_matrix(y_valid, y_pred)

In [None]:
# Build the submission from key_test, the TransactionID column that was split
# off X_test. It is row-aligned with X_test by construction, so each prediction
# is paired with its own ID, and the large test CSV does not have to be read again.
sub = pd.DataFrame({
    "TransactionID": key_test.values,
    "isFraud": gbm.predict(X_test),
})
sub.to_csv("submission.csv", index = False)

In [None]:
def write_spreadsheet(*args):
    """POST the positional arguments, JSON-encoded as one array, to the Apps Script endpoint.

    Args:
        *args: JSON-serialisable values; they are sent together as the request body.

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            endpoint does not answer within the timeout.
    """
    endpoint = 'https://script.google.com/macros/s/AKfycbxZhZc3lPJ6eLATt_r9dPVQZUjuzpIvQ6vjSYFZdLOlt1TqWvtC/exec'
    # Without a timeout an unresponsive endpoint would block the notebook indefinitely.
    requests.post(endpoint, json.dumps(args), timeout=30)

In [None]:
# write_spreadsheet('baseline', *scores)
# write_spreadsheet('baseline', auc)

# explain

In [None]:
# features = ["czrd1", "TransactionAmt", "card2", "C13", "D2"]
# partial_dependence(gbm, X_train, features)

In [None]:
# explainer = shap.TreeExplainer(gbm)
# shap_values = explainer.shap_values(X_valid)

In [None]:
# df_shap = pd.DataFrame(data=shap_values, index=key_valid, columns=X_valid.columns)
# df_shap.to_csv("shap_value.csv", index = False)

In [None]:
# shap.initjs()

In [None]:
# shap.summary_plot(shap_values, X_valid)

In [None]:
# sns.relplot(x="predict_prob", y="card1", col="isFraud", data=df_valid)

In [None]:
# sns.relplot(x="predict_prob", y="TransactionAmt", col="isFraud", data=df_valid)

In [None]:
# shap.dependence_plot("card1", shap_values, X_valid)

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[0,:], X_valid.iloc[0,:])

In [None]:
# index_key = df_valid["predict_prob"].reset_index(drop=True).idxmax()
# shap.force_plot(explainer.expected_value, shap_values[index_key,:], X_valid.iloc[index_key,:])

In [None]:
# index_key = df_valid["predict_prob"].reset_index(drop=True).idxmin()
# shap.force_plot(explainer.expected_value, shap_values[index_key,:], X_valid.iloc[index_key,:])

In [None]:
# shap.force_plot(explainer.expected_value, shap_values[1:1000], X_valid[1:1000])

In [None]:
# shap_interaction_values = explainer.shap_interaction_values(train_X)

In [None]:
# shap.summary_plot(shap_interaction_values, train_X)

In [None]:
def send_line_notification(message):
    """Send `message` through the LINE Notify API.

    The access token is read from the LINE_NOTIFY_TOKEN environment variable
    instead of being stored in the notebook. When the variable is not set, the
    notification is skipped with a printed note so the notebook still runs to
    the end.

    SECURITY: a token used to be hard-coded here; it should be revoked, because
    it remains visible in earlier versions of this notebook.

    Args:
        message: text to send; it is prefixed with a newline before sending.

    Raises:
        requests.exceptions.RequestException: on connection errors or when the
            API does not answer within the timeout.
    """
    import os  # local import: only this helper needs it

    line_token = os.environ.get('LINE_NOTIFY_TOKEN')
    if not line_token:
        print("LINE_NOTIFY_TOKEN is not set; skipping LINE notification.")
        return

    endpoint = 'https://notify-api.line.me/api/notify'
    message = "\n{}".format(message)
    payload = {'message': message}
    headers = {'Authorization': 'Bearer {}'.format(line_token)}
    # Without a timeout an unresponsive API would block the notebook indefinitely.
    requests.post(endpoint, data=payload, headers=headers, timeout=10)

In [None]:
# Signal that the notebook has run to the end.
send_line_notification("execution complete!")