# setting

In [None]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import sys
sys.path.append('../')
import common
import gc
import math
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pandas_profiling as pdp
import missingno as msno
from matplotlib.backends.backend_pdf import PdfPages

# Notebook-wide pandas display settings: show up to 500 columns / 100 rows and
# render floats with two decimals (leading space reserved for the sign).
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100
pd.options.display.float_format = '{: .2f}'.format

# dataset

In [None]:
# Training transactions plus their identity records; every identity row is
# flagged with specifyIdentity = 1 before the join.
df_transaction_train = pd.read_csv("../input/ieee-fraud-detection/train_transaction.csv")
df_identity_train = pd.read_csv("../input/ieee-fraud-detection/train_identity.csv").assign(specifyIdentity=1)

# The left join keeps every transaction. Transactions without an identity row
# come out of the merge with specifyIdentity = NaN, which is turned into 0.
df_train = (
    df_transaction_train
    .merge(df_identity_train, how='left', on="TransactionID")
    .assign(
        isTest=0,
        specifyIdentity=lambda merged: merged["specifyIdentity"].fillna(0),
    )
)
df_train.describe()

In [None]:
# Test transactions plus their identity records; every identity row is flagged
# with specifyIdentity = 1 before the join.
df_transaction_test = pd.read_csv("../input/ieee-fraud-detection/test_transaction.csv")
df_identity_test = pd.read_csv("../input/ieee-fraud-detection/test_identity.csv").assign(specifyIdentity=1)

# The left join keeps every transaction. Transactions without an identity row
# come out of the merge with specifyIdentity = NaN, which is turned into 0.
df_test = (
    df_transaction_test
    .merge(df_identity_test, how='left', on="TransactionID")
    .assign(
        isTest=1,
        specifyIdentity=lambda merged: merged["specifyIdentity"].fillna(0),
    )
)
df_test.describe()

In [None]:
# Stack train on top of test and renumber the rows 0..n-1.
df = pd.concat([df_train, df_test], ignore_index=True)
df.describe()

In [None]:
# Everything needed now lives in `df`; drop the intermediate frames and ask the
# garbage collector to reclaim them.
del df_transaction_train, df_identity_train, df_train
del df_transaction_test, df_identity_test, df_test

gc.collect()

* Constant value: C5, C9, etc.
* All null: D11, V1, V10, dist1, etc.

In [None]:
df.info()

In [None]:
df = common.reduce_mem_usage(df)

In [None]:
df.info()

In [None]:
# Print the name of every column whose dtype is still `object`.
for col, col_dtype in df.dtypes.items():
    if col_dtype == object:
        print(col)

In [None]:
# Columns handled as categorical features, in the same order as the original
# hand-written list.
list_categorical_feature = [
    "ProductCD",
    *["card{}".format(i) for i in range(1, 7)],
    "addr1", "addr2",
    "P_emaildomain", "R_emaildomain",
    *["M{}".format(i) for i in range(1, 10)],
    "DeviceType", "DeviceInfo",
    *["id_{}".format(i) for i in range(12, 39)],
]

In [None]:
df[list_categorical_feature] = df[list_categorical_feature].astype('category')

In [None]:
df.info()

In [None]:
# Columns with exactly one distinct value; NaN is counted as a value, so an
# all-NaN column qualifies as constant too.
unique_counts = df.nunique(dropna=False)
list_col_constant_val = unique_counts[unique_counts == 1].index.tolist()
list_col_constant_val

In [None]:
df = df.drop(columns=list_col_constant_val)

In [None]:
df.info()

# preprocessing

In [None]:
# Row-wise "input rate": the share of feature columns that are populated.
#
# Bookkeeping columns and the label are left out of BOTH the NaN count and the
# denominator. Previously the NaN count covered every column while the
# denominator excluded three of them; in particular the label isFraud
# (presumably missing for every test row - TODO confirm) lowered the rate of
# those rows. "input_rate" itself is excluded so that re-running this cell
# yields the same values.
non_feature_cols = [
    col
    for col in ["TransactionID", "isTest", "specifyIdentity", "isFraud", "input_rate"]
    if col in df.columns
]
n_feature_cols = len(df.columns) - len(non_feature_cols)

# NaNs among feature columns = NaNs in all columns - NaNs in excluded columns.
# This avoids materialising a copy of the (large) feature sub-frame.
na_per_row = df.isna().sum(axis=1) - df[non_feature_cols].isna().sum(axis=1)

df["input_rate"] = (1 - na_per_row / n_feature_cols).astype('float32')
df["input_rate"].describe()

In [None]:
# https://www.kaggle.com/danofer/ieee-fraud-features-xgboost-0-934-lb

START_DATE = "2017-12-01"
startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# TransactionDT is treated as a number of seconds elapsed since START_DATE.
# The conversion is one vectorised operation (no Python-level apply per row)
# and is skipped when the column is already datetime, so the cell can be
# re-run without raising.
if not pd.api.types.is_datetime64_any_dtype(df["TransactionDT"]):
    df["TransactionDT"] = pd.Timestamp(startdate) + pd.to_timedelta(df["TransactionDT"], unit="s")

# df["month"] = df["TransactionDT"].dt.month
df["TransactionDT_dow"] = df["TransactionDT"].dt.dayofweek
df["TransactionDT_hour"] = df["TransactionDT"].dt.hour
df["TransactionDT_day"] = df["TransactionDT"].dt.day

# Day of month 1-9 -> "head", 10-19 -> "middle", 20 and later -> "tail".
# np.select evaluates the conditions in order, like the if/elif chain it replaces.
df["TransactionDT_part_of_month"] = np.select(
    [df["TransactionDT_day"] < 10, df["TransactionDT_day"] < 20],
    ["head", "middle"],
    default="tail",
)

#  df.drop(["TransactionDT"],axis=1,inplace=True)

In [None]:
# #https://www.kaggle.com/c/ieee-fraud-detection/discussion/100499#latest-579654
dict_emails = {'gmail': 'google', 'att.net': 'att', 'twc.com': 'spectrum', 'scranton.edu': 'other', 'optonline.net': 'other', 'hotmail.co.uk': 'microsoft', 'comcast.net': 'other', 'yahoo.com.mx': 'yahoo', 'yahoo.fr': 'yahoo', 'yahoo.es': 'yahoo', 'charter.net': 'spectrum', 'live.com': 'microsoft', 'aim.com': 'aol', 'hotmail.de': 'microsoft', 'centurylink.net': 'centurylink', 'gmail.com': 'google', 'me.com': 'apple', 'earthlink.net': 'other', 'gmx.de': 'other', 'web.de': 'other', 'cfl.rr.com': 'other', 'hotmail.com': 'microsoft', 'protonmail.com': 'other', 'hotmail.fr': 'microsoft', 'windstream.net': 'other', 'outlook.es': 'microsoft', 'yahoo.co.jp': 'yahoo', 'yahoo.de': 'yahoo', 'servicios-ta.com': 'other', 'netzero.net': 'other', 'suddenlink.net': 'other', 'roadrunner.com': 'other', 'sc.rr.com': 'other', 'live.fr': 'microsoft', 'verizon.net': 'yahoo', 'msn.com': 'microsoft', 'q.com': 'centurylink', 'prodigy.net.mx': 'att', 'frontier.com': 'yahoo', 'anonymous.com': 'other', 'rocketmail.com': 'yahoo', 'sbcglobal.net': 'att', 'frontiernet.net': 'yahoo', 'ymail.com': 'yahoo', 'outlook.com': 'microsoft', 'mail.com': 'other', 'bellsouth.net': 'other', 'embarqmail.com': 'centurylink', 'cableone.net': 'other', 'hotmail.es': 'microsoft', 'mac.com': 'apple', 'yahoo.co.uk': 'yahoo', 'netzero.com': 'other', 'yahoo.com': 'yahoo', 'live.com.mx': 'microsoft', 'ptd.net': 'other', 'cox.net': 'other', 'aol.com': 'aol', 'juno.com': 'other', 'icloud.com': 'apple'}
list_us_emails = ['gmail', 'net', 'edu']

def _email_suffix(domain):
    """Return the part of ``domain`` after its last '.', after converting it to str."""
    return str(domain).split('.')[-1]


def _collapse_us_suffix(suffix):
    """Return 'us' for suffixes listed in ``list_us_emails``; otherwise return ``suffix`` unchanged."""
    return 'us' if str(suffix) in list_us_emails else suffix


# For each e-mail domain column add "<col>_bin" (domain looked up in
# dict_emails) and "<col>_suffix" (last dot-separated part, with the entries
# of list_us_emails collapsed to 'us').
for email_col in ('P_emaildomain', 'R_emaildomain'):
    bin_col = email_col + '_bin'
    suffix_col = email_col + '_suffix'

    df[bin_col] = df[email_col].map(dict_emails)
    df[suffix_col] = df[email_col].map(_email_suffix)
    df[suffix_col] = df[suffix_col].map(_collapse_us_suffix)

In [None]:
# https://www.kaggle.com/gsnehaa21/federal-holidays-usa-19662020
df_usholidays = pd.read_csv("../input/federal-holidays-usa-19662020/usholidays.csv")

# Keep only the holidays on or after 2017-12-01.
# NOTE: this notebook does `import datetime` (the module), so the class has to
# be spelled datetime.datetime; calling `datetime(...)` raises
# "TypeError: 'module' object is not callable".
holiday_dates = pd.to_datetime(df_usholidays["Date"])
df_usholidays = df_usholidays[holiday_dates >= datetime.datetime(2017, 12, 1)]

In [None]:
# list_wk = [1 if  date in pd.to_datetime(df_usholidays["Date"]) else 0 for date in df["TransactionDT"].dt.date]
# np.count_nonzero(list_wk)

# EDA

In [None]:
msno.matrix(df[1:1000], sort='ascending')

In [None]:
msno.bar(df[1:1000])

In [None]:
msno.heatmap(df[1:1000])

In [None]:
msno.dendrogram(df[1:1000])

In [None]:
# [column, number of distinct values (NaN included)] for every column that is
# either declared categorical or stored with object dtype.
list_categorical_unique_val_num = [
    [col, len(df[col].unique())]
    for col in df.columns
    if col in list_categorical_feature or df[col].dtype == object
]

df_categorical_unique_val_num = pd.DataFrame(
    list_categorical_unique_val_num,
    columns=["feature", "unique_values"],
)
df_categorical_unique_val_num.sort_values("unique_values", ascending=False)

In [None]:
sns.barplot(x="feature", y="unique_values", data=df_categorical_unique_val_num.sort_values("unique_values", ascending=False))
plt.xticks(rotation=90)

In [None]:
# [column, number of distinct values (NaN included)] for every numeric column.
list_neumeric_unique_val_num = [
    [col, len(values.unique())]
    for col, values in df.select_dtypes(include='number').items()
]

df_neumeric_unique_val_num = pd.DataFrame(
    list_neumeric_unique_val_num,
    columns=["feature", "unique_values"],
)
# Show the 50 numeric columns with the fewest distinct values.
df_neumeric_unique_val_num.sort_values("unique_values", ascending=False).tail(50)

In [None]:
def _annotate_stacked_totals(ax):
    """Write the summed height of every stacked bar on ``ax`` above that bar.

    Patches are grouped by their left edge (``get_x()``), so the segments that
    make up one stacked bar are added together before the label is drawn.
    """
    totals = {}
    bar_width = 0
    for patch in ax.patches:
        left = patch.get_x()
        totals[left] = totals.get(left, 0) + patch.get_height()
        bar_width = patch.get_width()
    for left, total in totals.items():
        ax.annotate(str(int(total)), xy=(left + bar_width / 2, total), xytext=(0, 3),
                    textcoords='offset points', ha='center', va='bottom')


def _distplot_by_class(arg_df, col, compare_feature_name, ax, shift=None, **distplot_kwargs):
    """Overlay ``col`` for compare value 0 (steelblue) and 1 (darkred) on ``ax``.

    NaNs are dropped. When ``shift`` is given it is added to the values first
    (used to move non-positive data onto a log axis).
    """
    for value, color in ((0, "steelblue"), (1, "darkred")):
        values = arg_df.loc[arg_df[compare_feature_name] == value, col].dropna()
        if shift is not None:
            values = values + shift
        sns.distplot(values, color=color, label=str(value), ax=ax, **distplot_kwargs)


def _plot_categorical_panels(fig, arg_df, col, compare_feature_name, title):
    """Draw the 2x2 stacked-bar view: counts (top row) and row-normalised shares (bottom row)."""
    compare_values = arg_df[compare_feature_name]
    col_values = arg_df[col]

    # Top-left: counts per compare value, split by the feature's categories.
    ax1 = fig.add_subplot(221)
    pd.crosstab(compare_values, col_values, dropna=True, normalize=False).plot(kind='bar', stacked=True, ax=ax1)
    ax1.set_title(title)
    ax1.set_xlabel("")
    ax1.set_xticks([])
    ax1.set_ylim(top=len(arg_df))
    _annotate_stacked_totals(ax1)

    # Top-right: counts per category, split by compare value.
    ax2 = fig.add_subplot(222)
    pd.crosstab(col_values, compare_values, dropna=True, normalize=False).plot(kind='bar', stacked=True, colormap='coolwarm', ax=ax2)
    ax2.set_xlabel("")
    ax2.set_xticks([])
    ax2.set_ylim(top=len(arg_df))
    _annotate_stacked_totals(ax2)

    # Bottom row: the same two tables, normalised so every bar sums to 1.
    ax3 = fig.add_subplot(223)
    pd.crosstab(compare_values, col_values, dropna=True, normalize='index').plot(kind='bar', legend=False, stacked=True, ax=ax3)

    ax4 = fig.add_subplot(224)
    pd.crosstab(col_values, compare_values, dropna=True, normalize='index').plot(kind='bar', legend=False, stacked=True, colormap='coolwarm', ax=ax4)

    plt.xticks(rotation=90)


def _plot_numeric_panels(fig, arg_df, col, compare_feature_name, unique_val_num, title):
    """Draw the 2x2 distribution view: histogram / log histogram (top), KDE / log KDE (bottom)."""
    # Plain Python floats: arithmetic on small-int / float16 numpy scalars can overflow.
    x_min = float(arg_df[col].min())
    x_max = float(arg_df[col].max())

    # Bin edges must reach x_max, otherwise the largest values fall outside the
    # last bin and are silently left out of the histogram.
    if (0 <= x_min) and (x_max <= 1):
        adjust_bins = np.linspace(0, 1, 21)
    else:
        step = 1 if unique_val_num < 50 else max(1, int(abs(x_max - x_min) / 50))
        adjust_bins = np.arange(x_min, x_max + step, step)

    ax1 = fig.add_subplot(221)
    _distplot_by_class(arg_df, col, compare_feature_name, ax1,
                       bins=adjust_bins, hist=True, kde=False, norm_hist=True)
    ax1.set_title(title)

    # Log-scaled panels need strictly positive data; otherwise shift the values
    # so the minimum lands on 1 (the tick labels are then hidden because they
    # no longer show the real values).
    if 0 < x_min:
        shift = None
        log_x_min = math.floor(np.log10(x_min))
        log_x_max = math.ceil(np.log10(x_max))
    else:
        shift = 1 - x_min
        log_x_min = 0
        log_x_max = math.ceil(np.log10(x_max + shift))
    log_bins = np.logspace(log_x_min, log_x_max, 50)

    ax2 = fig.add_subplot(222)
    _distplot_by_class(arg_df, col, compare_feature_name, ax2, shift=shift,
                       bins=log_bins, hist=True, kde=False, norm_hist=True)
    ax2.set_xscale('log')
    if shift is not None:
        ax2.set_xticks([])

    ax3 = fig.add_subplot(223, sharex=ax1)
    _distplot_by_class(arg_df, col, compare_feature_name, ax3,
                       hist=False, kde=True, norm_hist=False)

    ax4 = fig.add_subplot(224, sharex=ax2)
    _distplot_by_class(arg_df, col, compare_feature_name, ax4, shift=shift,
                       hist=False, kde=True, norm_hist=False)
    ax4.set_xscale('log')
    if shift is not None:
        ax4.set_xticks([])


def plot_all_feature(arg_df, compare_feature_name):
    """Plot every column of ``arg_df`` against a binary column and collect the pages in a PDF.

    One 2x2 figure is produced per column (except "TransactionID" and
    "TransactionDT", which are skipped), shown inline and appended to
    ``./output/plot_feature_compare_<compare_feature_name>.pdf``.

    A column is drawn as stacked bar charts when it is listed in the
    module-level ``list_categorical_feature``, has object dtype, or has fewer
    than 10 distinct values; otherwise as histograms / KDEs for the rows where
    ``compare_feature_name`` equals 0 and 1.

    Parameters
    ----------
    arg_df : pandas.DataFrame
        Data to plot; must contain ``compare_feature_name``.
    compare_feature_name : str
        Name of the column to compare against (its values 0 and 1 are used for
        the numeric plots).

    Notes
    -----
    * The "input_rate" in each title is computed from ``arg_df`` (the data that
      is actually plotted), not from the notebook-level ``df``.
    * The output directory is created if it does not exist.
    * Each figure is closed after it has been saved, and the PDF is closed even
      if plotting raises.
    """
    import os  # local import: only needed to make sure the output folder exists

    pdf_path = "./output/plot_feature_compare_{}.pdf".format(compare_feature_name)
    os.makedirs(os.path.dirname(pdf_path), exist_ok=True)

    with PdfPages(pdf_path) as pp:
        for col in arg_df.columns:
            print(col + " : " + str(arg_df[col].dtype))

            if col in ["TransactionID", "TransactionDT"]:
                print(" ※skip")
                continue

            unique_val_num = len(arg_df[col].unique())
            title = "{} input_rate: {:.2%}".format(col, 1 - arg_df[col].isna().sum(axis=0) / len(arg_df))

            fig = plt.figure(figsize=(12, 9))
            try:
                is_categorical = (
                    (col in list_categorical_feature)
                    or (arg_df[col].dtype == object)
                    or (unique_val_num < 10)
                )
                if is_categorical:
                    _plot_categorical_panels(fig, arg_df, col, compare_feature_name, title)
                else:
                    _plot_numeric_panels(fig, arg_df, col, compare_feature_name, unique_val_num, title)

                # PdfPages.savefig already forces the PDF format; no format kwarg is needed.
                pp.savefig(fig)
                plt.show()
            finally:
                # Release the figure so memory does not grow with the number of columns.
                plt.close(fig)

In [None]:
# plot_all_feature(df.loc[df["isFraud"].isin([0, 1]), ], "isFraud")

In [None]:
# plot_all_feature(df, "isTest")

[todo]
* ~~na率をヒストグラムに表示~~
* ~~スキップしたカテゴリ過多"DeviceInfo", "P_emaildomain", "R_emaildomain", "addr1"~~
* ~~相対棒グラフでnaのtrain/test比も集計したい~~
* ~~棒グラフに件数表示~~
* ~~対数変換できないものは最小値>0に変換する~~

In [None]:
# pp = PdfPages("./output/plot_feature_compare_label.pdf")

# for col in df_train.columns:
#     print(col + " : " + str(df_train[col].dtype))
    
#     if col in ["TransactionID", "TransactionDT"]:
#         print(" ※skip")
#         continue
    
#     unique_val_num = len(df_train[col].unique())
        
#     fig = plt.figure(figsize=(12, 9))
    
#     if (col in list_categorical_feature) | (df_train[col].dtype == object) | (unique_val_num < 10):
#         ax1 = fig.add_subplot(221)
# #             df_train.groupby(["isFraud"])[col].value_counts(dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax1)
#         pd.crosstab(df_train["isFraud"], df_train[col], dropna=True, normalize =False).plot(kind='bar', stacked=True, ax=ax1)
#         ax1.set_title("{} input_rate: {:.2%}".format(col, 1 - df[col].isna().sum(axis=0) / len(df)))
#         ax1.set_xlabel("")
#         ax1.set_xticks([])
#         ax1.set_ylim(ymax=len(df))

#         ax2 = fig.add_subplot(222)
# #             bars = df_train.groupby([col])["isFraud"].value_counts(dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, colormap='coolwarm', ax=ax2)
#         bar = pd.crosstab(df_train[col], df_train["isFraud"], dropna=True, normalize =False).plot(kind='bar', stacked=True, ax=ax2)
#         ax2.set_xlabel("")
#         ax2.set_xticks([])
#         ax2.set_ylim(ymax=len(df))
#         dict_xy = {}
#         for bar in bars.patches:
#             if bar.get_x() not in dict_xy:
#                 dict_xy[bar.get_x()] = bar.get_height()
#             else:
#                 dict_xy[bar.get_x()] = dict_xy.get(bar.get_x()) + bar.get_height()
#         for dict_x, dict_y in dict_xy.items():
#             ax2.annotate(int(dict_y), xy=(dict_x + bar.get_width() / 2, dict_y), xytext=(0, 3), textcoords='offset points', ha='center', va='bottom')

#         ax3 = fig.add_subplot(223)
# #             df_train.groupby(["isFraud"])[col].value_counts(normalize=True, dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax3)
#         pd.crosstab(df_train["isFraud"], df_train[col], dropna=True, normalize ='index').plot(kind='bar', stacked=True, ax=ax3)

#         ax4 = fig.add_subplot(224)
# #             df_train.groupby([col])["isFraud"].value_counts(normalize=True, dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, colormap='coolwarm', ax=ax4)
#         pd.crosstab(df_train[col], df_train["isFraud"], dropna=True, normalize ='index').plot(kind='bar', stacked=True, ax=ax4)

#         plt.xticks(rotation=90)

#     else:
#         x_min = df_train[col].min()
#         x_max = df_train[col].max()
        
#         if (0 <= x_min) & (x_max <= 1):
#             adjust_bins = np.arange(0, 1, 0.05)
#         elif unique_val_num < 50:
#             adjust_bins = np.arange(x_min, x_max, 1)
#         else:
#             adjust_bins = np.arange(x_min, x_max, max(1, int(abs(x_max - x_min) / 50)))
        
#         ax1 = fig.add_subplot(221)
#         sns.distplot(df_train.loc[df_train["isFraud"]==0, col].dropna(), bins=adjust_bins, hist=True, kde=False, norm_hist=True, color="steelblue", label="non Fraud", ax=ax1)
#         sns.distplot(df_train.loc[df_train["isFraud"]==1, col].dropna(), bins=adjust_bins, hist=True, kde=False, norm_hist=True, color="darkred",  label="Fraud", ax=ax1)
#         ax1.set_title("{} input_rate: {:.2%}".format(col, df[col].isna().sum(axis=0) / len(df)))

#         ax2 = fig.add_subplot(222)
#         if 0 < df_train[col].min():
#             log_x_min = math.floor(np.log10(df_train[col].min()))
#             log_x_max = math.ceil(np.log10(df_train[col].max())) 
#             sns.distplot(df_train.loc[df_train["isFraud"]==0, col].dropna(), bins=np.logspace(log_x_min, log_x_max, 50), hist=True, kde=False, norm_hist=True, color="steelblue",  label="non Fraud", ax=ax2)
#             sns.distplot(df_train.loc[df_train["isFraud"]==1, col].dropna(), bins=np.logspace(log_x_min, log_x_max, 50), hist=True, kde=False, norm_hist=True, color="darkred", label="Fraud", ax=ax2)
#             ax2.set_xscale('log')

#         ax3 = fig.add_subplot(223, sharex=ax1)
#         sns.distplot(df_train.loc[df_train["isFraud"]==0, col].dropna(), hist=False, kde=True, norm_hist=False, color="steelblue", label="non Fraud", ax=ax3)
#         sns.distplot(df_train.loc[df_train["isFraud"]==1, col].dropna(), hist=False, kde=True, norm_hist=False, color="darkred", label="Fraud", ax=ax3)

#         ax4 = fig.add_subplot(224, sharex=ax2)
#         if 0 < df_train[col].min():
#             sns.distplot(df_train.loc[df_train["isFraud"]==0, col].dropna(), hist=False, kde=True, norm_hist=False, color="steelblue", label="non Fraud", ax=ax4)
#             sns.distplot(df_train.loc[df_train["isFraud"]==1, col].dropna(), hist=False, kde=True, norm_hist=False, color="darkred", label="Fraud", ax=ax4)
#             ax4.set_xscale('log')

#     plt.show()
#     pp.savefig(fig, forrmat='pdf')
#     fig.clf()
# pp.close()

In [None]:
# pp = PdfPages("./output/plot_feature_compare_train_test.pdf")

# for col in df.columns:
#     print(col + " : " + str(df[col].dtype))
    
#     if col in ["TransactionID", "TransactionDT"]:
#         print(" ※skip")
#         continue
    
#     unique_val_num = len(df[col].unique())
        
#     fig = plt.figure(figsize=(12, 9))
    
#     if (col in list_categorical_feature) | (df[col].dtype == object) | (unique_val_num < 10):

#         ax1 = fig.add_subplot(221)
# #         df.groupby(["isTest"])[col].value_counts(dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax1)
#         pd.crosstab(df_train["isTest"], df_train[col], dropna=True, normalize =False).plot(kind='bar', stacked=True, ax=ax1)
#         ax1.set_title("{} input_rate: {:.2%}".format(col, df[col].isna().sum(axis=0) / len(df)))

#         ax2 = fig.add_subplot(222)
# #         bars = df.groupby([col])["isTest"].value_counts(dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax2)
#         bars = pd.crosstab(df_train[col], df_train["isTest"], dropna=True, normalize =False).plot(kind='bar', stacked=True, ax=ax2)
#         dict_xy = {}
#         for bar in bars.patches:
#             if bar.get_x() not in dict_xy:
#                 dict_xy[bar.get_x()] = bar.get_height()
#             else:
#                 dict_xy[bar.get_x()] = dict_xy.get(bar.get_x()) + bar.get_height()
#         for dict_x, dict_y in dict_xy.items():
#             ax2.annotate(int(dict_y), xy=(dict_x + bar.get_width() / 2, dict_y), xytext=(0, 3), textcoords='offset points', ha='center', va='bottom')

#         ax3 = fig.add_subplot(223)
# #         df.groupby(["isTest"])[col].value_counts(normalize=True, dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax3)
#         pd.crosstab(df_train["isTest"], df_train[col], dropna=True, normalize =True).plot(kind='bar', stacked=True, ax=ax3)

#         ax4 = fig.add_subplot(224)
# #         df.groupby([col])["isTest"].value_counts(normalize=True, dropna=False).unstack(fill_value=0).plot(kind='bar', stacked=True, ax=ax4)
#         pd.crosstab(df_train[col], df_train["isTest"], dropna=True, normalize =True).plot(kind='bar', stacked=True, ax=ax4)
#         plt.xticks(rotation=90)

#     else:
#         x_min = df[col].min()
#         x_max = df[col].max()
       
#         if (0 <= x_min) & (x_max <= 1):
#             adjust_bins = np.arange(0, 1, 0.05)
#         elif unique_val_num < 50:
#             adjust_bins = np.arange(x_min, x_max, 1)
#         else:
#             adjust_bins = np.arange(x_min, x_max, max(1, int(abs(x_max - x_min) / 50)))
        
#         ax1 = fig.add_subplot(221)
#         sns.distplot(df.loc[df["isTest"]==0, col].dropna(), bins=adjust_bins, hist=True, kde=False, norm_hist=True, color="c", label="train", ax=ax1)
#         sns.distplot(df.loc[df["isTest"]==1, col].dropna(), bins=adjust_bins, hist=True, kde=False, norm_hist=True, color="gold", label="test", ax=ax1)
#         ax1.set_title("{} input_rate: {:.2%}".format(col, df[col].isna().sum(axis=0) / len(df)))
        
#         ax2 = fig.add_subplot(222)
#         if 0 < df_train[col].min():
#             log_x_min = math.floor(np.log10(df_train[col].min()))
#             log_x_max = math.ceil(np.log10(df_train[col].max())) 
#             sns.distplot(df.loc[df["isTest"]==0, col].dropna(), bins=np.logspace(log_x_min, log_x_max, 50), hist=True, kde=False, norm_hist=True, color="c", label="train", ax=ax2)
#             sns.distplot(df.loc[df["isTest"]==1, col].dropna(), bins=np.logspace(log_x_min, log_x_max, 50), hist=True, kde=False, norm_hist=True, color="gold", label="test", ax=ax2)
#             ax2.set_xscale('log')
        
#         ax3 = fig.add_subplot(223, sharex=ax1)
#         sns.distplot(df.loc[df["isTest"]==0, col].dropna(), bins=adjust_bins, hist=False, kde=True, norm_hist=False, color="c", label="train", ax=ax3)
#         sns.distplot(df.loc[df["isTest"]==1, col].dropna(), bins=adjust_bins, hist=False, kde=True, norm_hist=False, color="gold", label="test", ax=ax3)
        
#         ax4 = fig.add_subplot(224, sharex=ax2)
#         if 0 < df_train[col].min():
#             sns.distplot(df.loc[df["isTest"]==0, col].dropna(), bins=np.logspace(0, 3, 50), hist=False, kde=True, norm_hist=False, color="c", label="train", ax=ax4)
#             sns.distplot(df.loc[df["isTest"]==1, col].dropna(), bins=np.logspace(0, 3, 50), hist=False, kde=True, norm_hist=False, color="gold", label="test", ax=ax4)
#             ax4.set_xscale('log')

#     plt.show()
#     pp.savefig(fig, forrmat='pdf')
#     fig.clf()
# pp.close()

In [None]:
# report = pdp.ProfileReport(df)
# report.to_file(outputfile="./output/01-001_data_overview.html")
# report

# investigate

## 入力率（train/test × label）

In [None]:
df_input_rate = pd.DataFrame([])

In [None]:
# Boolean missingness mask for every column. isna() already returns a new
# frame, so the previous df.copy() only duplicated the whole dataset before
# being discarded.
df_na = df.isna()
# Restore the real split / label values (instead of their missingness flags)
# so the mask can be grouped by them.
df_na["isTest"] = df["isTest"]
df_na["isFraud"] = df["isFraud"]

In [None]:
df_input_rate["all"] = 1 - df.isna().sum(axis=0) / len(df)

In [None]:
# Per-column share of non-missing values among training rows with isFraud == 0.
df_input_rate["train_and_non_fraud"] = (
    df.query('isTest == 0 & isFraud == 0')
    .pipe(lambda subset: 1 - subset.isna().sum(axis=0) / len(subset))
)

In [None]:
# Per-column share of non-missing values among training rows with isFraud == 1.
df_input_rate["train_and_fraud"] = (
    df.query('isTest == 0 & isFraud == 1')
    .pipe(lambda subset: 1 - subset.isna().sum(axis=0) / len(subset))
)

In [None]:
# Per-column share of non-missing values among test rows.
df_input_rate["test"] = (
    df.query('isTest == 1')
    .pipe(lambda subset: 1 - subset.isna().sum(axis=0) / len(subset))
)

In [None]:
plt.figure(figsize=(5, 40))
sns.heatmap(df_input_rate, vmin=0, vmax=1, cmap='Reds')

入力率が高いとfraud？<br>
でもデータ単位の入力率では傾向違うから特定の項目群の入力率に着目すべき？

In [None]:
# Row-level input_rate of training rows, isFraud == 0 (top) vs isFraud == 1
# (bottom), on a shared x axis limited to [0, 1].
fig = plt.figure()

ax1 = fig.add_subplot(2, 1, 1)
sns.boxenplot(x="input_rate", data=df.query('isTest == 0 & isFraud == 0'), ax=ax1)
ax1.set(xlabel="", xlim=[0, 1], ylabel="non Fraud")

ax2 = fig.add_subplot(2, 1, 2, sharex=ax1)
sns.boxenplot(x="input_rate", data=df.query('isTest == 0 & isFraud == 1'), ax=ax2)
ax2.set_ylabel("Fraud")

In [None]:
df.query('isTest == 0 & isFraud == 1')

In [None]:
pd.crosstab(df_na["M5"], [df_na["isTest"], df_na["isFraud"]], dropna=False)

## 系統別の相関・散布図

### "C" feature

In [None]:
list_col_startwith_C = [col for col in df.columns if col.startswith("C")]
list_col_startwith_C

In [None]:
msno.heatmap(df[list_col_startwith_C])

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_C].corr(), cmap='Reds', annot=True)

In [None]:
sns.pairplot(df[1:100][list_col_startwith_C])

### "D" feature

In [None]:
# Every column whose name starts with "D", minus the two Device* columns,
# which share the prefix.
list_col_startwith_D = [col for col in df.columns if col.startswith("D")]
for device_col in ("DeviceInfo", "DeviceType"):
    list_col_startwith_D.remove(device_col)
list_col_startwith_D

In [None]:
msno.heatmap(df[list_col_startwith_D])

In [None]:
df[["D11", "D13"]]

In [None]:
pd.crosstab(df["D11"].isna(), df["D13"].isna())

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_D].corr(), cmap='Reds', annot=True)

In [None]:
sns.pairplot(df[1:100][list_col_startwith_D])

### "M" feature

In [None]:
list_col_startwith_M = [col for col in df.columns if col.startswith("M")]
list_col_startwith_M

In [None]:
msno.heatmap(df[list_col_startwith_M])

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_M].corr(), cmap='Reds', annot=True)

In [None]:
df[["D11", "D13"]]

In [None]:
sns.pairplot(df[1:100][list_col_startwith_M])

### "V" feature

In [None]:
list_col_startwith_V = [col for col in df.columns if col.startswith("V")]
list_col_startwith_V

In [None]:
msno.heatmap(df[list_col_startwith_V])

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_V].corr(), cmap='Reds', annot=True)

In [None]:
# sns.pairplot(df[1:100][list_col_startwith_V])

### "card" feature

In [None]:
list_col_startwith_card = [col for col in df.columns if col.startswith("card")]
list_col_startwith_card

In [None]:
msno.heatmap(df[list_col_startwith_card])

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_card].corr(), cmap='Reds', annot=True)

In [None]:
df[["M4", "M7"]]

### "id_" feature

In [None]:
list_col_startwith_id = [col for col in df.columns if col.startswith("id_")]
list_col_startwith_id

In [None]:
msno.heatmap(df[list_col_startwith_id])

In [None]:
plt.figure(figsize=(10, 7.5))
sns.heatmap(df[list_col_startwith_id].corr(), cmap='Reds', annot=True)

### other feature

In [None]:
list_col_numeric = [col for col in df.select_dtypes('number')]
list_col_numeric

## TransactionAmtと相関が強い項目の特定

In [None]:
corr_tranamt_vs_num = df[list_col_numeric].corrwith(df["TransactionAmt"])
corr_tranamt_vs_num.sort_values()

In [None]:
sns.lmplot(x="TransactionAmt", y="V51", data=df[["TransactionAmt", "V51"]].dropna())

In [None]:
corr_tranamt_vs_num = df[list_col_numeric].corrwith(np.log10(df["TransactionAmt"]))
corr_tranamt_vs_num.sort_values()

In [None]:
sns.scatterplot(x=np.log10(df["TransactionAmt"]), y=df["id_02"])

In [None]:
list_col_category = [col for col in df.columns if col not in list_col_numeric]
list_col_category

In [None]:
import statsmodels.api as sm
from statsmodels.formula.api import ols
moore_lm = ols('isFraud ~ DeviceType', data=df).fit()
table = sm.stats.anova_lm(moore_lm, typ=1)
table

In [None]:
table["PR(>F)"]["DeviceType"]

In [None]:
# One-way ANOVA of TransactionAmt against each non-numeric column. Columns with
# more than 60 distinct values (NaN included) are skipped; only columns whose
# p-value is at least 0.05 are printed.
for col in list_col_category:
    if df[col].nunique(dropna=False) > 60:
        print(col)
        print(" ※skip")
        continue

    moore_lm = ols('TransactionAmt ~ {}'.format(col), data=df).fit()
    table = sm.stats.anova_lm(moore_lm, typ=1)
    p_value = table["PR(>F)"][col]

    if p_value >= 0.05:
        print(col)
        print(p_value)

In [None]:
sns.boxplot(x="TransactionAmt", y="M1", data=df[["TransactionAmt", "M1"]].dropna())

In [None]:
sns.boxplot(x="TransactionAmt", y="id_23", data=df[["TransactionAmt", "id_23"]].dropna())

In [None]:
sns.boxplot(x="TransactionAmt", y="id_27", data=df[["TransactionAmt", "id_27"]].dropna())

In [None]:
# df_corr_re = pd.DataFrame()
# series_corr = pd.Series()

# for col in list_col_category:
#     corr = common.correlation_ratio(df[["TransactionAmt", col]])
#     series_corr[col] = corr
# df_corr_re["TransactionAmt"] = series_corr
# df_corr_re

In [None]:
# sns.set()
# plt.figure(figsize=(25,5))
# sns.heatmap(df_corr_re.T, vmin=0, vmax=1, square=True, linewidths=0.5, annot=True, cmap='Reds')

# etc

In [None]:
# START_DATE = '2017-12-01'
# startdate = datetime.datetime.strptime(START_DATE, '%Y-%m-%d')

# train_transaction['TransactionDT'] = train_transaction['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds = x)))
# train_transaction['trans_yymm'] = train_transaction['TransactionDT'].map(lambda x: x.replace(day=1))
# train_transaction['trans_date'] = train_transaction['TransactionDT'].map(lambda x: x.date())

In [None]:
# cat_names = [
#               # Transaction
#               'ProductCD', 'addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
#               *[f'card{i}' for i in range(1,7)],
#               *[f'M{i}' for i in range(1,10)],
#               # Identity
#               'DeviceType', 'DeviceInfo', 
#                *[f'id_{i}' for i in range(12,39)],
#              ]