In [41]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/ieee-fraud-detection/sample_submission.csv
/kaggle/input/ieee-fraud-detection/test_identity.csv
/kaggle/input/ieee-fraud-detection/train_identity.csv
/kaggle/input/ieee-fraud-detection/test_transaction.csv
/kaggle/input/ieee-fraud-detection/train_transaction.csv


# Extensive EDA and Modeling XGB Hyperopt

## Competition Ovjective is to detect fraud in transactions

### Data
In this competition you are predicting the probability that an online transaction is fradulent, as denoted by the binary target `isFraud`.

The data is broken into two files **identity** and **transaction**, which are joined by `TransactionID`.

    Note: Not all transactions have corresponding identity information.
   

#### Categorical Features - Transactions
* ProductCD
* emaildomain
* card1-card6
* addr1,addr2
* P_emaildomain
* R_emaildomain
* M1-M9

#### Categorical Features - Identity
* DeviceType
* DeviceInfo
* id_12-id_38

#### The TransactionDT feature is a timedelta from a given reference datetime(not an actual timestamp).

# Questions
I will start exploring based on Categorical Features and Transaction Amounts. The aim is answer some questions like:
* What type of data we have on out data?
* How many cols, rows, missing values we have?
* Whats the target distribution?
* What's the Transactions values distribution of fraud and no fraud transactions?
* What have predominat fraudulent products?
* What features or target shows some interestin patterns?
* And a lot of more questions that will raise trought the exploration.

## Importing necessary libraries

In [42]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Standard plotly imports
#import plotly.plotly as py
import plotly.graph_objs as go
import plotly.tools as tls
from plotly.offline import iplot, init_notebook_mode
#import cufflinks
#import cufflinks as cf
import plotly.figure_factory as ff

# Using plotly + cufflinks in offline mode
init_notebook_mode(connected=True)
#cufflinks.go_offline(connected=True)

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

## Hyperopt modules
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK, STATUS_RUNNING
from functools import partial

import os
import gc
print(os.listdir("../input"))

['ieee-fraud-detection']


## Importing train datasets

In [43]:
df_id = pd.read_csv('/kaggle/input/ieee-fraud-detection/train_identity.csv')
df_trans=pd.read_csv('/kaggle/input/ieee-fraud-detection/train_transaction.csv')

I will set all functions in the cell below

In [50]:
def resumetable(df):
    print(f"Dataset Shape: {df.shape}")
    summary = pd.DataFrame(df.dtypes,columns=['dtypes'])
    summary=summary.reset_index()
    summary["Name"]=summary['index']
    summary=summary[['Name','dtypes']]
    summary['Missing']=df.isnull().sum().values
    summary['Uniques']=df.nunique().values
    summary['First Value']=df.loc[0].values
    summary['Second Value']=df.loc[1].values
    summary['Third Value']=df.loc[2].values
    
    for name in summary['Name'].value_counts().index:
        summary.loc[summary['Name']==name,'Entropy']=round(stats.entropy(df[name].value_counts(normalize=True),base=2),2)
        
    return summary

## Function to reduce the DF size
def reduce_mem_usage(df, verbose=True):
    numerics=['int16','int32','int64','float16','float32','float64']
    start_mem=df.memory_usage().sum()/1024**2
    for col in df.columns:
        col_type=df[col].dtypes
        if col_type in numerics:
            c_min=df[col].min()
            c_max=df[col].max()
            if str(col_type)[:3]=='int':
                if c_min> np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col]=df[col].astype(np.int8)
                elif c_min> np.iinfo(np.int16).min and c_max <np.iinfo(np.int16).max:
                    df[col]=df[col].astype(np.int16)
                elif c_min> np.iinfo(np.int32).min and c_max <np.iinfo(np.int32).max:
                    df[col]=df[col].astype(np.int32)
                elif c_min> np.iinfo(np.int64).min and c_max <np.iinfo(np.int64).max:
                    df[col]=df[col].astype(np.int64)
            else:
                if c_min > np.finfo(np.float16).min and c_max <np.finfo(np.float16).max:
                    df[col]=df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max <np.finfo(np.float32).max:
                    df[col]=df[col].astype(np.float32)
                else:
                    df[col]=df[col].astype(np.float64)
    end_mem=df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100*(start_mem - end_mem)/start_mem))
    return df
    
def CalcOutliers(df_num):
    
    # calculating mena and std of the array
    data_mean,data_std=np.mean(df_num),np.std(df_num)
    
    # setting the cut line to both higher and lower values
    #You can change this value
    cut=data_std*3
    
    # Calculating the higher and lower cut values
    lower,upper= data_mean -cut, data_mean +cut
    
    # creating an array of lower, higher and total outlier values
    outliers_lower=[x for x in df_num if x< lower]
    outliers_higher=[x for x in df_num if x>upper]
    outliers_total=[x for x in df_num if x< lower or x>upper]
    
    #array without outlier values
    outlier_removed=[x for x in df_num if x>lower and x< upper]
    
    print("Identified lowest outliers: %d" %len(outliers_lower))
    #printing total number of values in lower cut of outliers
    print("Identified upper outliers: %d" %len(outliers_higher))
    #printing total number of values in higher cut of outliers
    print("Total outlier observations: %d" %len(outliers_total))
    #printing total number of values in lower cut of both sides
    print("Non-outlier oberservations: %d" %len(outliers_removed))
    #printing total number of non outlier values
    print("Total percentual of Outliers: ",rount((len(outliers_totla)/len(outliers_removed))*100,4))
    #Percentual of outliers in points
    
    return

In [47]:
## Reducing memory
df_trans=reduce_mem_usage(df_trans)
df_id=reduce_mem_usage(df_id)

Mem. usage decreased to 542.35 Mb (0.0% reduction)
Mem. usage decreased to 25.86 Mb (0.0% reduction)


## Knowing the data

In [51]:
resumetable(df_trans)[:25]

Dataset Shape: (590540, 394)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value,Entropy
0,TransactionID,int32,0,590540,2987000,2987001,2987002,19.17
1,isFraud,int8,0,2,0,0,0,0.22
2,TransactionDT,int32,0,573349,86400,86401,86469,19.11
3,TransactionAmt,float16,0,8195,68.5,29.0,59.0,8.1
4,ProductCD,object,0,5,W,W,W,1.28
5,card1,int16,0,13553,13926,2755,4663,9.97
6,card2,float16,8933,500,,404.0,490.0,6.32
7,card3,float16,1565,114,150.0,150.0,150.0,0.68
8,card4,object,1577,4,discover,mastercard,visa,1.09
9,card5,float16,4259,119,142.0,102.0,166.0,2.66


## Target Distribution

In [None]:
df_trans['TransactionAmt']=df_trans['TransactionAmt'].astype(float)
total=len(df_trans)
total_amt=df_trans.groupby(['isFraud'])['TransactionAmt'].sum().sum()
plt.figure(figsize=(16,6))

plt.subplot(121)
g=sns.countplot(x='isFraud',data=df_trans,)
g.set_title("Fraud Transaction Distribution \n# 0: No Fraud | 1: Fraud #",fontsize=22)
g.set_xlabel("Is fraud?",fontsize=18)
g.set_ylabel("Count",fontsize=18)
for p in g.patches:
    height= p. get_height()
    g.text(p.get_x()_+p.get_width()/2.,height +3, '{:1.2f}%'.format(height/total*100), ha="center",fontsize=15)
perc_amt=(df_trans.groupby(["isfraud"])['TransactionAmt'].sum())
perc_amt=perc_amt.reset_index()
plt.subplot(122)
g1=sns.barplot(x='isFraud',y='Transaction Amt \n# 0: No Fraud | 1: Fraud #',fontsize=22)
g1.set_xlabel("Is fraud?",fontsize=18)
g1.set_ylabel("Total Transaction Amount Scalar",fontsize=18)
for p in g1.patches:
    height=p.get_height()
    g1.text(p.get_x()+p.get_width()/2.,height+3,'{:1.2f}%'.format(height/total_amt*100),ha="center",fontsize=15)
plt.show()    

We have 3.5% of Fraud transactions in out dataset.
I think that it would be interesting to see if the amount percentual is higher or lower than 3.5% of total. I will see it later.
We have the same % when considering the Total Transaction Amount by Fraud and No Fraud.
Let's explore the Transaction amount further below.

## Transaction Amount Quantiles

Before Ploting Transaction Amount, let's see the quantiles of Transaction Amount

In [None]:
df_trans['TransactionAmt']=df_trans['TransactionAmt'].astype(float)
print("Transaction Amounts Quantiles:")
print(df_trans['TransactionAmt'].quantile([.01,.025,.1,.25,.5,.75,.9,.975,.99]))

# Ploting Transaction Amount Values Distribution

In [None]:
plt.figure(figsize=(16,12))
plt.suptitle('Transaction Values Distribution', fontsize=22)
plt.subplot(221)
g = sns.distplot(df_trans[df_trans['TransactionAmt'] <= 1000]['TransactionAmt'])
g.set_title("Transaction Amount Distribuition <= 1000", fontsize=18)
g.set_xlabel("")
g.set_ylabel("Probability", fontsize=15)

plt.subplot(222)
g1 = sns.distplot(np.log(df_trans['TransactionAmt']))
g1.set_title("Transaction Amount (Log) Distribuition", fontsize=18)
g1.set_xlabel("")
g1.set_ylabel("Probability", fontsize=15)

plt.figure(figsize=(16,12))


plt.subplot(212)
g4 = plt.scatter(range(df_trans[df_trans['isFraud'] == 0].shape[0]),
                 np.sort(df_trans[df_trans['isFraud'] == 0]['TransactionAmt'].values), 
                 label='NoFraud', alpha=.2)
g4 = plt.scatter(range(df_trans[df_trans['isFraud'] == 1].shape[0]),
                 np.sort(df_trans[df_trans['isFraud'] == 1]['TransactionAmt'].values), 
                 label='Fraud', alpha=.2)
g4= plt.title("ECDF \nFRAUD and NO FRAUD Transaction Amount Distribution", fontsize=18)
g4 = plt.xlabel("Index")
g4 = plt.ylabel("Amount Distribution", fontsize=15)
g4 = plt.legend()

plt.figure(figsize=(16,12))

plt.subplot(321)
g = plt.scatter(range(df_trans[df_trans['isFraud'] == 1].shape[0]), 
                 np.sort(df_trans[df_trans['isFraud'] == 1]['TransactionAmt'].values), 
                label='isFraud', alpha=.4)
plt.title("FRAUD - Transaction Amount ECDF", fontsize=18)
plt.xlabel("Index")
plt.ylabel("Amount Distribution", fontsize=12)

plt.subplot(322)
g1 = plt.scatter(range(df_trans[df_trans['isFraud'] == 0].shape[0]),
                 np.sort(df_trans[df_trans['isFraud'] == 0]['TransactionAmt'].values), 
                 label='NoFraud', alpha=.2)
g1= plt.title("NO FRAUD - Transaction Amount ECDF", fontsize=18)
g1 = plt.xlabel("Index")
g1 = plt.ylabel("Amount Distribution", fontsize=15)

plt.suptitle('Individual ECDF Distribution', fontsize=22)

plt.show()

# Seeing the Quantiles of Fraud and No Fraud Transactions

In [None]:
print(pd.concat([df_trans[df_trans['isFraud'] == 1]['TransactionAmt']\
                 .quantile([.01, .1, .25, .5, .75, .9, .99])\
                 .reset_index(), 
                 df_trans[df_trans['isFraud'] == 0]['TransactionAmt']\
                 .quantile([.01, .1, .25, .5, .75, .9, .99])\
                 .reset_index()],
                axis=1, keys=['Fraud', "No Fraud"]))

# Transaction Amount Outliers
- It's considering outlier values that are highest than 3 times the std from the mean

In [None]:
CalcOutliers(df_trans['TransactionAmt'])

If we consider only values between >= 0 to 800 we will avoid the outliers and has more confidence in our distribution. <br>
We have 10k rows with outliers that represents 1.74% of total rows.

# Now, let's known the Product Feature
- Distribution Products
- Distribution of Frauds by Product
- Has Difference between Transaction Amounts in Products? 

In [None]:
tmp = pd.crosstab(df_trans['ProductCD'], df_trans['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

plt.figure(figsize=(14,10))
plt.suptitle('ProductCD Distributions', fontsize=22)

plt.subplot(221)
g = sns.countplot(x='ProductCD', data=df_trans)
# plt.legend(title='Fraud', loc='upper center', labels=['No', 'Yes'])

g.set_title("ProductCD Distribution", fontsize=19)
g.set_xlabel("ProductCD Name", fontsize=17)
g.set_ylabel("Count", fontsize=17)
g.set_ylim(0,500000)
for p in g.patches:
    height = p.get_height()
    g.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total*100),
            ha="center", fontsize=14) 

plt.subplot(222)
g1 = sns.countplot(x='ProductCD', hue='isFraud', data=df_trans)
plt.legend(title='Fraud', loc='best', labels=['No', 'Yes'])
gt = g1.twinx()
gt = sns.pointplot(x='ProductCD', y='Fraud', data=tmp, color='black', order=['W', 'H',"C", "S", "R"], legend=False)
gt.set_ylabel("% of Fraud Transactions", fontsize=16)

g1.set_title("Product CD by Target(isFraud)", fontsize=19)
g1.set_xlabel("ProductCD Name", fontsize=17)
g1.set_ylabel("Count", fontsize=17)

plt.subplot(212)
g3 = sns.boxenplot(x='ProductCD', y='TransactionAmt', hue='isFraud', 
              data=df_trans[df_trans['TransactionAmt'] <= 2000] )
g3.set_title("Transaction Amount Distribuition by ProductCD and Target", fontsize=20)
g3.set_xlabel("ProductCD Name", fontsize=17)
g3.set_ylabel("Transaction Values", fontsize=17)

plt.subplots_adjust(hspace = 0.6, top = 0.85)

plt.show()

W, C and R are the most frequent values. <br>
We can note that in W, H and R the distribution of Fraud values are slightly higher than the Non-Fraud Transactions

# Card Features
- Based on Competition Description, card features are categoricals.
- Lets understand the distribution of values
- What's the different in transactions and % of Fraud for each values in these features
- Card features has 6 columns, and 4 of them seems to be numericals, so lets see the quantiles and distributions

In [None]:
## Knowning the Card Features
resumetable(df_trans[['card1', 'card2', 'card3','card4', 'card5', 'card6']])

Card2-Card6 has some missing values. We will need to due with it later.

# Numericals Feature Card Quantiles

In [None]:
print("Card Features Quantiles: ")
print(df_trans[['card1', 'card2', 'card3', 'card5']].quantile([0.01, .025, .1, .25, .5, .75, .975, .99]))

We can see that Card 1 and Card 2 has a large distribution of values, so maybe it will be better to get the log of these columns

In [None]:

df_trans.loc[df_trans.card3.isin(df_trans.card3.value_counts()[df_trans.card3.value_counts() < 200].index), 'card3'] = "Others"
df_trans.loc[df_trans.card5.isin(df_trans.card5.value_counts()[df_trans.card5.value_counts() < 300].index), 'card5'] = "Others"

# Visualizing Card 1, Card 2 and Card 3 Distributions
- As the Card 1 and 2 are numericals, I will plot the distribution of them
- in Card 3, as we have many values with low frequencies, I decided to set value to "Others" 
- Also, in Card 3 I set the % of Fraud ratio in yaxis2

In [None]:
tmp = pd.crosstab(df_trans['card3'], df_trans['isFraud'], normalize='index') * 100
tmp = tmp.reset_index()
tmp.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

tmp2 = pd.crosstab(df_trans['card5'], df_trans['isFraud'], normalize='index') * 100
tmp2 = tmp2.reset_index()
tmp2.rename(columns={0:'NoFraud', 1:'Fraud'}, inplace=True)

plt.figure(figsize=(14,22))

plt.subplot(411)
g = sns.distplot(df_trans[df_trans['isFraud'] == 1]['card1'], label='Fraud')
g = sns.distplot(df_trans[df_trans['isFraud'] == 0]['card1'], label='NoFraud')
g.legend()
g.set_title("Card 1 Values Distribution by Target", fontsize=20)
g.set_xlabel("Card 1 Values", fontsize=18)
g.set_ylabel("Probability", fontsize=18)

plt.subplot(412)
g1 = sns.distplot(df_trans[df_trans['isFraud'] == 1]['card2'].dropna(), label='Fraud')
g1 = sns.distplot(df_trans[df_trans['isFraud'] == 0]['card2'].dropna(), label='NoFraud')
g1.legend()
g1.set_title("Card 2 Values Distribution by Target", fontsize=20)
g1.set_xlabel("Card 2 Values", fontsize=18)
g1.set_ylabel("Probability", fontsize=18)

plt.subplot(413)
g2 = sns.countplot(x='card3', data=df_trans, order=list(tmp.card3.values))
g22 = g2.twinx()
gg2 = sns.pointplot(x='card3', y='Fraud', data=tmp, 
                    color='black', order=list(tmp.card3.values))
gg2.set_ylabel("% of Fraud Transactions", fontsize=16)
g2.set_title("Card 3 Values Distribution and % of Transaction Frauds", fontsize=20)
g2.set_xlabel("Card 3 Values", fontsize=18)
g2.set_ylabel("Count", fontsize=18)
for p in g2.patches:
    height = p.get_height()
    g2.text(p.get_x()+p.get_width()/2.,
            height + 25,
            '{:1.2f}%'.format(height/total*100),
            ha="center") 

plt.subplot(414)
g3 = sns.countplot(x='card5', data=df_trans, order=list(tmp2.card5.values))
g3t = g3.twinx()
g3t = sns.pointplot(x='card5', y='Fraud', data=tmp2, 
                    color='black', order=list(tmp2.card5.values))
g3t.set_ylabel("% of Fraud Transactions", fontsize=16)
g3.set_title("Card 5 Values Distribution and % of Transaction Frauds", fontsize=20)
g3.set_xticklabels(g3.get_xticklabels(),rotation=90)
g3.set_xlabel("Card 5 Values", fontsize=18)
g3.set_ylabel("Count", fontsize=18)
for p in g3.patches:
    height = p.get_height()
    g3.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}%'.format(height/total*100),
            ha="center",fontsize=11) 
    
plt.subplots_adjust(hspace = 0.6, top = 0.85)


plt.show()