In [8]:
import pandas as pd

# Load the card-transaction dataset (a CSV in the working directory) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='card_transdata.csv')

In [9]:
# Preview the first ten rows of the loaded frame.
df.head(10)

Unnamed: 0,distance_from_home,distance_from_last_transaction,ratio_to_median_purchase_price,repeat_retailer,used_chip,used_pin_number,online_order,fraud
0,57.877857,0.31114,1.94594,1.0,1.0,0.0,0.0,0.0
1,10.829943,0.175592,1.294219,1.0,0.0,0.0,0.0,0.0
2,5.091079,0.805153,0.427715,1.0,0.0,0.0,1.0,0.0
3,2.247564,5.600044,0.362663,1.0,1.0,0.0,1.0,0.0
4,44.190936,0.566486,2.222767,1.0,1.0,0.0,1.0,0.0
5,5.586408,13.261073,0.064768,1.0,0.0,0.0,0.0,0.0
6,3.724019,0.956838,0.278465,1.0,0.0,0.0,1.0,0.0
7,4.848247,0.320735,1.27305,1.0,0.0,1.0,0.0,0.0
8,0.876632,2.503609,1.516999,0.0,0.0,0.0,0.0,0.0
9,8.839047,2.970512,2.361683,1.0,0.0,0.0,1.0,0.0


In [10]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 8 columns):
distance_from_home                1000000 non-null float64
distance_from_last_transaction    1000000 non-null float64
ratio_to_median_purchase_price    1000000 non-null float64
repeat_retailer                   1000000 non-null float64
used_chip                         1000000 non-null float64
used_pin_number                   1000000 non-null float64
online_order                      1000000 non-null float64
fraud                             1000000 non-null float64
dtypes: float64(8)
memory usage: 61.0 MB


In [11]:
# Collect every row that has a missing value in at least one column.
has_missing = df.isna().any(axis='columns')
null_df = df.loc[has_missing]

In [12]:
#filter rows with inappropriate values
# Allowed values for every 0/1 indicator column.
tf = [0, 1]

# Continuous measurements: a negative distance or price ratio is invalid.
non_negative_cols = [
    'distance_from_home',
    'distance_from_last_transaction',
    'ratio_to_median_purchase_price',
]

# Indicator columns must hold exactly 0 or 1. The fraud label is included
# because an invalid label would otherwise go unnoticed by this check.
binary_cols = [
    'repeat_retailer',
    'used_chip',
    'used_pin_number',
    'online_order',
    'fraud',
]

# Row-wise masks: True when any column of the group violates its rule.
negative_mask = df[non_negative_cols].lt(0).any(axis=1)
non_binary_mask = (~df[binary_cols].isin(tf)).any(axis=1)

# Rows breaking at least one rule; an empty frame means the data is clean.
wrong_value_df = df[negative_mask | non_binary_mask]

In [30]:
# Flag distance_from_home outliers with the 1.5 * IQR rule.

# First and third quartiles, fetched with a single quantile call.
Q1, Q3 = df['distance_from_home'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 interquartile ranges outside the quartiles.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Store a 0/1 indicator column: 1 when the value lies strictly outside either fence.
below_fence = df['distance_from_home'].lt(lower_bound)
above_fence = df['distance_from_home'].gt(upper_bound)
df['distance_from_home_outlier'] = (below_fence | above_fence).astype(int)

In [31]:
# Flag distance_from_last_transaction outliers with the 1.5 * IQR rule.

# First and third quartiles, fetched with a single quantile call.
Q1, Q3 = df['distance_from_last_transaction'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 interquartile ranges outside the quartiles.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Store a 0/1 indicator column: 1 when the value lies strictly outside either fence.
below_fence = df['distance_from_last_transaction'].lt(lower_bound)
above_fence = df['distance_from_last_transaction'].gt(upper_bound)
df['distance_from_last_transaction_outlier'] = (below_fence | above_fence).astype(int)

In [32]:
# Flag ratio_to_median_purchase_price outliers with the 1.5 * IQR rule.

# First and third quartiles, fetched with a single quantile call.
Q1, Q3 = df['ratio_to_median_purchase_price'].quantile([0.25, 0.75])
IQR = Q3 - Q1

# Fences sit 1.5 interquartile ranges outside the quartiles.
lower_bound = Q1 - (1.5 * IQR)
upper_bound = Q3 + (1.5 * IQR)

# Store a 0/1 indicator column: 1 when the value lies strictly outside either fence.
below_fence = df['ratio_to_median_purchase_price'].lt(lower_bound)
above_fence = df['ratio_to_median_purchase_price'].gt(upper_bound)
df['ratio_to_median_purchase_price_outlier'] = (below_fence | above_fence).astype(int)

In [34]:
# Earlier sanity checks, left disabled:
# print(null_df.info())
# print(wrong_value_df.info())

# Print, one per line, how many rows each outlier indicator column marks with 1.
for flag_col in (
    'distance_from_home_outlier',
    'distance_from_last_transaction_outlier',
    'ratio_to_median_purchase_price_outlier',
):
    print(int(df[flag_col].eq(1).sum()))

103631
124367
84386


In [39]:
#Find correlation

#For numeric features
# Restrict the correlation matrix to the numeric columns explicitly, so that a
# non-numeric column can never reach DataFrame.corr() (which raises on such
# columns in recent pandas versions instead of silently dropping them).
numeric_cols = df.select_dtypes(include=['number']).columns
correlations = df[numeric_cols].corr()
# Pearson correlation of every numeric column with fraud, strongest positive first.
print(correlations['fraud'].sort_values(ascending=False))

#For categorical features
# Mean of fraud within each level of an indicator column, i.e. the share of
# fraudulent rows per level (assuming fraud is a 0/1 label, as in the preview).
for col in ['repeat_retailer', 'used_chip', 'used_pin_number', 'online_order']:
    print(df.groupby(col)['fraud'].mean())


fraud                                     1.000000
ratio_to_median_purchase_price_outlier    0.582342
ratio_to_median_purchase_price            0.462305
distance_from_home_outlier                0.193585
online_order                              0.191973
distance_from_home                        0.187571
distance_from_last_transaction            0.091917
distance_from_last_transaction_outlier    0.054688
repeat_retailer                          -0.001357
used_chip                                -0.060975
used_pin_number                          -0.100293
Name: fraud, dtype: float64
repeat_retailer
0.0    0.088449
1.0    0.087262
Name: fraud, dtype: float64
used_chip
0.0    0.100051
1.0    0.063956
Name: fraud, dtype: float64
used_pin_number
0.0    0.096877
1.0    0.002714
Name: fraud, dtype: float64
online_order
0.0    0.013427
1.0    0.127140
Name: fraud, dtype: float64


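1. print(null_df.info()) reports no rows, i.e. the dataset contains no null values.
2. print(wrong_value_df.info()) reports no rows, i.e. no row holds a negative distance/ratio or a non-binary flag.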
3. The number of distance_from_home outliers is 103631.
4. The number of distance_from_last_transaction outliers is 124367.
5. The number of ratio_to_median_purchase_price outliers is 84386.
6. The correlation of each feature with fraud ranges from -0.10 to 0.58. repeat_retailer shows the weakest correlation (-0.001), whereas ratio_to_median_purchase_price_outlier shows the strongest (0.58).