In [2]:
#import required libraries
from pathlib import Path

import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt 


#to suppress warnings
import warnings
warnings.filterwarnings('ignore')


#Load data
#Upper bound on the number of rows read from each CSV file
MAX_ROWs = 10000

#Folder holding the raw CSV files, relative to the notebook's working directory
DATA_DIR = Path("../data/raw")


def load_raw_csv(file_name, max_rows=MAX_ROWs):
    """Read at most `max_rows` rows from a CSV file stored in DATA_DIR.

    Parameters
    ----------
    file_name : str
        Name of the CSV file inside DATA_DIR.
    max_rows : int or None
        Forwarded to `pd.read_csv` as `nrows`; None reads the whole file.

    Returns
    -------
    pd.DataFrame
        The parsed rows.

    Raises
    ------
    FileNotFoundError
        If the file does not exist. The message contains the resolved
        absolute path so a wrong working directory is easy to spot.
    """
    csv_path = DATA_DIR / file_name
    if not csv_path.is_file():
        raise FileNotFoundError(
            f"No such file: '{csv_path.resolve()}'. "
            f"Put the raw CSV files in '{DATA_DIR}' or point DATA_DIR at their location."
        )
    return pd.read_csv(csv_path, nrows=max_rows)


#Load Train data
train_identity_df = load_raw_csv("train_identity.csv")
train_transaction_df = load_raw_csv("train_transaction.csv")

#Load Test data
test_identity_df = load_raw_csv("test_identity.csv")
test_transaction_df = load_raw_csv("test_transaction.csv")

#check data for transaction
train_transaction_df.head()


FileNotFoundError: [Errno 2] No such file or directory: '../data/raw/train_identity.csv'

In [None]:
print("Credit Card Fraud Detection data set of transaction -  rows:",train_transaction_df.shape[0]," columns:", train_transaction_df.shape[1])

In [None]:
train_transaction_df.describe()

In [None]:
#Check missing data for transaction dataset
total = train_transaction_df.isnull().sum().sort_values(ascending = False)
percent = (train_transaction_df.isnull().sum()/train_transaction_df.isnull().count()*100).sort_values(ascending = False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()

In [None]:
#check data for identity dataset
train_identity_df.head()

In [None]:
print("Credit Card Fraud Detection data set of identity -  rows:",train_identity_df.shape[0]," columns:", train_identity_df.shape[1])

In [None]:
train_identity_df.describe()

In [None]:
#Check missing data for identity dataset
total = train_identity_df.isnull().sum().sort_values(ascending = False)
percent = (train_identity_df.isnull().sum()/train_identity_df.isnull().count()*100).sort_values(ascending = False)
pd.concat([total, percent], axis=1, keys=['Total', 'Percent']).transpose()

In [None]:
#The raw data has many columns and some of them are mostly null, so only a subset is kept
trans_column = [
    'TransactionID', 'isFraud', 'TransactionAmt', 'ProductCD',
    'card1', 'card2', 'card4', 'addr1',
    'P_emaildomain', 'R_emaildomain', 'C1', 'V1',
]
id_column = [
    'id_01', 'id_03', 'id_17', 'id_25',
    'DeviceType', 'DeviceInfo',
]

In [None]:
df_train_trans = train_transaction_df[trans_column].copy()
df_train_trans.describe()

In [None]:
df_train_identity = train_identity_df[id_column].copy()
df_train_identity.describe()

In [None]:
#Join Transaction and Identity dataset
#Rows of the two tables are matched through TransactionID, not through their row position:
#a join on the index would attach identity rows to unrelated transactions.
#id_column carries no key, so TransactionID is taken from train_identity_df (df_train_identity
#shares its index). Assumes train_identity_df has a TransactionID column - TODO confirm.
identity_keyed = df_train_identity.assign(TransactionID=train_identity_df['TransactionID'])
train_df = df_train_trans.merge(
    identity_keyed,
    how='left',              #keep every transaction, with or without identity data
    on='TransactionID',
    validate='one_to_one',   #fail loudly if a TransactionID is duplicated on either side
)

In [None]:
train_df.head()

In [None]:
train_df.info()

In [None]:
train_df.isnull().sum()

In [None]:
#clean columns
#function to clean columns containing numerical values
def impute_median(df, column_name):
    """Replace missing entries of a numeric column with the column median.

    Every value that cannot be parsed as a number (NaN/None, 'NA', 'n/a',
    any other text) is treated as missing. The median is computed from the
    remaining values and written into the gaps. The column ends up with a
    numeric dtype instead of a mix of strings and floats.

    Parameters
    ----------
    df : pd.DataFrame
        Frame holding the column; it is modified in place.
    column_name : str
        Name of the column to clean.

    Returns
    -------
    pd.DataFrame
        The same `df` object, to allow `df = impute_median(df, col)`.

    Raises
    ------
    KeyError
        If `column_name` is not a column of `df`.
    """
    # errors='coerce' turns every unparseable marker into NaN, so no
    # substring/regex matching on the text 'NA' is needed.
    numeric_values = pd.to_numeric(df[column_name], errors='coerce')
    # median() skips NaN; it is NaN itself when nothing is parseable, in which
    # case the column is left entirely missing.
    median_value = numeric_values.median()
    df[column_name] = numeric_values.fillna(median_value)

    return df

#numeric columns cleaned with impute_median, listed in processing order
median_imputed_columns = ['card2', 'id_25', 'addr1', 'id_03', 'id_17', 'V1']

for numeric_column in median_imputed_columns:
    train_df = impute_median(train_df, numeric_column)

In [None]:
train_df.isnull().sum()

In [None]:
#Clean categorical columns
#train_df['card4'].value_counts()
train_df['card4'] = train_df['card4'].fillna('visa')

In [None]:
#clean column P_emaildomain
#train_df['P_emaildomain'].value_counts()
train_df['P_emaildomain'] = train_df['P_emaildomain'].fillna('gmail.com')

In [None]:
#Clean column R_emaildomain
#train_df['R_emaildomain'].value_counts()
train_df['R_emaildomain'] = train_df['R_emaildomain'].fillna('gmail.com')

In [None]:
#clean column DeviceType
#train_df['DeviceType'].value_counts()
train_df['DeviceType'] = train_df['DeviceType'].fillna('desktop')

In [None]:
#clean column DeviceInfo
#train_df['DeviceInfo'].value_counts()
train_df['DeviceInfo'] = train_df['DeviceInfo'].fillna('Windows')

In [None]:
train_df.isnull().sum()

In [None]:
#Data exploration
train_df['isFraud'].value_counts()

In [None]:
sns.countplot(x = 'isFraud', data = train_df, palette='hls', )
plt.show()

In [None]:
count_no_fraud = len(train_df[train_df['isFraud']==0])
count_fraud = len(train_df[train_df['isFraud']==1])
pct_of_no_fraud = count_no_fraud/(count_no_fraud + count_fraud)
print("percentage of not fraud transaction is", pct_of_no_fraud*100)
pct_of_sub = count_fraud/(count_no_fraud + count_fraud)
print("percentage of fraud transaction is", pct_of_sub*100)

In [None]:
train_df.groupby('isFraud').mean(numeric_only=True)

In [None]:
# We can calculate categorical means for other categorical variables such as card4, deviceType 
# to get a more detailed sense of our data.
train_df.groupby('ProductCD').mean(numeric_only=True)

In [None]:
train_df.groupby('card4').mean(numeric_only=True)

In [None]:
train_df.groupby('DeviceType').mean(numeric_only=True)

In [None]:
#Visualization
%matplotlib inline
pd.crosstab(train_df.ProductCD,train_df.isFraud).plot(kind='bar')
plt.title('No. of transactions for ProductCD')
plt.xlabel('ProductCD')
plt.ylabel('No. of transaction')
#plt.savefig('purchase_fre_job')

In [None]:
%matplotlib inline
pd.crosstab(train_df.card4,train_df.isFraud).plot(kind='bar')
plt.title('No. of transactions for card type')
plt.xlabel('Card Type')
plt.ylabel('No. of transaction')
#plt.savefig('purchase_fre_job')

In [None]:
%matplotlib inline
pd.crosstab(train_df.P_emaildomain,train_df.isFraud).plot(kind='bar')
plt.title('No. of transactions for Email domain')
plt.xlabel('Emaildomain')
plt.ylabel('No. of transaction')
#plt.savefig('purchase_fre_job')

In [None]:
train_df.corr(numeric_only=True)

In [None]:
sns.heatmap(train_df.corr(numeric_only=True), center=0)

In [None]:
train_df['TransactionAmt'].plot(kind = 'kde')

In [None]:
train_df['V1'].plot(kind = 'kde')