# Visualizing Data

In [95]:
import pandas

# Load the raw transaction and identity tables for the train and test splits.
train_transaction = pandas.read_csv('ieee-fraud-detection/train_transaction.csv')
train_identity = pandas.read_csv('ieee-fraud-detection/train_identity.csv')
test_transaction = pandas.read_csv('ieee-fraud-detection/test_transaction.csv')
test_identity = pandas.read_csv('ieee-fraud-detection/test_identity.csv')

# Left-join on TransactionID so every transaction row is kept, even when it
# has no matching identity row (those identity columns become NaN).
train_set = train_transaction.merge(train_identity, on='TransactionID', how='left')
test_set = test_transaction.merge(test_identity, on='TransactionID', how='left')

# The previous `train_set.append(test_set)` call was dropped: its return value
# was discarded (DataFrame.append never modifies in place) and the method no
# longer exists in pandas >= 2.0, so it was either a wasted copy or a crash.
# Only the training split is analysed from here on.
df = train_set

In [101]:
# Preview the first rows of the merged training frame.
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


# Data Pre-processing

### - Fill NaN with 0

In [102]:
# Number of columns that contain at least one missing value.
df.isna().any().sum()

414

In [103]:
# Replace every missing value with 0; this applies to all columns, so text
# columns end up holding the integer 0 next to their string values.
df = df.fillna(value=0)

In [104]:
# Re-count columns with missing values to confirm the fill left none.
df.isna().any().sum()

0

In [105]:
# Preview the frame after missing values were filled with 0.
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,id_31,id_32,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,W,13926,0.0,150.0,discover,142.0,...,0,0.0,0,0,0,0,0,0,0,0
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,0,0.0,0,0,0,0,0,0,0,0
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,0,0.0,0,0,0,0,0,0,0,0
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,0,0.0,0,0,0,0,0,0,0,0
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,samsung browser 6.2,32.0,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


### Transform categorical data to numerical

In [106]:
# Pull out the object-dtype (non-numeric) columns for separate encoding.
categorical_data = df.select_dtypes(include='object')

In [107]:
# Preview the object-dtype columns that were selected for encoding.
categorical_data.head()

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,W,discover,credit,0,0,T,T,T,M2,F,...,0,0,0,0,0,0,0,0,0,0
1,W,mastercard,credit,gmail.com,0,0,0,0,M0,T,...,0,0,0,0,0,0,0,0,0,0
2,W,visa,debit,outlook.com,0,T,T,T,M0,F,...,0,0,0,0,0,0,0,0,0,0
3,W,mastercard,debit,yahoo.com,0,0,0,0,M0,T,...,0,0,0,0,0,0,0,0,0,0
4,H,mastercard,credit,gmail.com,0,0,0,0,0,0,...,Android 7.0,samsung browser 6.2,2220x1080,match_status:2,T,F,T,T,mobile,SAMSUNG SM-G892A Build/NRD90M


In [108]:
# Keep only the numeric columns in df; the object-dtype columns live on in
# categorical_data.
df = df.drop(columns=categorical_data.columns)

In [109]:
# Preview the frame now that the object-dtype columns have been removed.
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_17,id_18,id_19,id_20,id_21,id_22,id_24,id_25,id_26,id_32
0,2987000,0,86400,68.5,13926,0.0,150.0,142.0,315.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,166.0,0.0,542.0,144.0,0.0,0.0,0.0,0.0,0.0,32.0


In [110]:
from sklearn import preprocessing

# NOTE(review): `encoding` is not used by any cell visible here; kept in case
# a later cell relies on it.
encoding = preprocessing.OneHotEncoder()

# The object columns contain the integer 0 where values were missing; turn it
# into the string "0" so each column holds a single type before encoding.
# Reassign instead of `inplace=True`: categorical_data was derived from df via
# select_dtypes, and mutating such a derived frame in place can trigger
# SettingWithCopyWarning and makes the cell harder to re-run safely.
categorical_data = categorical_data.replace(0, "0")

In [111]:
# Integer-encode every categorical column. The single encoder is refit for
# each column in turn, so afterwards `le` holds only the last column's classes.
le = preprocessing.LabelEncoder()
le_cat_data = categorical_data.apply(lambda column: le.fit_transform(column))
le_cat_data.head()

Unnamed: 0,ProductCD,card4,card6,P_emaildomain,R_emaildomain,M1,M2,M3,M4,M5,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,4,2,2,0,0,2,2,2,3,1,...,0,0,0,0,0,0,0,0,0,0
1,4,3,2,17,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
2,4,4,3,36,0,2,2,2,1,1,...,0,0,0,0,0,0,0,0,0,0
3,4,3,3,54,0,0,0,0,1,2,...,0,0,0,0,0,0,0,0,0,0
4,1,3,2,17,0,0,0,0,0,0,...,8,124,165,4,2,1,2,2,2,955


In [112]:
# Put the integer-encoded categorical columns back next to the numeric ones.
df = pandas.concat((df, le_cat_data), axis='columns')

In [113]:
# Preview the frame with the encoded categorical columns appended.
df.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card3,card5,addr1,addr2,...,id_30,id_31,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo
0,2987000,0,86400,68.5,13926,0.0,150.0,142.0,315.0,87.0,...,0,0,0,0,0,0,0,0,0,0
1,2987001,0,86401,29.0,2755,404.0,150.0,102.0,325.0,87.0,...,0,0,0,0,0,0,0,0,0,0
2,2987002,0,86469,59.0,4663,490.0,150.0,166.0,330.0,87.0,...,0,0,0,0,0,0,0,0,0,0
3,2987003,0,86499,50.0,18132,567.0,150.0,117.0,476.0,87.0,...,0,0,0,0,0,0,0,0,0,0
4,2987004,0,86506,50.0,4497,514.0,150.0,102.0,420.0,87.0,...,8,124,165,4,2,1,2,2,2,955


### Standardize amount and date

In [114]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the raw amount and timestamp columns into new 'amount' and
# 'time' columns, each with its own freshly fitted scaler.
for scaled_name, raw_name in (('amount', 'TransactionAmt'), ('time', 'TransactionDT')):
    # StandardScaler expects a 2-D array, hence the single-column reshape.
    column_2d = df[raw_name].values.reshape(-1, 1)
    df[scaled_name] = StandardScaler().fit_transform(column_2d)

# The raw columns are replaced by their standardized versions.
df = df.drop(columns=['TransactionDT', 'TransactionAmt'])
df.head()



Unnamed: 0,TransactionID,isFraud,card1,card2,card3,card5,addr1,addr2,dist1,dist2,...,id_33,id_34,id_35,id_36,id_37,id_38,DeviceType,DeviceInfo,amount,time
0,2987000,0,13926,0.0,150.0,142.0,315.0,87.0,19.0,0.0,...,0,0,0,0,0,0,0,0,-0.278167,-1.577987
1,2987001,0,2755,404.0,150.0,102.0,325.0,87.0,0.0,0.0,...,0,0,0,0,0,0,0,0,-0.443327,-1.577986
2,2987002,0,4663,490.0,150.0,166.0,330.0,87.0,287.0,0.0,...,0,0,0,0,0,0,0,0,-0.317889,-1.577972
3,2987003,0,18132,567.0,150.0,117.0,476.0,87.0,0.0,0.0,...,0,0,0,0,0,0,0,0,-0.355521,-1.577965
4,2987004,0,4497,514.0,150.0,102.0,420.0,87.0,0.0,0.0,...,165,4,2,1,2,2,2,955,-0.355521,-1.577964


In [115]:
# Separate the target from the features.
# NOTE(review): only isFraud is removed, so X still carries TransactionID.
Y = df['isFraud']
X = df.drop(columns='isFraud')

### Run PCA to get the top 10 principal components

In [116]:
# Project the feature matrix onto its first 10 principal components.
# random_state is pinned so the result is reproducible across runs when
# scikit-learn selects its randomized SVD solver.
# NOTE(review): X is passed in unscaled, so columns with large magnitudes
# will dominate the components; confirm this is intended.
pca = PCA(n_components=10, random_state=0)
pComponents = pca.fit_transform(X.values)

# Reuse X's index so the concat below lines rows up with Y by label even if
# df ever stops having a default 0..n-1 index.
# Column labels are kept exactly as they were (including the unspaced first
# one) so anything that looks columns up by name keeps working.
pDf = pandas.DataFrame(data = pComponents
             , index = X.index
             , columns = ['principalcomponent1', 'principal component 2', 'principal component 3', 'principal component 4', 'principal component 5',
                         'principal component 6', 'principal component 7', 'principal component 8', 'principal component 9', 'principal component 10'])
finalDf = pandas.concat([pDf, Y], axis = 1)
finalDf.head()

Unnamed: 0,principalcomponent1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6,principal component 7,principal component 8,principal component 9,principal component 10,isFraud
0,292239.794213,-55322.265207,-22345.704685,835.587779,-4105.584681,99.520017,-39.461904,-126.183954,-167.99855,-139.163764,0
1,292242.742886,-55322.018323,-22349.853656,497.290673,7061.013504,65.282876,-92.451294,-70.218793,-149.114728,-198.962696,0
2,292241.086365,-55322.195524,-22349.202832,532.901409,5153.436519,58.801461,-99.623193,-79.900319,-150.855339,-198.484838,0
3,292222.538856,-55328.157824,-22338.706225,3237.384702,-8277.851655,1387.683827,1424.935996,160.938363,-294.080958,192.39704,0
4,305467.707326,20468.811631,145331.147569,356.62548,5368.288778,155.44224,-118.537839,366.00924,6704.130916,998.240949,0


In [117]:
# import matplotlib.gridspec as gridspec
# from matplotlib import pyplot as plt
# import seaborn as sns
# from mpl_toolkits.mplot3d import Axes3D 
# plt.style.use('ggplot')
# final_features = finalDf.iloc[:,0:10].columns
# print(final_features)
# gs = gridspec.GridSpec(28, 1)
# for i, cn in enumerate(finalDf[final_features]):
#     ax = plt.subplot(gs[i])
#     sns.distplot(df[cn][df.Class == 1], bins=50)
#     sns.distplot(df[cn][df.Class == 0], bins=50)
#     ax.set_xlabel('')
#     ax.set_title('histogram of feature: ' + str(cn))
# plt.show()