In [1]:
import pandas as pd
# pd.set_option('display.max_rows', 500)
# pd.set_option('display.max_columns', 500)
# pd.set_option('display.width', 1000)

In [2]:
import numpy as np 
from numba import jit

@jit
def fast_auc(y_true, y_prob):
    y_true = np.asarray(y_true)
    y_true = y_true[np.argsort(y_prob)]
    nfalse = 0
    auc = 0
    n = len(y_true)
    for i in range(n):
        y_i = y_true[i]
        nfalse += (1 - y_i)
        auc += y_i * nfalse
    auc /= (nfalse * (n - nfalse))
    return auc

In [3]:
df_id = pd.read_csv('./data/train_identity.csv')
df_trans = pd.read_csv('./data/train_transaction.csv')

In [4]:
test_id = pd.read_csv('./data/test_identity.csv')
test_trans = pd.read_csv('./data/test_transaction.csv')

In [5]:
df_raw = df_trans.join(df_id.set_index('TransactionID'), on='TransactionID', how='left')

In [6]:
df_test_raw = test_trans.join(test_id.set_index('TransactionID'), on='TransactionID', how='left')
ansTransID = df_test_raw['TransactionID']

In [7]:
## na value
nasum = df_raw.isna().sum()
nasum

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
id_36             449555
id_37             449555
id_38             449555
DeviceType        449730
DeviceInfo        471874
Length: 434, dtype: int64

In [8]:
## na value
df_test_raw.isna().sum()

TransactionID          0
TransactionDT          0
TransactionAmt         0
ProductCD              0
card1                  0
                   ...  
id_36             369714
id_37             369714
id_38             369714
DeviceType        369760
DeviceInfo        391634
Length: 433, dtype: int64

In [16]:
toKeepColname = ['P_emaildomain', 'R_emaildomain', 'addr1', 'addr2', 'DeviceType']
toDropColname = list(set(list(nasum[nasum>100000].index)) - set(toKeepColname))

In [17]:
df = df_raw.drop(columns=toDropColname)
df = df.drop(columns='TransactionID')
df_test = df_test_raw.drop(columns=toDropColname)
df_test = df_test.drop(columns='TransactionID')

In [18]:
print("# cols: ", len(df.columns))

# cols:  183


In [19]:
df

Unnamed: 0,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,card6,...,V313,V314,V315,V316,V317,V318,V319,V320,V321,DeviceType
0,0,86400,68.50,W,13926,,150.0,discover,142.0,credit,...,0.000000,0.000000,0.000000,0.0,117.0,0.0,0.000000,0.000000,0.000000,
1,0,86401,29.00,W,2755,404.0,150.0,mastercard,102.0,credit,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,
2,0,86469,59.00,W,4663,490.0,150.0,visa,166.0,debit,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,
3,0,86499,50.00,W,18132,567.0,150.0,mastercard,117.0,debit,...,0.000000,0.000000,0.000000,50.0,1404.0,790.0,0.000000,0.000000,0.000000,
4,0,86506,50.00,H,4497,514.0,150.0,mastercard,102.0,credit,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,mobile
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
590535,0,15811047,49.00,W,6550,,150.0,visa,226.0,debit,...,47.950001,47.950001,47.950001,0.0,0.0,0.0,0.000000,0.000000,0.000000,
590536,0,15811049,39.50,W,10444,225.0,150.0,mastercard,224.0,debit,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,
590537,0,15811079,30.95,W,12037,595.0,150.0,mastercard,224.0,debit,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.000000,
590538,0,15811088,117.00,W,7826,481.0,150.0,mastercard,224.0,debit,...,317.500000,669.500000,317.500000,0.0,2234.0,0.0,0.000000,0.000000,0.000000,


In [21]:
df['DeviceType'].unique()

array([nan, 'mobile', 'desktop'], dtype=object)

In [22]:
# cat_col = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'id_12', 'id_15', 'id_28', 'id_29', 'id_31', 'id_35', 'id_36', 'id_37', 'id_38', 'DeviceType', 'DeviceInfo']
cat_col = ['ProductCD', 'card4', 'card6', 'P_emaildomain', 'R_emaildomain', 'DeviceType']

In [23]:
for cat in cat_col:
    print(cat)
    df = pd.concat([df, pd.get_dummies(df[cat], prefix=cat,dummy_na=True)],axis=1).drop([cat],axis=1)

ProductCD
card4
card6
P_emaildomain
R_emaildomain
DeviceType


In [24]:
for cat in cat_col:
    print(cat)
    df_test = pd.concat([df_test, pd.get_dummies(df_test[cat], prefix=cat,dummy_na=True)],axis=1).drop([cat],axis=1)

ProductCD
card4
card6
P_emaildomain
R_emaildomain
DeviceType


In [25]:
import datetime
START_DATE = '2018-02-16'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
df['Date'] =df['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
df['day_of_week'] = df['Date'].dt.dayofweek
df['hour_a_day'] = df['Date'].dt.hour
df['day_of_month'] = df['Date'].dt.day
df = df.drop(columns='Date')
df = df.drop(columns='TransactionDT')
df = df.fillna(df.median())

In [26]:
import datetime
START_DATE = '2018-02-16'
startdate = datetime.datetime.strptime(START_DATE, "%Y-%m-%d")
df_test['Date'] = df_test['TransactionDT'].apply(lambda x: (startdate + datetime.timedelta(seconds=x)))
df_test['day_of_week'] = df_test['Date'].dt.dayofweek
df_test['hour_a_day'] = df_test['Date'].dt.hour
df_test['day_of_month'] = df_test['Date'].dt.day
df_test = df_test.drop(columns='Date')
df_test = df_test.drop(columns='TransactionDT')
df_test = df_test.fillna(df_test.median())

In [27]:
import numpy as np

In [28]:
labels = np.array(df['isFraud'])
df= df.drop('isFraud', axis = 1)
feature_list = list(df.columns)
features = np.array(df)
real_test_features = np.array(df_test)

In [29]:
from sklearn.model_selection import train_test_split
## Split the data into training and testing sets
train_features, test_features, train_labels, test_labels = train_test_split(features, labels, test_size = 0.1)

In [30]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=10, n_jobs=4, random_state=4)
# Train the model on training data
rf.fit(train_features, train_labels);
predictions_1 = rf.predict(test_features)

In [31]:
print('accuracy = ', sum(predictions_1 == test_labels) / len(predictions_1))

accuracy =  0.9791885393030108


In [32]:
fast_auc(test_labels, predictions_1)

0.716167946156273

In [33]:
# predictions = rf.predict(test_features)
rf.fit(features, labels);
predictions = rf.predict(real_test_features)

In [None]:
# predictions

In [34]:
# print('accuracy = ', sum(predictions == test_labels) / len(predictions))

In [35]:
import collections

In [36]:
collections.Counter(labels)

Counter({0: 569877, 1: 20663})

In [37]:
collections.Counter(predictions)

Counter({0: 500102, 1: 6589})

In [38]:
ans = rf.predict(real_test_features)

In [39]:
z=pd.DataFrame({'TransactionID': ansTransID, 'isFraud': ans})

In [40]:
z.to_csv('./submission0926.csv')

In [41]:
len(z[z['isFraud']>0.5])

6589

In [42]:
z

Unnamed: 0,TransactionID,isFraud
0,3663549,0
1,3663550,0
2,3663551,0
3,3663552,0
4,3663553,0
...,...,...
506686,4170235,0
506687,4170236,0
506688,4170237,0
506689,4170238,0


In [43]:
df.describe()

Unnamed: 0,TransactionAmt,card1,card2,card3,card5,addr1,addr2,C1,C2,C3,...,R_emaildomain_yahoo.es,R_emaildomain_yahoo.fr,R_emaildomain_ymail.com,R_emaildomain_nan,DeviceType_desktop,DeviceType_mobile,DeviceType_nan,day_of_week,hour_a_day,day_of_month
count,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,...,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0,590540.0
mean,135.027176,9898.734658,362.531959,153.186458,199.471611,291.653527,86.822813,14.092458,15.269734,0.005644,...,9.7e-05,0.000232,0.000351,0.767516,0.144215,0.094227,0.761557,3.095731,13.861923,15.581514
std,239.162522,4901.170153,156.595356,11.322604,41.15761,95.949345,2.5373,133.569018,154.668899,0.150536,...,0.009824,0.015229,0.018719,0.422416,0.351308,0.292145,0.426132,1.952824,7.607152,8.579154
min,0.251,1000.0,100.0,100.0,100.0,100.0,10.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
25%,43.321,6019.0,215.0,150.0,166.0,205.0,87.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,6.0,8.0
50%,68.769,9678.0,361.0,150.0,226.0,299.0,87.0,1.0,1.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,3.0,16.0,16.0
75%,125.0,14184.0,512.0,150.0,226.0,327.0,87.0,3.0,3.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,1.0,5.0,20.0,23.0
max,31937.391,18396.0,600.0,231.0,237.0,540.0,102.0,4685.0,5691.0,26.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,6.0,23.0,31.0


In [44]:
collections.Counter(ans)

Counter({0: 500102, 1: 6589})