In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# --- Reproducibility ---------------------------------------------------
# Fallback seed: split_DataFrame builds its own RandomState from this value
# whenever the caller does not hand it a generator.
DEFAULT_SEED = 123456
# Notebook-wide random generator with a fixed seed.
rng = np.random.RandomState(seed=20180201)

# --- Data location -----------------------------------------------------
# DATA_DIR keeps its trailing slash because paths are built by plain string
# concatenation (DATA_DIR + file name).
DATA_FILE = "creditcard.csv"
DATA_DIR = "../data/"

In [3]:
data = pd.read_csv(DATA_DIR+DATA_FILE)
# Per-column summary statistics of the raw values, captured before the
# rescaling below overwrites them.
stats = pd.DataFrame({'mean':data.mean(), 'std':data.std(), 'skew':data.skew()})

# normalize the data
# Log-transform Amount in one vectorized call; the 1e-6 offset maps a zero
# amount to a finite value instead of -inf.
data['Amount'] = np.log(data['Amount'] + 1e-6)
# Standardize every column from V1 through Amount (label slice, inclusive)
# to zero mean and unit standard deviation.
features = data.loc[:,'V1':'Amount']
data.loc[:,'V1':'Amount'] = (features - features.mean()) / features.std()

In [4]:
# Plot, for each feature column, the normalized histograms of the two classes
# and save one PNG per column. Disabled by default; set plot=True to run it.
plot=False
if plot:
    pos_data = data[data["Class"]==1]
    neg_data = data[data["Class"]==0]
    for key in data.keys()[1:-1]: # every column except the first and the last
        fig = plt.figure(figsize=(4,3))
        # `density=True` normalizes each histogram to unit area, so the two
        # classes can be compared despite their different sizes (it replaces
        # the removed `normed` keyword).
        plt.hist(pos_data[key], histtype="step", density=True, bins=50, range=(-1,1), label="pos")
        plt.hist(neg_data[key], histtype="step", density=True, bins=50, range=(-1,1), label="neg")
        plt.xlim(-1,1)
        plt.title(key)
        plt.legend()
        fig.savefig(key+"_Scaled.png")

In [5]:
def _stack_and_shuffle(part, copies, previous, rng):
    '''
    Put `copies` copies of `part` in front of `previous` (when there is one),
    shuffle the rows with `rng`, and renumber the index from 0.
    '''
    pieces = [part] * max(int(copies), 0)
    if previous is not None:
        pieces.append(previous)
    if not pieces:
        # Nothing to stack: keep the columns and dtypes of `part`, with zero rows.
        pieces = [part.iloc[0:0]]
    return pd.concat(pieces).sample(frac=1, random_state=rng).reset_index(drop=True)


def split_DataFrame(data, label, frac, rng=None, oversample=None):
    '''
    Split the data into two parts while keeping the class proportions of
    `label` (approximately) the same in both parts.
    Optionally apply simple oversampling (row duplication) to counter an
    imbalanced dataset.

    Each class is split independently: every row goes to the first part when
    a uniform random draw is below `frac`, otherwise to the second part.
    When `oversample` is given, the rows of each class are duplicated
    ceil(largest_class_size / class_size * oversample) times. The duplication
    is applied to BOTH returned parts, not only to the first one.

    Input:
        data (pd.DataFrame): the input data
        label (string): name of the column holding the event class
        frac (float): the proportion of rows sent to the first part
        rng (np.RandomState): NumPy random state; when None, a new one seeded
            with DEFAULT_SEED is created
        oversample (float): a constant to control how many times to duplicate
            (usually < 1); None disables duplication

    Return:
        subdf1: the first part of the data (rows shuffled, index reset)
        subdf2: the second part of the data (rows shuffled, index reset)
        Both parts keep the column order of `data`.
    '''
    if rng is None:
        rng = np.random.RandomState(DEFAULT_SEED)
    count = data[label].value_counts()
    if oversample is None:
        copies = np.ones(len(count), dtype=np.int64)
    else:
        copies = np.asarray(np.ceil(count.max() / count * oversample), dtype=np.int32)
    # Start without accumulators instead of with empty, column-less frames:
    # only real partitions are concatenated, so the column order and dtypes
    # of `data` are carried through unchanged.
    subdf1 = None
    subdf2 = None
    for val, amplify in zip(count.index, copies):
        df = data[data[label] == val]
        mask = rng.rand(len(df)) < frac
        # The order of random draws (mask, shuffle part 1, shuffle part 2) is
        # kept fixed so a given seed always yields the same split.
        subdf1 = _stack_and_shuffle(df[mask], amplify, subdf1, rng)
        subdf2 = _stack_and_shuffle(df[~mask], amplify, subdf2, rng)
    if subdf1 is None:
        # No class present at all: return two zero-row frames with the input's columns.
        subdf1 = data.iloc[0:0].reset_index(drop=True)
        subdf2 = data.iloc[0:0].reset_index(drop=True)
    return (subdf1, subdf2)

# Two-stage split, targeting train : test : valid = 0.6 : 0.2 : 0.2
#   stage 1: about 60% of each class goes to training, the rest is held out;
#   stage 2: the held-out part is halved into test and validation.
# NOTE(review): split_DataFrame duplicates rows in both of its outputs, so the
# held-out `dump` (and therefore test/valid) is oversampled as well -- confirm
# that this is intended.
train_df, dump = split_DataFrame(data, 'Class', 0.6, rng=rng, oversample=0.7)
# No generator is passed here, so this call falls back to split_DataFrame's default seed.
test_df, valid_df = split_DataFrame(dump, 'Class', frac=0.5)

In [8]:
# Convert each split to a plain NumPy array (columns in DataFrame order).
# `.values` is used instead of `DataFrame.as_matrix()`, which newer pandas
# releases no longer provide; the resulting array is the same.
train = train_df.values
test = test_df.values
valid = valid_df.values

# Bundle the three arrays into one compressed archive inside DATA_DIR.
np.savez_compressed(DATA_DIR+'ccdataset.npz', train=train, test=test, valid=valid)

In [9]:
# Preview only the first rows of the normalized frame instead of rendering the whole table.
data.head(10)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-0.694241,-0.044075,1.672771,0.973364,-0.245116,0.347067,0.193679,0.082637,0.331127,...,-0.024923,0.382854,-0.176911,0.110507,0.246585,-0.392170,0.330891,-0.063781,0.918935,0
1,0.0,0.608495,0.161176,0.109797,0.316522,0.043483,-0.061820,-0.063700,0.071253,-0.232494,...,-0.307376,-0.880075,0.162201,-0.561130,0.320693,0.261069,-0.022256,0.044607,-0.830589,0
2,1.0,-0.693499,-0.811576,1.169466,0.268231,-0.364571,1.351451,0.639775,0.207372,-1.378673,...,0.337631,1.063356,1.456317,-1.138090,-0.628536,-0.288446,-0.137137,-0.181021,1.323185,0
3,1.0,-0.493324,-0.112169,1.182514,-0.609726,-0.007469,0.936148,0.192070,0.316017,-1.262501,...,-0.147443,0.007267,-0.304776,-1.941024,1.241902,-0.460217,0.155396,0.186188,0.835408,0
4,2.0,-0.591329,0.531540,1.021410,0.284655,-0.295015,0.071998,0.479301,-0.226510,0.744325,...,-0.012839,1.100009,-0.220123,0.233250,-0.395201,1.041609,0.543619,0.651815,0.588171,0
5,2.0,-0.217474,0.581674,0.752584,-0.118833,0.305008,-0.022313,0.384935,0.217954,-0.517618,...,-0.283522,-0.771426,-0.042273,-0.613272,-0.446583,0.219637,0.628899,0.245636,-0.695343,0
6,4.0,0.627794,0.085389,0.029923,0.849382,0.139019,0.204694,-0.004170,0.067997,0.423217,...,-0.228333,-0.373032,-0.246779,-1.287970,1.439034,-0.533435,0.085492,0.015656,-0.561581,0
7,7.0,-0.328928,0.858691,0.708575,-0.347630,0.687510,0.321345,0.905858,-3.188224,0.560128,...,2.645884,-1.399273,0.092085,-1.072752,-0.796632,-0.107075,-2.990148,-3.288077,0.353219,0
8,7.0,-0.456572,0.173291,-0.074652,-0.191774,1.934146,2.793589,0.299205,0.712590,-0.356851,...,-0.099963,-0.369424,-0.327055,1.670266,0.715942,-0.796632,0.029104,0.431419,0.712856,0
9,9.0,-0.172697,0.678004,0.688780,-0.156926,0.361791,-0.185218,0.526705,0.058223,-0.670586,...,-0.336155,-0.873297,-0.193438,-0.635766,-0.133773,0.195341,0.610009,0.251681,-0.694159,0
