In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Location of the input CSV, relative to this notebook.
DATA_FILE = "creditcard.csv"
DATA_DIR = "../data/"

# Seed used by helpers that are called without an explicit random state.
DEFAULT_SEED = 123456

# Notebook-wide random state (fixed seed so that re-runs are reproducible).
rng = np.random.RandomState(seed=20180201)

In [3]:
# Load the raw table; `stats` holds per-column mean / std / skew of the
# values as read from disk, i.e. before any transformation below.
data = pd.read_csv(DATA_DIR+DATA_FILE)
stats = pd.DataFrame({'mean':data.mean(), 'std':data.std(), 'skew':data.skew()})

# normalize the data
# 1) Log-transform 'Amount' with a single vectorized call. The 1e-6 offset
#    keeps the logarithm finite for zero amounts (they map to log(1e-6)).
data['Amount'] = np.log(data['Amount'] + 1e-6)
# 2) Standardize every column from 'V1' through 'Amount' to zero mean and
#    unit (sample) standard deviation, one column at a time.
# NOTE(review): mean/std are computed over all rows, i.e. before the
# train/test/valid split done later in this notebook -- confirm this is intended.
data.loc[:,'V1':'Amount'] = data.loc[:,'V1':'Amount'].apply(lambda col: (col-col.mean())/col.std())

In [4]:
# Plot, for each feature column, the normalized histograms of the positive
# (Class == 1) and negative (Class == 0) rows and save one PNG per column.
# Disabled by default; set `plot = True` to regenerate the figures.
plot=False
if plot:
    pos_data = data[data["Class"]==1]
    neg_data = data[data["Class"]==0]
    # Skip the first and the last column; every column in between gets a figure.
    for key in data.keys()[1:-1]:
        fig = plt.figure(figsize=(4,3))
        # `density=True` replaces `normed=True`, which was removed in Matplotlib 3.1.
        plt.hist(pos_data[key], histtype="step", density=True, bins=50, range=(-1,1), label="pos")
        plt.hist(neg_data[key], histtype="step", density=True, bins=50, range=(-1,1), label="neg")
        plt.xlim(-1,1)
        plt.title(key)
        plt.legend()
        fig.savefig(key+"_Scaled.png")

In [5]:
def _stack_and_shuffle(chunk, times, acc, rng):
    '''Prepend `times` copies of `chunk` to `acc`, then shuffle and re-index the result.'''
    frames = [chunk] * times
    if len(acc):
        # Only non-empty accumulators take part in the concat, so an empty
        # placeholder never influences the resulting dtypes.
        frames.append(acc)
    if not frames:
        return acc
    return pd.concat(frames).sample(frac=1, random_state=rng).reset_index(drop=True)


def split_DataFrame(data, label, frac, rng=None, oversample=None):
    '''
    Split the data into two parts while keeping the same class proportion in both,
    optionally using simple oversampling to rebalance an imbalanced dataset.

    Every class found in `data[label]` is split independently: each row goes to
    the first part with probability `frac`, otherwise to the second part.
    When `oversample` is given, the rows of each class are repeated
    ceil(max_class_count / class_count * oversample) times in BOTH parts.

    Input:
        data (pd.DataFrame): the input data
        label (string): name of the column holding the event class
        frac (float): the proportion of rows sent to the first part
        rng (np.RandomState): NumPy random state; a new one seeded with
            DEFAULT_SEED is created when omitted
        oversample (float): a constant to control how many times to duplicate
            (usually < 1); None disables the duplication

    Return:
        subdf1: the first part of the data (shuffled, index reset)
        subdf2: the second part of the data (shuffled, index reset)
        For an empty `data` both parts are empty frames with the columns of `data`.
    '''
    if rng is None:
        rng = np.random.RandomState(DEFAULT_SEED)

    count = data[label].value_counts()
    if oversample is None:
        repeats = [1] * len(count)
    else:
        repeats = np.ceil(count.max() / count * oversample).astype(int).tolist()

    # Start from empty frames that already carry the columns and dtypes of `data`.
    subdf1 = data.iloc[0:0].copy()
    subdf2 = data.iloc[0:0].copy()
    for val, amplify in zip(count.index, repeats):
        df = data[data[label] == val]
        mask = rng.rand(len(df)) < frac
        # The order of the random draws (mask, first part, second part) is kept
        # so that a given seed produces the same split as before.
        subdf1 = _stack_and_shuffle(df[mask], amplify, subdf1, rng)
        subdf2 = _stack_and_shuffle(df[~mask], amplify, subdf2, rng)
    return (subdf1, subdf2)

# train : test : valid = 0.6 : 0.2 : 0.2
# Step 1: split off the training part (frac=0.6) with oversampling; the rest
# is kept in `dump`.
train_df, dump = split_DataFrame(
    data,
    label='Class',
    frac=0.6,
    rng=rng,
    oversample=0.7,
)
# Step 2: halve `dump` into test and valid. No `rng` is passed here, so the
# function falls back to its default seed.
test_df, valid_df = split_DataFrame(
    dump,
    label='Class',
    frac=0.5,
)

In [6]:
# Convert the three splits to plain NumPy arrays and store them together in
# one compressed archive under the keys 'train', 'test' and 'valid'.
# `.values` is used because `DataFrame.as_matrix()` was removed in pandas 1.0.
train = train_df.values
test = test_df.values
valid = valid_df.values

np.savez_compressed(DATA_DIR+'ccdataset.npz', train=train, test=test, valid=valid)