In [29]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import time

from sklearn import ensemble
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error as mse
%matplotlib inline

The [datasets](https://www.kaggle.com/mlg-ulb/creditcardfraud) contains transactions made by credit cards in September 2013 by european cardholders. This dataset presents transactions that occurred in two days, where we have 492 frauds out of 284,807 transactions. The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions.

It contains only numerical input variables which are the result of a PCA transformation. Unfortunately, due to confidentiality issues, we cannot provide the original features and more background information about the data. Features V1, V2, ... V28 are the principal components obtained with PCA, the only features which have not been transformed with PCA are 'Time' and 'Amount'. Feature 'Time' contains the seconds elapsed between each transaction and the first transaction in the dataset. The feature 'Amount' is the transaction Amount, this feature can be used for example-dependant cost-senstive learning. Feature 'Class' is the response variable and it takes value 1 in case of fraud and 0 otherwise.

In [4]:
ccf = pd.read_csv('creditcard.csv')
print(ccf.head())

   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...         V21       V22       V23       V24  \
0  0.098698  0.363787  ...   -0.018307  0.277838 -0.110474  0.066928   
1  0.085102 -0.255425  ...   -0.225775 -0.638672  0.101288 -0.339846   
2  0.247676 -1.514654  ...    0.247998  0.771679  0.909412 -0.689281   
3  0.377436 -1.387024  ...   -0.108300  0.005274 -0.190321 -1.175575   
4 -0.270533  0.817739  ...   -0.009431  0.798278 -0.137458  0.141267   

        V25       V26       V27       V28  Amount  Class  
0  0.128539 -0.189115

In [7]:
# Declare predictors.
X_statsmod = ccf.drop('Class', axis=1)

# The Statsmodels formulation requires a column with constant value 1 that
# will act as the intercept.
X_statsmod['intercept'] = 1 

# Declare and fit the model.
logit = sm.Logit(ccf['Class'], X_statsmod)
result = logit.fit()

# Lots of information about the model and its coefficients, but the
# accuracy rate for predictions is missing.
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.003914
         Iterations 13
                           Logit Regression Results                           
Dep. Variable:                  Class   No. Observations:               284807
Model:                          Logit   Df Residuals:                   284776
Method:                           MLE   Df Model:                           30
Date:                Tue, 16 Apr 2019   Pseudo R-squ.:                  0.6922
Time:                        07:44:58   Log-Likelihood:                -1114.8
converged:                       True   LL-Null:                       -3621.2
                                        LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Time       -3.742e-06   2.26e-06     -1.659      0.097   -8.16e-06    6.79e-07
V1             0.0960      0

In [9]:
# Calculate accuracy. First, get probability that each row will be admitted.
pred_statsmod = result.predict(X_statsmod)

# Code fraud as 1 if probability is greater than .5.
pred_y_statsmod = np.where(pred_statsmod < .5, 0, 1)

# Accuracy table.
table = pd.crosstab(ccf['Class'], pred_y_statsmod)

print('\n Accuracy by admission status')
print(table)
print('\n Percentage accuracy')
print((table.iloc[0,0] + table.iloc[1,1]) / (table.sum().sum()))


 Accuracy by admission status
col_0       0    1
Class             
0      284273   42
1         184  308

 Percentage accuracy
0.9992064801778047


# Logistic Classification

In [10]:
# Declare a logistic regression classifier.
# Parameter regularization coefficient C described above.
lr = LogisticRegression(C=1e9)
y = ccf['Class']
X = ccf.drop('Class', axis=1)

# Fit the model.
fit = lr.fit(X, y)

# Display.
print('Coefficients')
print(fit.coef_)
print(fit.intercept_)
pred_y_sklearn = lr.predict(X)

print('\n Accuracy by admission status')
print(pd.crosstab(pred_y_sklearn, y))

print('\n Percentage accuracy')
print(lr.score(X, y))

Coefficients
[[-7.12203044e-05  3.19003328e-01 -4.84128854e-01 -7.93512062e-01
   1.20293469e-01  5.74908256e-02 -5.40509209e-02  3.35311561e-01
  -3.74352670e-01 -3.88608714e-01 -2.07048199e-01 -2.86745969e-01
   1.86468648e-02 -3.06675484e-01 -6.94620990e-01 -4.27801926e-01
  -2.94741919e-01 -4.39987901e-01  3.10692839e-02  2.65185344e-02
   9.20026632e-02  2.48888741e-01  3.51032098e-01  6.77179768e-02
  -2.44441841e-02 -3.56187711e-01  6.07212397e-02 -8.88577214e-02
   2.77997019e-02 -5.58259143e-03]]
[-1.62885926]

 Accuracy by admission status
Class       0    1
row_0             
0      284240  203
1          75  289

 Percentage accuracy
0.9990239003957065


In [18]:
# Display results
print('Accuracy: ',round(lr.score(X, y),6))

X_train , X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=1)
trained = lr.fit(X_train, y_train)
print('\nTraining Score: ', round(trained.score(X_train, y_train),6))
print('Testing Score: ', round(trained.score(X_test, y_test),6))

#cvs = cross_val_score(lr, X, y, cv=10)
#print('\nCross Validation Score: {} +/- {}'.format(round(cvs.mean(),6), round(cvs.std(),6)))

y_pred = lr.predict(X)
print('\nMSE: ', round(mse(y,y_pred),6))
print('rMSE: ',round(mse(y,y_pred)**.5,6))

Accuracy:  0.998982

Training Score:  0.999254
Testing Score:  0.999122

MSE:  0.000772
rMSE:  0.027793


In [23]:
predict_train = lr.predict(X_train)
predict_test = lr.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I(false positive) errors: {}\n'
    'Percent Type II(false negative) errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I(false positive) errors: {}\n'
    'Percent Type II(false negative) errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

Training set accuracy:
Percent Type I(false positive) errors: 0.00021944743136781584
Percent Type II(false negative) errors: 0.000526673835282758

Test set accuracy:
Percent Type I(false positive) errors: 0.00026333345037042236
Percent Type II(false negative) errors: 0.0006144447175309856


In [19]:
cvs = cross_val_score(lr, X, y, cv=10)
print('Cross Validation Score: {} +/- {}'.format(round(cvs.mean(),6), round(cvs.std(),6)))

Cross Validation Score: 0.997651 +/- 0.004507


# Gradient Boosting Classification

In [30]:
start_time = time.time()

# We'll make 3000 iterations, use 2-deep trees, and set our loss function.
params = {'n_estimators': 500,
          'max_depth': 4,
          'subsample' : 0.9,
          'loss': 'exponential',
          'max_leaf_nodes': 6}

# Initialize and fit the model.
clf = ensemble.GradientBoostingClassifier(**params)
clf.fit(X_train, y_train)

print("--- %s seconds ---" % (time.time() - start_time))

predict_train = clf.predict(X_train)
predict_test = clf.predict(X_test)

# Accuracy tables.
table_train = pd.crosstab(y_train, predict_train, margins=True)
table_test = pd.crosstab(y_test, predict_test, margins=True)

train_tI_errors = table_train.loc[0.0,1.0] / table_train.loc['All','All']
train_tII_errors = table_train.loc[1.0,0.0] / table_train.loc['All','All']

test_tI_errors = table_test.loc[0.0,1.0]/table_test.loc['All','All']
test_tII_errors = table_test.loc[1.0,0.0]/table_test.loc['All','All']

print((
    'Training set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}\n\n'
    'Test set accuracy:\n'
    'Percent Type I errors: {}\n'
    'Percent Type II errors: {}'
).format(train_tI_errors, train_tII_errors, test_tI_errors, test_tII_errors))

--- 800.7953999042511 seconds ---
Training set accuracy:
Percent Type I errors: 1.7555794509425268e-05
Percent Type II errors: 0.00027211481489609164

Test set accuracy:
Percent Type I errors: 0.00010533338014816895
Percent Type II errors: 0.00040377795723464763
