In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pandas_profiling
%matplotlib inline
pd.pandas.set_option('display.max_columns', None)

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score, log_loss, auc, f1_score, precision_score, recall_score, confusion_matrix
from math import sqrt
from sklearn.model_selection import cross_validate
import hyperopt
from hyperopt import STATUS_OK
from sklearn.model_selection import GridSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import scale, StandardScaler
from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


### Functions

In [2]:
def calculate_bounds (v):
    iqr = v.quantile (0.75) - v.quantile (0.25)
    
    return [v.quantile (0.25) - (3 * iqr), v.quantile (0.75) + (3 * iqr)]

### Import Data

In [3]:
# load dataset
data = pd.read_csv("C:/Users/A228744/Documents/Kaggle/Data/creditcard.csv")

# rows and columns of the data
print(data.shape)

# visualise the dataset
data.head()

(284807, 31)


Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [4]:
# Removing duplicated values of class 0
data = data[~((data.duplicated())&(data['Class'] == 0))]

# Filter for Amount less than 2500
data = data[(data['Amount'] < 2500)]

### Logic: Remove non-informative outliers using the 6 std dev
##### Non-informative we've defined as a domain of variables that does not contain examples of fraud. Variables outside the range will be removed, unless it contains fraud examples. In that case, the cut-off is moved to include the largest observed instance of fraud

In [5]:
data_fraud = data [data.Class == 1]

In [6]:
ranges = []

for i in range (1,29):
    bounds_data = calculate_bounds (data ['V' + str (i)])
    bounds_fraud = [min (data_fraud ['V' + str (i)]), max (data_fraud ['V' + str (i)])]
    
    lb = min (bounds_data [0], bounds_fraud [0])
    ub = max (bounds_data [1], bounds_fraud [1])
    ranges.append ([lb, ub])

In [7]:
data.shape

(283296, 31)

In [8]:
for i in range (1,29):
    data = data [data ['V'+ str (i)].apply (lambda x : ((x > ranges [i - 1] [0]) & (x < ranges [i - 1] [1])))]

In [9]:
data.shape

(279071, 31)

### Train Test Split

In [10]:
X = data.iloc[:,:-1]
y = data['Class']#.astype(int) # Target variable


# Splitting data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y, 
                                                    test_size=0.25,
                                                    random_state = 1)

### SMOTE Sampling

In [11]:
sm = SMOTE(random_state=42,sampling_strategy = 0.01)

In [12]:
X_train, y_train = sm.fit_resample(X_train, y_train)

### Standardize

In [13]:
# standarisation: we use the StandardScaler from sklearn
scaler = StandardScaler() # create an object
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train),columns = X_train.columns) # fit the scaler to the train set, and then transform it
X_test_scaled = pd.DataFrame(scaler.transform(X_test),columns = X_test.columns)  # transform the test set

### Variable Importance

In [14]:
# feature selection
rf = RandomForestClassifier (n_estimators = 100, random_state = 42)
model = rf.fit (X_train_scaled, y_train)

In [15]:
variable_importance = (
    pd.DataFrame (
        [X_train_scaled.columns, model.feature_importances_])
    .transpose ()
    .sort_values (by = 1, ascending = False))

In [16]:
variable_importance.columns = ['Variable','Importance']

In [17]:
variable_importance

Unnamed: 0,Variable,Importance
14,V14,0.222518
17,V17,0.18231
10,V10,0.124073
12,V12,0.108901
16,V16,0.0599779
11,V11,0.0562112
3,V3,0.0442796
4,V4,0.0331512
9,V9,0.0265468
18,V18,0.0161197


In [43]:
selected_features = variable_importance.Variable.values

### Pipeline

In [44]:
# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])

In [49]:
# Create dictionary with candidate learning algorithms and their hyperparameters
search_space = [#{"classifier": [LogisticRegression()],
                 #"classifier__penalty": ['l1','l2'],
                 #"classifier__C": np.logspace(0, 4, 10),
                 #"classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 #},
#                 {"classifier": [RandomForestClassifier()],
#                  "classifier__n_estimators": [100, 500],
#                  "classifier__max_depth":[5,30,None],
#                  "classifier__min_samples_leaf":[5,50,100,None],
#                  "classifier__max_leaf_nodes": [10,15,25,None]
#                },
                {"classifier": [AdaBoostClassifier()],
                 "classifier__n_estimators": [2000],
                 "classifier__learning_rate":[0.1,1],
               },
#                {"classifier": [SVC()],
#                 "classifier__C": [0.1, 0.25,0.5,1]
#                }
]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, search_space, cv=2, scoring='f1', verbose=0,n_jobs=-1) # Fit grid searchb

#### All Features

In [50]:
# 1000 Estimators F1 metric
best_model_af = gridsearch.fit(X_train_scaled[selected_features], y_train)

In [51]:
# 1000 Estimators F1 metric
print(best_model_af_2.best_estimator_)
print("The mean accuracy of the model is:",best_model_af.score(X_test_scaled[selected_features], y_test))
print("model precision: %.3f" % precision_score(y_test,best_model_af.predict(X_test_scaled[selected_features])))
print("model recall: %.3f" % recall_score(y_test,best_model_af.predict(X_test_scaled[selected_features])))

Pipeline(memory=None,
         steps=[('classifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1, n_estimators=1000,
                                    random_state=None))],
         verbose=False)
The mean accuracy of the model is: 0.867579908675799
model precision: 0.913
model recall: 0.826


In [52]:
# 1000 Estimators F1 metric
confusion_matrix(y_test,best_model_af.predict(X_test_scaled[selected_features]))

array([[69644,     9],
       [   20,    95]], dtype=int64)

In [46]:
# 2000 Estimators F1 metric
best_model_af_2 = gridsearch.fit(X_train_scaled[selected_features], y_train)

In [47]:
# 2000 Estimators F1 metric
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model_af_2.score(X_test_scaled[selected_features], y_test))
print("model precision: %.3f" % precision_score(y_test,best_model_af_2.predict(X_test_scaled[selected_features])))
print("model recall: %.3f" % recall_score(y_test,best_model_af_2.predict(X_test_scaled[selected_features])))

Pipeline(memory=None,
         steps=[('classifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1, n_estimators=2000,
                                    random_state=None))],
         verbose=False)
The mean accuracy of the model is: 0.8663594470046083
model precision: 0.922
model recall: 0.817


In [48]:
# 1000 Estimators F1 metric
confusion_matrix(y_test,best_model_af_2.predict(X_test_scaled[selected_features]))

array([[69645,     8],
       [   21,    94]], dtype=int64)

#### Top 10 Features

In [22]:
# 2000 Estimators F1 metric
best_model = gridsearch.fit(X_train_scaled[selected_features], y_train)

In [26]:
# 1000 Estimators F1 metric
best_model_2 = gridsearch.fit(X_train_scaled[selected_features], y_train)

In [23]:
# 2000 Estimators F1 metric
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test_scaled[selected_features], y_test))
print("model precision: %.3f" % precision_score(y_test,best_model.predict(X_test_scaled[selected_features])))
print("model recall: %.3f" % recall_score(y_test,best_model.predict(X_test_scaled[selected_features])))

Pipeline(memory=None,
         steps=[('classifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1, n_estimators=2000,
                                    random_state=None))],
         verbose=False)
The mean accuracy of the model is: 0.8189655172413793
model precision: 0.812
model recall: 0.826


In [24]:
# 2000 Estimators F1 metric
confusion_matrix(y_test,best_model.predict(X_test_scaled[selected_features]))

array([[69631,    22],
       [   20,    95]], dtype=int64)

In [41]:
# 1000 Estimators F1 metric
print(best_model_2.best_estimator_)
print("The mean accuracy of the model is:",best_model_2.score(X_test_scaled[selected_features], y_test))
print("model precision: %.3f" % precision_score(y_test,best_model_2.predict(X_test_scaled[selected_features])))
print("model recall: %.3f" % recall_score(y_test,best_model_2.predict(X_test_scaled[selected_features])))

Pipeline(memory=None,
         steps=[('classifier',
                 AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
                                    learning_rate=1, n_estimators=1000,
                                    random_state=None))],
         verbose=False)
The mean accuracy of the model is: 0.8154506437768241
model precision: 0.805
model recall: 0.826


In [42]:
# 1000 Estimators F1 metric
confusion_matrix(y_test,best_model_2.predict(X_test_scaled[selected_features]))

array([[69630,    23],
       [   20,    95]], dtype=int64)

### Save model in Model folder

In [53]:
import pickle

In [54]:
# save the model to disk
filename = 'credit_fraud_adaboost.sav'
pickle.dump(best_model_af_2, open(filename, 'wb'))