In [1]:
# Importing the libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Importing the datset
dataset = pd.read_csv('creditcard.csv')
x = dataset.iloc[:,1:30].values
y = dataset.iloc[:,30].values

In [3]:
# Splitting the dataset into the Training set and Test set
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = 0)

In [4]:
# Feature Scaling
from sklearn.preprocessing import StandardScaler
sc_x = StandardScaler()
x_train = sc_x.fit_transform(x_train)
x_test = sc_x.transform(x_test)

In [5]:
# PCA
from sklearn.decomposition import PCA
pca = PCA(n_components = None)
x_train = pca.fit_transform(x_train)
x_test = pca.transform(x_test)
explained_variance = pca.explained_variance_ratio_

In [18]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0, max_depth = None,
                                    min_samples_split=(2), n_jobs = -1)
classifier.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix,accuracy_score
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56854,     7],
       [   19,    82]], dtype=int64)

### Grid Search

In [7]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0, max_depth = None,
                                    min_samples_split=(2))

from sklearn.model_selection import GridSearchCV
parameters = [{'n_estimators' : [10,100,120], 'criterion' : ['entropy'], 'max_features' : ['sqrt','log2']}]
grid_search = GridSearchCV(estimator = classifier, 
                           param_grid = parameters, 
                           scoring = 'accuracy', 
                           cv = 10, 
                           n_jobs = -1)

In [8]:
grid_search = grid_search.fit(x_train,y_train)

In [11]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_
best_parameters, best_accuracy

({'criterion': 'entropy', 'max_features': 'log2', 'n_estimators': 100},
 0.9995084381387569)

In [10]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0, max_depth = None,
                                    min_samples_split=(2), n_jobs = -1, max_features = 'log2')
classifier.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56855,     6],
       [   22,    79]], dtype=int64)

### Random Search

In [6]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', max_depth = None, min_samples_split=2)

from sklearn.model_selection import RandomizedSearchCV
parameters = [{'n_estimators' : [50,100,150], 'criterion' : ['entropy','gini'], 'max_features' : ['log2','sqrt'], 
              'class_weight' : ['balanced','balanced_subsample',None], 'max_samples' : [0.5,0.7,0.9,None]}]
random_search = RandomizedSearchCV(estimator = classifier,
                                  param_distributions = parameters,
                                  n_iter = 20,
                                  cv = 10,
                                  n_jobs = -1)

In [7]:
random_search.fit(x_train, y_train)

RandomizedSearchCV(cv=10, error_score=nan,
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    ccp_alpha=0.0,
                                                    class_weight=None,
                                                    criterion='entropy',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    max_samples=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
           

In [8]:
best_accuracy = random_search.best_score_
best_parameters = random_search.best_params_
best_parameters, best_accuracy

({'n_estimators': 100,
  'max_samples': 0.7,
  'max_features': 'sqrt',
  'criterion': 'gini',
  'class_weight': None},
 0.9995128269910719)

In [11]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'gini', random_state = 0, max_depth = None,
                                    min_samples_split=(2), n_jobs = -1, max_features = 'sqrt', max_samples = 0.7,
                                   class_weight = None)
classifier.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56855,     6],
       [   23,    78]], dtype=int64)

### Taking a step further, lets implement ExtraTreesClassifier

Instead of looking for the most discriminative thresholds, thresholds are drawn at random for each candidate feature and the best of the these randomly generated thresholds is chosen.

In [12]:
from sklearn.ensemble import ExtraTreesClassifier
classifier = ExtraTreesClassifier(n_estimators = 100, criterion = 'entropy', max_depth = None, 
                                  min_samples_split = 2, n_jobs = -1, max_features = 'sqrt')
classifier.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix
y_pred = classifier.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56856,     5],
       [   25,    76]], dtype=int64)

### ADA Boost

In [21]:
from sklearn.ensemble import ExtraTreesClassifier
classifier = ExtraTreesClassifier(n_estimators = 100, criterion = 'entropy', max_depth = None, 
                                  min_samples_split = 2, n_jobs = -1, max_features = 'sqrt')

from sklearn.ensemble import AdaBoostClassifier
ada_boost = AdaBoostClassifier(base_estimator = classifier, n_estimators = 100, algorithm = 'SAMME.R')
ada_boost.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix
y_pred = ada_boost.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56856,     5],
       [   22,    79]], dtype=int64)

In [22]:
from sklearn.ensemble import RandomForestClassifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0, max_depth = None,
                                    min_samples_split=(2), n_jobs = -1)

from sklearn.ensemble import AdaBoostClassifier
ada_boost = AdaBoostClassifier(classifier, n_estimators = 100, algorithm = 'SAMME.R')
ada_boost.fit(x_train, y_train)

from sklearn.metrics import confusion_matrix
y_pred = ada_boost.predict(x_test)
cm = confusion_matrix(y_test, y_pred)
cm

array([[56854,     7],
       [   22,    79]], dtype=int64)