In [2]:
# Core numerical, data-handling and plotting libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# scikit-learn pieces: preprocessing, data splitting, models, metrics, model selection.
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection  import train_test_split
from sklearn import neighbors
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_val_score
import seaborn as sns
# Plot styling. NOTE(review): the second sns.set call also passes `style`, so the
# "white" style from the first call appears to be overridden by "whitegrid" -- confirm
# whether the first call is still needed.
sns.set(style="white")
sns.set(style="whitegrid", color_codes=True)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Seed passed as random_state to train_test_split further down.
SEED = 42

In [3]:
# Read the processed transaction file and display its (rows, columns) shape.
processed_csv_path = '../data/processed_balanced_transaction.csv'
df = pd.read_csv(processed_csv_path)
df.shape

(872136, 34)

#### Let's separate the label from the features, then scale the features


In [6]:
# Separate the target column from the predictor columns.
y = df['isFraud']
feature_frame = df.drop(columns=['isFraud'])

# Standardise the predictors with StandardScaler.
# NOTE(review): the scaler is fitted on all rows here, i.e. before the train/test
# split performed later in the notebook, so test-set statistics feed into the scaling.
standard_scaler = StandardScaler().fit(feature_frame)
X = standard_scaler.transform(feature_frame)

#### Split data into train and test sets
We split the data into train and test sets with a test ratio of 30%, i.e. 30% of the rows are used for testing and 70% for training.

In [7]:
# Hold out 30% of the rows as the test set; SEED is passed as random_state for the split.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=SEED,
)

#### Create an AdaBoost Classifier and fit the model and report accuracy

Let's create the AdaBoost model using scikit-learn. By default, AdaBoost uses a decision tree classifier (a depth-1 decision stump) as its base estimator.

In [8]:
from sklearn import metrics

# Display names for the two classes in the report
# (presumably isFraud is encoded as 0 = not fraud, 1 = fraud -- verify against the data).
target_names = ['Not Fraud', 'Fraud']

# Baseline AdaBoost model: 50 estimators, learning rate 1, fixed random_state.
# fit() returns the fitted estimator itself, so construction and training are chained.
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1, random_state=0).fit(X_train, y_train)

# Predict on the held-out test set and print per-class precision / recall / F1.
y_pred = abc.predict(X_test)
baseline_report = metrics.classification_report(
    y_test,
    y_pred,
    digits=3,
    target_names=target_names,
)
print(baseline_report)

              precision    recall  f1-score   support

   Not Fraud      0.696     0.673     0.684    130734
       Fraud      0.684     0.707     0.695    130907

    accuracy                          0.690    261641
   macro avg      0.690     0.690     0.690    261641
weighted avg      0.690     0.690     0.690    261641



#### Cross validation
We are going to apply k-fold cross-validation.

It splits the original data set into k subsets (folds), uses one fold as the test set and the remaining k-1 folds as the training set, and repeats this k times so that every fold is used as the test set exactly once. Since 10-fold cross-validation is the most popular choice, we are going to use k = 10.

In [None]:
# 10-fold cross-validation of the baseline model on the training split.
cv_scores = cross_val_score(abc, X_train, y_train, cv=10)

# Report the mean fold score rounded to three decimals.
mean_cv_score = round(cv_scores.mean(), 3)
print(f'Average score: {mean_cv_score}')


In [None]:
# Distribution of the per-fold cross-validation scores.
# sns.distplot is deprecated in seaborn; histplot with a KDE overlay and density
# normalisation is the replacement for the histogram + KDE figure.
fig, ax = plt.subplots()
sns.histplot(cv_scores, kde=True, stat='density', ax=ax)
ax.set(
    title='Average score: {}'.format(np.mean(cv_scores)),
    xlabel='Fold score',
    ylabel='Density',
);

#### Parameter Tuning

Most classification techniques have hyperparameters that can be tuned to improve performance.
For the AdaBoost classifier we can tune:

- base_estimator
- n_estimators
- learning_rate

Grid search evaluates every combination of the candidate parameter values using cross-validation and keeps the combination with the best mean score.
Let's use grid search to find the best parameters.




In [None]:
# Candidate values for the number of boosting rounds.
grid_params = {'n_estimators': [50, 100]}

# Exhaustive search over grid_params with 10-fold CV, using all CPU cores (n_jobs=-1).
# random_state is pinned to the same value as the baseline AdaBoost model fitted
# earlier in the notebook, so the search is reproducible across runs and the
# n_estimators=50 candidate uses the same configuration as that baseline.
# (GridSearchCV is already imported in the notebook's import cell.)
abc_grid_search = GridSearchCV(
    AdaBoostClassifier(random_state=0),
    grid_params,
    cv=10,
    n_jobs=-1,
)

# Fit the search. With GridSearchCV's default refit=True, the best parameter
# combination is refitted on the whole of X_train once the search finishes.
abc_grid_search.fit(X_train, y_train)
print('Best score: {}'.format(abc_grid_search.best_score_))
print('Best parameters: {}'.format(abc_grid_search.best_params_))

# Keep a handle on the refitted best model for evaluation on the test set.
best_abc_classifier = abc_grid_search.best_estimator_

In [None]:
# best_abc_classifier is GridSearchCV's best_estimator_, which the search has already
# refitted on X_train (refit=True is the default). It is therefore evaluated directly
# rather than paying for a second, redundant fit on the same data.
y_pred_abc = best_abc_classifier.predict(X_test)
print(metrics.classification_report(y_test, y_pred_abc, digits=3, target_names=target_names))