# Bank Marketing

## 1. Importing data to Python

In [1]:
import pandas as pd
import numpy as np

In [2]:
databank = pd.read_csv("bank.csv", sep = ";")

Dari data kita bisa lihat:
age : Numerical
job : Categorical
marital : Categorical
education : Categorical
default : Categorical
housing: Categorical
loan: Categorical
contact: Categorical
day: numerical
month: categorical
duration: numerical
campaign: numerical
pdays: numerical
previous: numerical
poutcome: categorical
y: output (desired target)

In [3]:
databank.shape

(4521, 17)

Drop data yang duplicate

In [4]:
databank = databank.drop_duplicates()
databank.shape

(4521, 17)

Input-Output Splitting

In [5]:
def extractInputOutput(data, output_column_name):
    data_output = data[output_column_name]
    data_input = data.drop(output_column_name, axis = 1)
    return data_input, data_output

x,y = extractInputOutput(databank, 'y')

In [6]:
x.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [7]:
y.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

# Train-Test Split

In [8]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x,y, test_size = 0.3, random_state=123)

In [9]:
categorical = ["job","marital","education","default","housing","loan","contact","month","poutcome"]
numerical = list(x_train.drop(categorical, axis = 1))

## Numerical imputation

In [68]:
from sklearn.preprocessing import Imputer
imput_numerical = Imputer(missing_values = 'NaN', strategy = 'median' )

In [10]:
x_train_num = x_train[numerical]

In [11]:
x_train_num.isnull().sum()

age         0
balance     0
day         0
duration    0
campaign    0
pdays       0
previous    0
dtype: int64

In [12]:
x_train_cat = x_train[categorical]

In [13]:
x_train_cat.isnull().sum()

job          0
marital      0
education    0
default      0
housing      0
loan         0
contact      0
month        0
poutcome     0
dtype: int64

In [14]:
x_train_cat.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,poutcome
3480,services,married,secondary,no,yes,no,unknown,jun,unknown
3344,blue-collar,single,primary,no,no,no,cellular,nov,unknown
1939,technician,married,secondary,no,yes,no,cellular,jun,unknown
834,unemployed,married,secondary,no,no,no,cellular,jul,unknown
3715,admin.,married,secondary,no,no,no,unknown,jun,unknown


Tidak perlu imputation, data sudah lengkap

## Preprocessing Categorical Variables

In [15]:
# Create dummy variables
def dummyVariables(data, categorical_columns):
    data_cat = data[categorical_columns]
    data_cat = pd.get_dummies(data = data_cat, prefix = categorical_columns)
    #prefix untuk memberikan nama depan di variable dummy sesuai dengan nama variable aslinya
    
    return data_cat

categorical_dummies = dummyVariables(data = x_train_cat, categorical_columns = categorical)

## Join Numerical and Categorcial

In [16]:
x_train = pd.concat([x_train_num, categorical_dummies], axis =1)

## Normalizing/Standarizing Numerical Variables

In [17]:
from sklearn.preprocessing import StandardScaler

def standardizer(data):
    data_columns = data.columns
    data_index = data.index
    normalize = StandardScaler()
    normalize.fit(data)
    
    normalized_data = pd. DataFrame(normalize.transform(data), index = data_index)
    normalized_data.columns = data_columns
    return normalized_data, normalize

x_train, normalize = standardizer(x_train)

In [18]:
x_train.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
3480,0.533437,-0.320732,-0.463258,-0.695508,-0.55916,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,2.773501,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
3344,-1.065032,-0.33537,0.632766,0.673694,-0.55916,-0.415574,-0.325031,-0.347554,1.980072,-0.198549,...,-0.360555,-0.104224,-0.660966,3.15406,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
1939,0.345382,-0.361393,1.728789,-0.646898,-0.55916,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,2.773501,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
834,0.81552,0.112553,1.607009,-0.229655,0.40044,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
3715,0.909548,0.501274,0.145644,-0.727916,-0.239293,-0.415574,-0.325031,2.877254,-0.505032,-0.198549,...,2.773501,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761


In [19]:
y_train.value_counts(normalize=True)

no     0.881163
yes    0.118837
Name: y, dtype: float64

## Training Machine Learning

### 1. K-Nearest Neighbors

In [20]:
#1. Import classifier, Randomized Search
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RandomizedSearchCV

In [21]:
def knn_fit(x_train, y_train, scoring = 'accuracy'):
    #scoring = 'accuracy' means in this case we use accuracy as score to find the best arameter
    
    knn = KNeighborsClassifier()
    #defining classifier
    
    hyperparam = {'n_neighbors' : [5, 7, 8, 9, 10, 12, 14]}
    random_knn = RandomizedSearchCV(knn, param_distributions = hyperparam, cv = 5,
                                    n_iter = 3, scoring = scoring, n_jobs=-1, random_state=123)
    
    random_knn.fit(x_train, y_train)
    
    print "Best Accuracy", random_knn.best_score_
    print "Best Param", random_knn.best_params_
    return random_knn

In [22]:
best_knn = knn_fit(x_train, y_train)

Best Accuracy 0.886852085967
Best Param {'n_neighbors': 9}


In [23]:
#define KNN dengan k hasil dari Randomized Search
knn = KNeighborsClassifier(n_neighbors= best_knn.best_params_.get('n_neighbors'))
#fitting KNN
knn.fit(x_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=9, p=2,
           weights='uniform')

In [24]:
from sklearn.model_selection import cross_val_score

In [25]:
knnCV = cross_val_score(knn, x_train, y_train, cv=5, n_jobs =-1)
print("Accuracy: %f (+/- %f) \nStandard Deviation: %f " % (knnCV.mean(), knnCV.std()*2,knnCV.std()))

Accuracy: 0.886857 (+/- 0.011192) 
Standard Deviation: 0.005596 


### 2. Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
def logreg_fit(x_train, y_train, scoring = 'accuracy'):
    logreg = LogisticRegression(random_state=123)
    
    hyperparam = {'C': [1000, 333.33, 100, 33.33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 
                        0.001, 0.00033, 0.0001]}
    random_logreg = RandomizedSearchCV(logreg, param_distributions = hyperparam, cv = 5,
                                    n_iter = 12, scoring = scoring, n_jobs=-1, random_state = 123)
    
    random_logreg.fit(x_train, y_train)
    
    print "Best Accuracy", random_logreg.best_score_
    print "Best Param", random_logreg.best_params_
    
    return random_logreg 

In [28]:
best_logreg = logreg_fit(x_train, y_train) 

Best Accuracy 0.897281921618
Best Param {'C': 10}


In [29]:
logreg = LogisticRegression(C = best_logreg.best_params_.get('C'), random_state = 123)
logreg.fit(x_train, y_train)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=123, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [30]:
logregCV = cross_val_score(logreg, x_train, y_train, cv = 5, n_jobs=-1)
print("Accuracy: %f (+/- %f) \nStandard Deviation: %f " % (logregCV.mean(), logregCV.std()*2,logregCV.std()))

Accuracy: 0.897292 (+/- 0.019862) 
Standard Deviation: 0.009931 


### 3. Linear SVC 

In [31]:
from sklearn.svm import LinearSVC

In [32]:
def linSVC_fit(x_train, y_train, scoring = 'accuracy'):
    linSVC = LinearSVC(random_state=123)

    hyperparam = {'C': [1000, 333.33, 100, 33.33, 10, 3.33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 
                        0.001, 0.00033, 0.0001]}

    random_linSVC = RandomizedSearchCV(linSVC, param_distributions = hyperparam, cv = 5,
                                    n_iter = 6, scoring = scoring, n_jobs=-1, random_state = 123, verbose = True)
    
    random_linSVC.fit(x_train, y_train)
    
    print "Best Accuracy", random_linSVC.best_score_
    print "Best Param", random_linSVC.best_params_
    
    return random_linSVC 

In [33]:
best_linSVC = linSVC_fit(x_train, y_train) 

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   11.9s finished


Best Accuracy 0.894437420986
Best Param {'C': 1}


In [34]:
linSVC = LinearSVC(C = best_linSVC.best_params_.get('C'), random_state = 123)
linSVC.fit(x_train, y_train)

LinearSVC(C=1, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=123, tol=0.0001,
     verbose=0)

In [35]:
linSVCCV = cross_val_score(linSVC, x_train, y_train, cv = 5, n_jobs=-1)
print("Accuracy: %f (+/- %f) \nStandard Deviation: %f " % (linSVCCV.mean(), linSVCCV.std()*2,linSVCCV.std()))

Accuracy: 0.894445 (+/- 0.016458) 
Standard Deviation: 0.008229 


### 4. RBF Kernel SVC

In [36]:
from sklearn.svm import SVC

In [37]:
def rbfSVC_fit(x_train, y_train, scoring = 'accuracy'):
    rbfSVC = SVC(kernel = 'rbf', random_state=123)

    hyperparam = {'C': [1000, 333.33, 100, 33.33, 10, 3.33, 10, 3.33, 1, 0.33, 0.1, 0.033, 0.01, 0.0033, 
                        0.001, 0.00033, 0.0001], 
                  'gamma' : [10, 3.33, 1, 0.33, 0.1, 0.033, 0.01]}

    random_rbfSVC = RandomizedSearchCV(rbfSVC, param_distributions = hyperparam, cv = 3,
                                    n_iter = 6, scoring = scoring, n_jobs=-1, random_state = 123)
    
    random_rbfSVC.fit(x_train, y_train)
    
    print "Best Accuracy", random_rbfSVC.best_score_
    print "Best Param", random_rbfSVC.best_params_
    
    return random_rbfSVC 

In [38]:
best_rbfSVC = rbfSVC_fit(x_train, y_train) 

Best Accuracy 0.881163084703
Best Param {'C': 0.0033, 'gamma': 10}


In [39]:
rbfSVC = SVC(kernel = 'rbf', C = best_rbfSVC.best_params_.get('C'), 
             gamma = best_rbfSVC.best_params_.get('gamma'), random_state = 123)
rbfSVC.fit(x_train, y_train)

SVC(C=0.0033, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=10, kernel='rbf',
  max_iter=-1, probability=False, random_state=123, shrinking=True,
  tol=0.001, verbose=False)

In [40]:
rbfSVCCV = cross_val_score(rbfSVC, x_train, y_train, cv = 5, n_jobs=-1)
print rbfSVCCV.mean(), rbfSVCCV.std()
print("Accuracy: %f (+/- %f) \nStandard Deviation: %f " % (rbfSVCCV.mean(), rbfSVCCV.std()*2,rbfSVCCV.std()))

0.881163517234 0.000525399731442
Accuracy: 0.881164 (+/- 0.001051) 
Standard Deviation: 0.000525 


### 5. Decision Tree 

In [41]:
from sklearn.tree import DecisionTreeClassifier

In [42]:
def decTree_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)

    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75]}

    random_decTree = RandomizedSearchCV(decTree, param_distributions = hyperparam, cv = 5,
                                        n_iter = 15, scoring = scoring, n_jobs=-1, random_state = 123)
    
    random_decTree.fit(x_train, y_train)
    
    print "Best Accuracy", random_decTree.best_score_
    print "Best Param", random_decTree.best_params_
    
    return random_decTree

In [43]:
best_decTree = decTree_fit(x_train, y_train)

Best Accuracy 0.887168141593
Best Param {'max_features': 0.5, 'min_samples_leaf': 33}


In [44]:
decTree = DecisionTreeClassifier(min_samples_leaf = best_decTree.best_params_.get('min_samples_leaf'),
                                 max_features = best_decTree.best_params_.get('max_features'), random_state=123)
decTree.fit(x_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=0.5, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=33,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best')

In [45]:
decTreeCV = cross_val_score(decTree, x_train, y_train, cv = 5, n_jobs=-1)
print decTreeCV.mean(), decTreeCV.std()

0.887179167156 0.0129918285071


### 6. Bootstrap Aggregating 

In [46]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, GradientBoostingClassifier

In [47]:
def bagging_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)
    
    bagging = BaggingClassifier(base_estimator = decTree, random_state=123, n_jobs=-1)
    
    hyperparam = {'base_estimator__min_samples_leaf': [1, 2, 3, 4, 5, 7, 9, 10, 11, 12, 13, 14, 15, 17, 21],
                  'n_estimators': [ 300, 500, 600, 750, 800, 850, 875, 900, 1000]}
    # 'base_estimator__' sebelum 'min_samples_leaf' menandakan hyperparameter yang dicari ada di dalam base estimatornya
    # dalam hal ini berarti decTree
    # (min_samples_leaf ada di dalam decTree)
    
    random_bagging = RandomizedSearchCV(bagging, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 10, scoring = scoring, n_jobs=-1, random_state = 123)
    random_bagging.fit(x_train, y_train)
    
    print "Best Accuracy", random_bagging.best_score_
    print "Best Param", random_bagging.best_params_
    return random_bagging

In [48]:
best_bagging = bagging_fit(x_train, y_train)

Best Accuracy 0.893173198483
Best Param {'n_estimators': 900, 'base_estimator__min_samples_leaf': 12}


In [49]:
decTreeBag = DecisionTreeClassifier(min_samples_leaf = best_bagging.best_params_.get('base_estimator__min_samples_leaf'),
                                    random_state=123)
bagging = BaggingClassifier(base_estimator = decTreeBag, 
                            n_estimators = best_bagging.best_params_.get('n_estimators'),
                            random_state=123, n_jobs=-1)
bagging.fit(x_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=12,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=900, n_jobs=-1, oob_score=False,
         random_state=123, verbose=0, warm_start=False)

In [50]:
baggingCV = cross_val_score(bagging, x_train, y_train, cv=5, n_jobs = -1)
print baggingCV.mean(), baggingCV.std()

0.893179347636 0.0119640548158


### 7. Random Forest

In [51]:
def randomForest_fit(x_train, y_train, scoring = 'accuracy'):
    randomForest = RandomForestClassifier(random_state=123)

    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75], 
                  'n_estimators': [100, 200, 300, 500, 750, 1000]}
    
    random_randomForest = RandomizedSearchCV(randomForest, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 6, scoring = scoring, n_jobs=-1, random_state = 123)
    random_randomForest.fit(x_train, y_train)
    
    print "Best Accuracy", random_randomForest.best_score_
    print "Best Param", random_randomForest.best_params_
    return random_randomForest

In [52]:
best_randForest = randomForest_fit(x_train, y_train)

Best Accuracy 0.892225031606
Best Param {'n_estimators': 300, 'max_features': 0.75, 'min_samples_leaf': 17}


In [53]:
randForest = RandomForestClassifier(random_state=123, n_jobs = -1,
                                   min_samples_leaf = best_randForest.best_params_.get('min_samples_leaf'),
                                   max_features = best_randForest.best_params_.get('max_features'),
                                   n_estimators = best_randForest.best_params_.get('n_estimators'))
randForest.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=0.75, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=17,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=300, n_jobs=-1, oob_score=False, random_state=123,
            verbose=0, warm_start=False)

In [54]:
randForestCV = cross_val_score(randForest, x_train, y_train, cv=5, n_jobs = -1)
print randForestCV.mean(), randForestCV.std()

0.892230980407 0.0117045406636


### 8. Adaptive Boosting 

In [55]:
from sklearn.ensemble import AdaBoostClassifier

In [56]:
def adaboost_fit(x_train, y_train, scoring = 'accuracy'):
    decTree = DecisionTreeClassifier(random_state=123)
    
    adaboost = AdaBoostClassifier(base_estimator = decTree, random_state=123)
    
    hyperparam = {'base_estimator__min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'base_estimator__max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75],
                  'learning_rate': [0.01, 0.015, 0.02, 0.05, 0.08, 0.1],
                  'n_estimators': [100, 200, 300, 500, 750, 1000]}
    
    random_adaboost = RandomizedSearchCV(adaboost, param_distributions = hyperparam, cv = 5, 
                                           n_iter = 6, scoring = scoring, n_jobs=-1, random_state = 123)
    random_adaboost.fit(x_train, y_train)
    
    print "Best Accuracy", random_adaboost.best_score_
    print "Best Param", random_adaboost.best_params_
    return random_adaboost

In [57]:
best_adaboost = adaboost_fit(x_train, y_train)

Best Accuracy 0.889696586599
Best Param {'n_estimators': 100, 'learning_rate': 0.015, 'base_estimator__min_samples_leaf': 9, 'base_estimator__max_features': 0.25}


In [58]:
decTreeBoost = DecisionTreeClassifier(min_samples_leaf = best_adaboost.best_params_.get('base_estimator__min_samples_leaf'),
                                      max_features = best_adaboost.best_params_.get('base_estimator__max_features'),
                                      random_state=123)
adaboost = AdaBoostClassifier(base_estimator = decTreeBoost, 
                             n_estimators = best_adaboost.best_params_.get('n_estimators'),
                             learning_rate = best_adaboost.best_params_.get('learning_rate'),
                             random_state=123)
adaboost.fit(x_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=0.25, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=9,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=123, splitter='best'),
          learning_rate=0.015, n_estimators=100, random_state=123)

In [59]:
adaboostCV = cross_val_score(adaboost, x_train, y_train, cv=5, n_jobs = -1)
print adaboostCV.mean(), adaboostCV.std()

0.889697824008 0.00685714772795


### 9. Gradient Boosting 

In [60]:
def gradBoost_fit(x_train, y_train, scoring = 'accuracy'):
    gradBoost = GradientBoostingClassifier(random_state=123)
    
    hyperparam = {'min_samples_leaf': [3, 5, 7, 9, 13, 17, 21, 27, 33, 41, 50, 60, 80, 100],
                  'max_features': ['sqrt', 'log2', 0.25, 0.5, 0.75], 
                  'n_estimators': [100, 200, 300, 500, 750, 1000],
                  'learning_rate': [0.01, 0.015, 0.02, 0.05, 0.08, 0.1] }
    random_gradBoost = RandomizedSearchCV(gradBoost, param_distributions = hyperparam, cv = 5,
                                          n_iter = 6, scoring = scoring, n_jobs=-1, 
                                          random_state = 123, verbose = True)
    random_gradBoost.fit(x_train, y_train)
    
    print "Best Accuracy", random_gradBoost.score(x_train, y_train)
    print "Best Param", random_gradBoost.best_params_
    return random_gradBoost

In [61]:
best_gradBoost = gradBoost_fit(x_train, y_train, scoring = "accuracy")

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.1s finished


Best Accuracy 0.910240202276
Best Param {'n_estimators': 100, 'max_features': 'sqrt', 'learning_rate': 0.1, 'min_samples_leaf': 41}


In [62]:
gradBoost = GradientBoostingClassifier(n_estimators=best_gradBoost.best_params_.get('n_estimators'),
                                       min_samples_leaf = best_gradBoost.best_params_.get('min_samples_leaf'),
                                       max_features = best_gradBoost.best_params_.get('max_features'),
                                       learning_rate = best_gradBoost.best_params_.get('learning_rate'), random_state=123)
gradBoost.fit(x_train, y_train)

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_split=1e-07, min_samples_leaf=41,
              min_samples_split=2, min_weight_fraction_leaf=0.0,
              n_estimators=100, presort='auto', random_state=123,
              subsample=1.0, verbose=0, warm_start=False)

In [63]:
gradBoostCV = cross_val_score(gradBoost, x_train, y_train, cv=5, n_jobs = -1)
print gradBoostCV.mean(), gradBoostCV.std()

0.897607226156 0.0111092957201


## Save Model

In [64]:
from sklearn.externals import joblib

joblib.dump(knn, 'model/knn.pkl')
joblib.dump(logreg, 'model/logreg.pkl')
joblib.dump(linSVC, 'model/linearSVC.pkl')
joblib.dump(rbfSVC, 'model/rbfSVC.pkl')
joblib.dump(decTree, 'model/decisionTree.pkl')
joblib.dump(bagging, 'model/bagging.pkl')
joblib.dump(randForest, 'model/randomForest.pkl')
joblib.dump(adaboost, 'model/adaboost.pkl')
joblib.dump(gradBoost, 'model/gradientBoosting.pkl')

['model/gradientBoosting.pkl']

## Test Model 

### Preprocessing Test Data  

Preprocessing Test Data dilakukan mirip dengan train data yang kita lakukan tadi

In [65]:
def testData(data, numerical_columns, categorical_columns, imput_numerical, standard):
    numerical_data = data[numerical_columns]
    categorical_data = data[categorical_columns]
    imput_numerical.fit(numerical_data)
    
    numerical_data = pd.DataFrame(imput_numerical.transform(numerical_data), index = data.index) # imput numerical test
    numerical_data.columns = numerical_columns
    categorical_data = categorical_data.fillna(value="KOSONG") # imput categorical
    categorical_data = pd.get_dummies(categorical_data) # Dummies categorical

    x_valid = pd.concat([ numerical_data, categorical_data], axis = 1)
    x_valid_transform = pd.DataFrame(standard.transform(x_valid), index = data.index) # standardization
    x_valid_transform.columns = x_valid.columns # samakan nama column
        
    return x_valid_transform

In [69]:
x_test = testData(  numerical_columns=numerical,
                        categorical_columns = categorical,
                        data = x_test,
                        imput_numerical = imput_numerical,
                        standard = normalize)

In [70]:
x_test.shape

(1357, 51)

In [71]:
x_test.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
2982,-0.312811,-0.086199,-0.95038,1.629706,-0.55916,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,1.512938,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
276,-0.594894,3.185562,-0.463258,1.767436,-0.239293,0.365881,0.257301,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,2.782145,-0.21436,-0.172084,-2.084369
4132,-0.406839,-0.118077,1.85057,0.60888,-0.239293,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
511,-0.876977,-0.197122,0.145644,1.925421,0.080573,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,-0.660966,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761
2061,-0.971005,-0.57153,0.876326,-0.241808,0.720306,-0.415574,-0.325031,-0.347554,-0.505032,-0.198549,...,-0.360555,-0.104224,1.512938,-0.317052,-0.139032,-0.111714,-0.359435,-0.21436,-0.172084,0.479761


In [72]:
def testPred(x_test, y_test, classifier, compute_score = True):
    if compute_score == True:
        score = classifier.score(x_test, y_test)
        print "Accuracy", score
        
    valid_proba = pd.DataFrame(classifier.predict_proba(x_test)) # hasil prediksi
    
    return  valid_proba, score

In [73]:
classifiers = [logreg, knn, linSVC, rbfSVC, decTree, bagging, randForest, adaboost, gradBoost]
label = ['LogReg', 'KNN', 'LinearSVC', 'RBF SVC', 'Decision Tree', 'Bagging', 'RandomForest', 
         'AdaBoost', 'GradientBoost']

for clf, i in zip(classifiers, label):
    print i,':  ', clf.score(x_test, y_test)

LogReg :   0.915254237288
KNN :   0.893146647015
LinearSVC :   0.911569638909
RBF SVC :   0.893146647015
Decision Tree :   0.898305084746
Bagging :   0.910095799558
RandomForest :   0.909358879882
AdaBoost :   0.898305084746
GradientBoost :   0.907885040531
