## Final Project Submission

Please fill out:
* Student name: Seyma Tas
* Student pace: full time
* Scheduled project review date/time: 
* Instructor name: Amber Yandow
* Blog post URL:


In [None]:
import seaborn as sns
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_style('darkgrid', {'axes.facecolor': '0.9'})

from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, ParameterGrid
from sklearn.linear_model import LogisticRegression, SGDClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from sklearn.metrics import recall_score, f1_score, fbeta_score, r2_score, roc_auc_score, roc_curve, auc, cohen_kappa_score
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
import xgboost as xgb
# from sklearn.metrics import plot_confusion_matrix

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('TelcoCustomerChurnData_cleaned_ohe.csv')

# Splitting the data into test and train sets

In [3]:
X = df.drop(['Churn'], axis=1)
y = df['Churn']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [5]:
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so nothing from the test data influences the transformation.
scaler = StandardScaler()
scaled_x_train = scaler.fit_transform(X_train)  # plain ndarray of the scaled training features
scaled_X_test = scaler.transform(X_test)        # left as an ndarray; later cells pass it straight to predict()

# Re-wrap the scaled training data as a DataFrame. Keeping X_train's index (not
# just its columns) keeps the rows label-aligned with y_train, whose index is
# the shuffled original one produced by train_test_split.
scaled_X_train = pd.DataFrame(scaled_x_train, columns=X_train.columns, index=X_train.index)


## Functions to evaluate the model

In [6]:
def roc_curve_and_auc(clf, X_train, X_test, y_train, y_test):
    """Fit ``clf``, plot the train and test ROC curves, and print both AUC values.

    The classifier is (re)fitted in place on ``X_train``/``y_train``. Scores are
    taken from ``decision_function`` when the estimator provides it; otherwise
    the positive-class column of ``predict_proba`` is used, so estimators that
    lack ``decision_function`` (e.g. random forests) can be plotted as well.

    Parameters
    ----------
    clf : estimator
        Binary classifier exposing ``fit`` and either ``decision_function`` or
        ``predict_proba``.
    X_train, X_test : array-like
        Feature matrices of the training and test splits.
    y_train, y_test : array-like
        Binary targets matching the feature matrices.

    Raises
    ------
    AttributeError
        If the estimator exposes neither ``decision_function`` nor
        ``predict_proba``.
    """

    def _continuous_scores(model, features):
        # ROC needs a ranking score, not hard labels.
        if hasattr(model, 'decision_function'):
            return model.decision_function(features)
        if hasattr(model, 'predict_proba'):
            # Column 1 is the probability of the positive class for a binary target.
            return model.predict_proba(features)[:, 1]
        raise AttributeError(
            '{} has neither decision_function nor predict_proba'.format(type(model).__name__))

    # Fit once, then score both splits with the same fitted model
    clf.fit(X_train, y_train)
    y_train_score = _continuous_scores(clf, X_train)
    y_test_score = _continuous_scores(clf, X_test)

    # False/true positive rates for each split (thresholds are not needed here)
    train_fpr, train_tpr, _ = roc_curve(y_train, y_train_score)
    test_fpr, test_tpr, _ = roc_curve(y_test, y_test_score)

    # ROC curves for both splits plus the chance diagonal
    plt.figure(figsize=(10, 8))
    lw = 2
    plt.plot(train_fpr, train_tpr, color='darkorange',
             lw=lw, label='Train ROC curve')
    plt.plot(test_fpr, test_tpr, color='blue',
             lw=lw, label='Test ROC curve')
    plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.yticks([i/20.0 for i in range(21)])
    plt.xticks([i/20.0 for i in range(21)])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic (ROC) Curve for Training and Testing Sets')
    plt.legend(loc='lower right')
    plt.show()

    # Print the area under each ROC curve
    print('Training AUC: {}'.format(round(auc(train_fpr, train_tpr), 5)))
    print('Testing AUC: {}'.format(round(auc(test_fpr, test_tpr), 5)))

In [7]:
def plot_feature_importances(model, feature_names=None):
    """Draw a horizontal bar chart of a fitted model's feature importances.

    Parameters
    ----------
    model : estimator
        A fitted estimator exposing ``feature_importances_``. A fitted search
        object (anything with ``best_estimator_``) or a pipeline (anything with
        ``steps``) is unwrapped to its final estimator first.
    feature_names : sequence of str, optional
        Labels for the bars. Defaults to the columns of the notebook-level
        ``X_train`` DataFrame.

    Raises
    ------
    AttributeError
        If the (unwrapped) estimator has no ``feature_importances_``.
    ValueError
        If the number of labels does not match the number of importances.
    """
    # A search object keeps the refit winner in best_estimator_
    estimator = getattr(model, 'best_estimator_', model)
    # For a pipeline the importances live on the last step
    if hasattr(estimator, 'steps'):
        estimator = estimator.steps[-1][1]
    if not hasattr(estimator, 'feature_importances_'):
        raise AttributeError(
            '{} does not expose feature_importances_'.format(type(estimator).__name__))

    importances = estimator.feature_importances_
    if feature_names is None:
        feature_names = X_train.columns.values
    n_features = len(importances)
    if len(feature_names) != n_features:
        raise ValueError('got {} feature names for {} importances'.format(
            len(feature_names), n_features))

    plt.figure(figsize=(8,8))
    plt.barh(range(n_features), importances, align='center')
    plt.yticks(np.arange(n_features), feature_names)
    plt.xlabel('Feature importance')
    plt.ylabel('Feature')

In [8]:
# # Plot feature importances -- TODO: add this to the model evaluation function
# imp = pd.Series(data=clf.feature_importances_, index=x.columns).sort_values(ascending=False)
# plt.figure(figsize=(10,12))
# plt.title("Feature importance")
# ax = sns.barplot(y=imp.index, x=imp.values, palette="Blues_d", orient='h')

In [None]:
def model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test,
                     estimator=None, cv=5):
    """Print a train/test evaluation report for a binary classifier.

    The report contains the confusion matrices, the classification reports,
    Cohen's kappa, the F2 score, the ROC AUC and a mean cross-validation score.

    Parameters
    ----------
    X_train : array-like
        Training features; used for the cross-validation score.
    X_test : array-like
        Test features. Not used by the report; kept so existing calls keep working.
    y_train, y_test : array-like
        True labels of the training and test splits.
    y_pred_train, y_pred_test : array-like
        Hard class predictions for the training and test splits.
    estimator : estimator, optional
        Model to cross-validate. Defaults to the notebook-level ``clf``. A
        fitted search object is replaced by its ``best_estimator_`` so the
        whole search is not re-run inside every fold.
    cv : int, default 5
        Number of cross-validation folds.

    Notes
    -----
    * The ROC AUC here is computed from hard predictions, not from
      probabilities, so it is coarser than the AUC of a full ROC curve.
    * The cross-validation score is computed on ``X_train``/``y_train`` as
      passed in. If that data was resampled before the call, the folds contain
      resampled rows and the score is optimistic.
    """
    print('MODEL EVALUATION METRICS:\n',
          '-----------------------------------------------------')
    print('Confusion Matrix for train & test set: \n')
    print(confusion_matrix(y_train, y_pred_train),'\n')

    print(confusion_matrix(y_test, y_pred_test))

    print('-----------------------------------------------------')
    print('\nClassification Report for train & test set\n',
          '\nTrain set\n',
          classification_report(y_train, y_pred_train),
          '\n\nTest set\n',
          classification_report(y_test, y_pred_test))

    print('-----------------------------------------------------\n')
    print("Cohen's Kappa for train and test set:\n ",
            round(cohen_kappa_score(y_train, y_pred_train), 4),
            round(cohen_kappa_score(y_test, y_pred_test), 4))

    # fbeta_score requires an explicit beta; beta=2 weights recall above precision (F2).
    print("f2 score for train and test set: \n ",
          round(fbeta_score(y_train, y_pred_train, beta=2), 4),
          round(fbeta_score(y_test, y_pred_test, beta=2), 4))

    print('roc auc score for train and test set:\n ',
          round(roc_auc_score(y_train, y_pred_train), 4),
          round(roc_auc_score(y_test, y_pred_test), 4))

    # Cross-validate the supplied model on the supplied training data instead of
    # silently reading notebook globals for the data.
    if estimator is None:
        estimator = clf  # backward-compatible fallback to the notebook-level model
    estimator = getattr(estimator, 'best_estimator_', estimator)
    print('Mean Cross Validation Score:\n',
          round(cross_val_score(estimator, X_train, y_train, cv=cv).mean(), 4))

## Handling Imbalanced Data

In [None]:
# Class distribution of the target. The Series is passed by keyword because
# newer seaborn releases make `data` the first positional parameter.
sns.countplot(x=y);

In [None]:
print(y.value_counts())
print(y.value_counts(normalize=True))

## SMOTE


In [10]:
# Oversample the minority class on the scaled training split only.
# fit_resample replaces fit_sample, which has been removed from imbalanced-learn;
# a fixed random_state makes the synthetic samples reproducible across runs.
smote = SMOTE(random_state=42)
scaled_X_train_smoted, y_train_smoted = smote.fit_resample(scaled_X_train, y_train)

In [11]:
# smote = SMOTE()
# X_train_smoted, y_train_smoted = smote.fit_sample(X_train, y_train)

## ADASYN

In [13]:
# Adaptive synthetic oversampling of the (unscaled) training split.
# fit_resample replaces fit_sample, which has been removed from imbalanced-learn;
# a fixed random_state makes the synthetic samples reproducible across runs.
adasyn = ADASYN(random_state=42)
X_train_ada, y_train_ada = adasyn.fit_resample(X_train, y_train)

# RANDOM FOREST baseline


In [74]:
clf = RandomForestClassifier(random_state=123)
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [75]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[3850   13]
 [  84 1327]] 

[[1160  140]
 [ 275  183]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       0.98      1.00      0.99      3863
           1       0.99      0.94      0.96      1411

    accuracy                           0.98      5274
   macro avg       0.98      0.97      0.98      5274
weighted avg       0.98      0.98      0.98      5274
 

Test set
               precision    recall  f1-score   support

           0       0.81      0.89      0.85      1300
           1       0.57      0.40      0.47       458

    accuracy                           0.76      1758
   macro avg       0.69      0.65      0.66      1758
weighted avg       0.75      0.76      0.75      1758

---------------------------------------------

In [None]:
# roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

In [None]:
clf.feature_importances_.round(5)

In [None]:
plot_feature_importances(clf)

# RANDOM FOREST smoted

In [76]:
clf = RandomForestClassifier(random_state=123)
clf.fit(scaled_X_train_smoted, y_train_smoted)
y_pred_train = clf.predict(scaled_X_train_smoted)
y_pred_test = clf.predict(scaled_X_test)

In [77]:
model_evaluation(scaled_X_train_smoted, scaled_X_test, y_train_smoted, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[3844   19]
 [  52 3811]] 

[[1117  183]
 [ 216  242]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      3863
           1       1.00      0.99      0.99      3863

    accuracy                           0.99      7726
   macro avg       0.99      0.99      0.99      7726
weighted avg       0.99      0.99      0.99      7726
 

Test set
               precision    recall  f1-score   support

           0       0.84      0.86      0.85      1300
           1       0.57      0.53      0.55       458

    accuracy                           0.77      1758
   macro avg       0.70      0.69      0.70      1758
weighted avg       0.77      0.77      0.77      1758

---------------------------------------------

In [None]:
# roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

In [None]:
clf.feature_importances_.round(5)

In [None]:
plot_feature_importances(clf)

# RANDOM FOREST: grid search, not SMOTEd, class_weight='balanced', pipeline

In [6]:
pipe = Pipeline([('classifier', RandomForestClassifier(random_state=123))])

In [7]:
# Hyper-parameter grid for the pipeline's 'classifier' step.
# min_samples_split starts at 2: scikit-learn reads a float as a fraction of the
# samples, so the former 1.0 entry meant "100% of the samples" and only added a
# degenerate candidate to an already large search.
grid = [{'classifier__criterion': ['gini', 'entropy'],
         'classifier__n_estimators': [10, 20, 50, 100, 150, 200],
         'classifier__max_depth': [3, 4, 5, 6, 7],
         'classifier__min_samples_split': [2, 3, 4, 5, 6, 7, 8, 9, 10],
         'classifier__min_samples_leaf': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
         'classifier__class_weight': ['balanced']}]

In [80]:
# clf = GridSearchCV(estimator= pipe, param_grid=grid, scoring={
#                           'AUC': 'roc_auc', 'Accuracy': make_scorer(_score)}, cv=5, refit='Accuracy')

In [9]:
# Exhaustive search over `grid`, scored by ROC AUC with 5-fold cross-validation.
# n_jobs=-1 spreads the fits over all CPU cores; the grid holds thousands of
# candidates, so a serial run is needlessly slow. The selected model is unaffected.
clf = GridSearchCV(estimator=pipe, param_grid=grid, cv=5, scoring='roc_auc', n_jobs=-1)

In [None]:
# RF.get_params().keys()

In [10]:
# Fit the training data
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [None]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[2936  927]
 [ 241 1170]] 

[[946 354]
 [101 357]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       0.92      0.76      0.83      3863
           1       0.56      0.83      0.67      1411

    accuracy                           0.78      5274
   macro avg       0.74      0.79      0.75      5274
weighted avg       0.83      0.78      0.79      5274
 

Test set
               precision    recall  f1-score   support

           0       0.90      0.73      0.81      1300
           1       0.50      0.78      0.61       458

    accuracy                           0.74      1758
   macro avg       0.70      0.75      0.71      1758
weighted avg       0.80      0.74      0.76      1758

-------------------------------------------------

In [20]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))

0.8772192820252397
0.8340888478333892


In [17]:
clf.best_params_

{'classifier__class_weight': 'balanced',
 'classifier__criterion': 'gini',
 'classifier__max_depth': 7,
 'classifier__min_samples_leaf': 9,
 'classifier__min_samples_split': 2,
 'classifier__n_estimators': 20}

In [None]:
# `clf` is the fitted GridSearchCV here and has no feature_importances_ of its
# own; read them from the 'classifier' step of the refit best pipeline.
clf.best_estimator_.named_steps['classifier'].feature_importances_.round(5)

In [None]:
# Pass the tuned forest itself: the GridSearchCV wrapper does not expose feature_importances_.
plot_feature_importances(clf.best_estimator_.named_steps['classifier'])

# Bagging Classifier

In [14]:
# Bagged ensemble of 20 depth-5 decision trees. Seeding both the base tree and
# the bagging wrapper makes the bootstrap samples and splits reproducible,
# in line with the seeded random forests used elsewhere in this notebook.
clf = BaggingClassifier(DecisionTreeClassifier(criterion='gini', max_depth=5, random_state=123),
                        n_estimators=20,
                        random_state=123)

In [15]:
clf.fit(X_train, y_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [16]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[3537  326]
 [ 664  747]] 

[[1176  124]
 [ 239  219]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       0.84      0.92      0.88      3863
           1       0.70      0.53      0.60      1411

    accuracy                           0.81      5274
   macro avg       0.77      0.72      0.74      5274
weighted avg       0.80      0.81      0.80      5274
 

Test set
               precision    recall  f1-score   support

           0       0.83      0.90      0.87      1300
           1       0.64      0.48      0.55       458

    accuracy                           0.79      1758
   macro avg       0.73      0.69      0.71      1758
weighted avg       0.78      0.79      0.78      1758

---------------------------------------------

In [18]:
# roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

# Xgboost

In [34]:
# XGBoost classifier fitted on the scaled, SMOTE-resampled training data.
# NOTE(review): scale_pos_weight=3 up-weights the positive class on top of data
# that SMOTE has already resampled -- presumably deliberate to favour recall; confirm.
clf = xgb.XGBClassifier(
    objective='binary:logistic',
    learning_rate=0.05,
    n_estimators=140,
    max_depth=3,
    scale_pos_weight=3,
    nthread=4,
    seed=42,
)
clf.fit(scaled_X_train_smoted, y_train_smoted)

# Hard class predictions for both splits
y_pred_train = clf.predict(scaled_X_train_smoted)
y_pred_test = clf.predict(scaled_X_test)

In [35]:
model_evaluation(scaled_X_train_smoted, scaled_X_test, y_train_smoted, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[2348 1515]
 [ 121 3742]] 

[[754 546]
 [ 45 413]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       0.95      0.61      0.74      3863
           1       0.71      0.97      0.82      3863

    accuracy                           0.79      7726
   macro avg       0.83      0.79      0.78      7726
weighted avg       0.83      0.79      0.78      7726
 

Test set
               precision    recall  f1-score   support

           0       0.94      0.58      0.72      1300
           1       0.43      0.90      0.58       458

    accuracy                           0.66      1758
   macro avg       0.69      0.74      0.65      1758
weighted avg       0.81      0.66      0.68      1758

-------------------------------------------------

In [None]:
# roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

In [None]:
clf.feature_importances_.round(5)

In [None]:
plot_feature_importances(clf)

## Gridsearch XGBoost

In [37]:
estimator = xgb.XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=3,
    seed=42)

In [28]:
grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05, ]
}

In [36]:
# Exhaustive search over `grid` for the XGBoost `estimator`, scored by accuracy
# with 10-fold cross-validation and up to 10 parallel workers.
# NOTE(review): this cell is identical to the search under the later
# "scoring=accuracy" heading, yet the best_params_ recorded for the two runs
# differ -- presumably this one was originally executed with a different
# `scoring`; confirm before relying on its result.
clf = GridSearchCV(estimator=estimator, param_grid=grid, scoring='accuracy',
                   cv=10, n_jobs=10, verbose=True)

In [32]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   10.8s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  1.3min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  3.9min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed:  7.7min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed: 10.0min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=3, seed=42, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=10,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': range(2, 10),
                         'n_estimators': range(60, 220, 40)}

In [33]:
clf.best_params_


{'learning_rate': 0.05, 'max_depth': 3, 'n_estimators': 140}

# scoring=accuracy

In [38]:
estimator = xgb.XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    scale_pos_weight=3,
    seed=42)

In [39]:
grid = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05, ]
}

In [40]:
clf = GridSearchCV(
    estimator=estimator,
    param_grid=grid,
    scoring = 'accuracy',
    n_jobs = 10,
    cv = 10,
    verbose=True
)

In [41]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    8.8s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  1.1min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed:  9.9min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 23.4min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed: 32.0min finished


GridSearchCV(cv=10, error_score='raise-deprecating',
             estimator=XGBClassifier(base_score=0.5, booster='gbtree',
                                     colsample_bylevel=1, colsample_bynode=1,
                                     colsample_bytree=1, gamma=0,
                                     learning_rate=0.1, max_delta_step=0,
                                     max_depth=3, min_child_weight=1,
                                     missing=None, n_estimators=100, n_jobs=1,
                                     nthread=4, objective='binary:logistic',
                                     random_state=0, reg_alpha=0, reg_lambda=1,
                                     scale_pos_weight=3, seed=42, silent=None,
                                     subsample=1, verbosity=1),
             iid='warn', n_jobs=10,
             param_grid={'learning_rate': [0.1, 0.01, 0.05],
                         'max_depth': range(2, 10),
                         'n_estimators': range(60, 220, 40)}

In [44]:
clf.best_params_


{'learning_rate': 0.1, 'max_depth': 9, 'n_estimators': 180}

In [45]:
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [None]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

MODEL EVALUATION METRICS:
 -----------------------------------------------------
Confusion Matrix for train & test set: 

[[3658  205]
 [   2 1409]] 

[[1019  281]
 [ 172  286]]
-----------------------------------------------------

Classification Report for train & test set
 
Train set
               precision    recall  f1-score   support

           0       1.00      0.95      0.97      3863
           1       0.87      1.00      0.93      1411

    accuracy                           0.96      5274
   macro avg       0.94      0.97      0.95      5274
weighted avg       0.97      0.96      0.96      5274
 

Test set
               precision    recall  f1-score   support

           0       0.86      0.78      0.82      1300
           1       0.50      0.62      0.56       458

    accuracy                           0.74      1758
   macro avg       0.68      0.70      0.69      1758
weighted avg       0.76      0.74      0.75      1758

---------------------------------------------

[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   26.5s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  4.0min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 12.8min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 26.2min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed: 35.6min finished


Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   20.7s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  4.4min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 21.5min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 36.3min
[Parallel(n_jobs=10)]: Done 960 out of 960 | elapsed: 45.7min finished


Fitting 10 folds for each of 96 candidates, totalling 960 fits


[Parallel(n_jobs=10)]: Using backend LokyBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:   22.5s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:  4.2min
[Parallel(n_jobs=10)]: Done 430 tasks      | elapsed: 13.7min
[Parallel(n_jobs=10)]: Done 780 tasks      | elapsed: 28.0min


# Gradient Boosting

In [None]:
# Gradient boosting baseline with default hyper-parameters.
# A fixed random_state keeps the model reproducible; this matters because
# roc_curve_and_auc refits `clf`, and the refit should match the model
# whose predictions are evaluated here.
clf = GradientBoostingClassifier(random_state=123)
clf.fit(X_train, y_train)

# Hard class predictions for both splits
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [None]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

In [None]:
roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

In [None]:
clf.feature_importances_.round(5)

In [None]:
plot_feature_importances(clf)

# Adaboost

In [None]:
# AdaBoost baseline with default hyper-parameters.
# A fixed random_state keeps the model reproducible; this matters because
# roc_curve_and_auc refits `clf`, and the refit should match the model
# whose predictions are evaluated here.
clf = AdaBoostClassifier(random_state=123)
clf.fit(X_train, y_train)

# Hard class predictions for both splits
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)

In [None]:
model_evaluation(X_train, X_test, y_train, y_test, y_pred_train, y_pred_test)

In [None]:
roc_curve_and_auc(clf, X_train, X_test, y_train, y_test)

In [None]:
clf.feature_importances_

In [None]:
plot_feature_importances(clf)

Feature Importance
Class Weight
Over/Under Sampling: SMOTE (done)
Stratify
Radius Neighbors Classifier (there are already too many models)
Bagging (done)
make_scorer

simple imputer
n_jobs = -1

cohen_kappa_score
Cohen suggested the Kappa result be interpreted as follows: values ≤ 0 as indicating no agreement and 0.01–0.20 as none to slight, 0.21–0.40 as fair, 0.41– 0.60 as moderate, 0.61–0.80 as substantial, and 0.81–1.00 as almost perfect agreement.

https://www.kaggle.com/stuarthallows/using-xgboost-with-scikit-learn