###### Setting the working directory

In [None]:
import os

# Directory that holds the CSV input read later in this notebook.
# Set the T5_WORK_DIR environment variable to run on another machine;
# when it is unset the original location is used.
WORK_DIR = os.environ.get('T5_WORK_DIR', u'C:/Users/dongwan.kim/Desktop/da_work/4000_게임과 로그/T5')
os.chdir(WORK_DIR)
os.getcwd()  # last expression of the cell: shows the directory now in use

###### Data preparation

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import recall_score, roc_curve

# Load the tracking log: the first column is used as the target,
# every remaining column as a feature.
df = pd.read_csv('20171212_T5_data003_Mobile_T5__log_stat_tracking.csv')
label_col = df.columns[0]
y = df.iloc[:, 0].values
X = df.drop(labels=label_col, axis='columns').values

# Hold out 30% of the rows for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)

# Fit the scaler on the training rows only, then apply it to both splits.
stdsc = StandardScaler()
stdsc.fit(X_train)
X_train_std = stdsc.transform(X_train)
X_test_std = stdsc.transform(X_test)

class_labels = ['stay', 'churn']

###### How to handle class imbalance

- https://beckernick.github.io/oversampling-modeling/

In [None]:
# Frequency of each value in the target (first) column.
label_series = df.iloc[:, 0]
label_series.value_counts()

###### Applying SMOTE

In [None]:
from imblearn.over_sampling import SMOTE

# `ratio=` and `fit_sample()` were removed from imbalanced-learn; their
# replacements are `sampling_strategy=` and `fit_resample()`.
# sampling_strategy=1.0 asks for a 1:1 minority/majority ratio after
# resampling (a float is only valid for a binary target).
# Only the standardized training split is resampled.
sm = SMOTE(random_state=0, sampling_strategy=1.0)
X_train_sm, y_train_sm = sm.fit_resample(X_train_std, y_train)

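The data sets we now have:
- (X_train, y_train), (X_test, y_test): raw features
- (X_train_std, y_train), (X_test_std, y_test): standardized features
- (X_train_sm, y_train_sm), (X_test_std, y_test): standardized features, with SMOTE applied to the training split only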

In [None]:
# Count of 0-labels and sum of labels in the training target,
# before and after oversampling.
for caption, labels in (('Before applying SMOTE - 0:', y_train),
                        ('After applying SMOTE  - 0:', y_train_sm)):
    zero_count = len([value for value in labels if value == 0])
    print(caption, zero_count, '1:', sum(labels))

###### Performance metrics

In [None]:
from sklearn.metrics import confusion_matrix, recall_score, roc_curve, classification_report
import matplotlib.pyplot as plt
import itertools
% matplotlib inline


class PerfMatrics():
    """Print and plot binary-classification metrics for a fitted model.

    Predictions are computed once, at construction time, and reused by every
    report method.

    Parameters
    ----------
    y_actual : array-like
        True labels for ``X``. The class expects a binary target whose
        positive class is labelled 1: the confusion matrix is unpacked into
        four cells, and recall/ROC are computed with ``pos_label=1``.
    X : array-like
        Feature matrix handed to ``model``.
    model : fitted estimator
        Must provide ``predict``. ``predict_proba`` (preferred) or
        ``decision_function`` is used for the ROC curve when available.
    class_labels : sequence of str, optional
        Display names for the classes, in label order. When omitted, the
        confusion-matrix axes are labelled with the class indices.
    """

    def __init__(self, y_actual, X, model, class_labels=None):
        self.y_actual = y_actual
        self.model = model
        self.X = X
        self.class_labels = class_labels

        self.y_predict = self.model.predict(X)
        # Continuous score for the positive class, or None when the model
        # exposes neither predict_proba nor decision_function.
        self.y_predict_proba = self._positive_class_scores(self.model, X)

        self.conf_matrix = confusion_matrix(self.y_actual, self.y_predict)
        # ravel() of a 2x2 confusion matrix yields tn, fp, fn, tp in that order.
        self.tn, self.fp, self.fn, self.tp = self.conf_matrix.ravel()

    @staticmethod
    def _positive_class_scores(model, X):
        """Return per-sample scores for the positive class, or None.

        ``predict_proba`` is preferred (second column = class 1). Models
        without it (for example ``SVC`` fitted without ``probability=True``)
        fall back to ``decision_function``, which ``roc_curve`` accepts as a
        score as well. None is returned when the model offers neither.
        """
        if hasattr(model, 'predict_proba'):
            return model.predict_proba(X)[:, 1]
        if hasattr(model, 'decision_function'):
            return model.decision_function(X)
        return None

    def print_all_matrics(self):
        """Print accuracy, recall and the classification report, then plot
        the confusion matrix and (when scores are available) the ROC curve."""
        self._print_accuracy()
        self._print_recall()
        self._print_classification_report(self.y_actual, self.y_predict, self.class_labels)

        # A dedicated figure instead of a global rcParams change, so other
        # plots in the session keep their own size.
        plt.figure(figsize=(11, 5))

        plt.subplot(1, 2, 1)
        self._plot_confusion_matrix(self.conf_matrix, self.class_labels)

        if self.y_predict_proba is not None:
            plt.subplot(1, 2, 2)
            self._plot_roc_curve()

        # Re-run the layout once every panel and label exists.
        plt.tight_layout()

    def _plot_confusion_matrix(self, cm, classes,
                               normalize=False,
                               title='Confusion matrix',
                               cmap=None):
        """
        Print and plot the confusion matrix ``cm``.

        ``classes`` supplies the tick labels; None falls back to the class
        indices. ``normalize=True`` divides each row by its sum before
        printing and plotting. ``cmap`` defaults to ``plt.cm.Blues``.
        """
        print('---------Confusion matrix---------')

        if classes is None:
            # No display names given: label the axes 0..n-1.
            classes = [str(idx) for idx in range(cm.shape[0])]
        if cmap is None:
            cmap = plt.cm.Blues

        if normalize:
            cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
            print("Normalized confusion matrix")
        else:
            print('Confusion matrix, without normalization')

        print(cm)

        plt.imshow(cm, interpolation='nearest', cmap=cmap)
        plt.title(title)
        plt.colorbar()
        tick_marks = np.arange(len(classes))
        plt.xticks(tick_marks, classes, rotation=45)
        plt.yticks(tick_marks, classes)

        fmt = '.2f' if normalize else 'd'
        # Cells darker than the midpoint get white text so it stays readable.
        thresh = cm.max() / 2.
        for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
            plt.text(j, i, format(cm[i, j], fmt),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

        plt.ylabel('True label')
        plt.xlabel('Predicted label')
        # Lay out after the axis labels are set so they are taken into account.
        plt.tight_layout()
        print('')

    def _print_classification_report(self, y_actual, y_predict, class_labels=None):
        """Print scikit-learn's classification report for the given labels."""
        print('---------Classification report---------')
        print(classification_report(y_actual, y_predict, target_names=class_labels))
        print('')

    def _print_accuracy(self):
        """Print accuracy computed from the stored confusion-matrix cells."""
        print('---------Accuracy---------')
        accuracy = (self.tp + self.tn) / (self.tn + self.fp + self.fn + self.tp)
        print(accuracy)
        print('')

    def _print_recall(self):
        """Print recall of the stored predictions (scikit-learn default positive label)."""
        print('---------Recall---------')
        recall = recall_score(self.y_actual, self.y_predict)
        print(recall)
        print('')

    def _plot_roc_curve(self):
        """Plot the ROC curve of the stored positive-class scores."""
        print('---------ROC Curve---------')
        fpr, tpr, thresholds = roc_curve(self.y_actual, self.y_predict_proba, pos_label=1)
        plt.plot(fpr, tpr)
        # Diagonal reference line: TPR equal to FPR.
        plt.plot([0, 1], [0, 1], linestyle='--', color='gray')
        plt.title('ROC curve')
        plt.ylabel('TPR')
        plt.xlabel('FPR')

        print('')

###### Logistic regression

In [None]:
from sklearn.linear_model import LogisticRegression

# An L1 penalty needs a solver that supports it. scikit-learn's default
# solver (lbfgs since 0.22) does not, so liblinear is requested explicitly;
# liblinear was the default in older releases, so results there are unchanged.
lr = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr.fit(X_train_std, y_train)

print('Training accuracy:', lr.score(X_train_std, y_train))
print('Test accuracy:', lr.score(X_test_std, y_test))

Performance metrics for the training data with plain LR

In [None]:
# Preview only the first few training labels instead of dumping the whole array.
y_train[:10]

In [None]:
# Preview the predicted probability of class 1 for the first few training rows
# (scoring only those rows rather than the whole training set).
lr.predict_proba(X_train_std[:10])[:, 1]

In [None]:
# Report for the plain logistic regression on the standardized training split.
pm_tra_lr = PerfMatrics(
    model=lr,
    X=X_train_std,
    y_actual=y_train,
    class_labels=class_labels,
)
pm_tra_lr.print_all_matrics()

Performance metrics for the test data with plain LR

In [None]:
# Report for the plain logistic regression on the standardized test split.
pm_tst_lr = PerfMatrics(
    model=lr,
    X=X_test_std,
    y_actual=y_test,
    class_labels=class_labels,
)
pm_tst_lr.print_all_matrics()

Imbalanced or not?

In [None]:
# Mean of the labels (the fraction of 1s when labels are 0/1),
# for the full data and for each split.
for caption, labels in [('total y balance:', y),
                        ('training set y balance:', y_train),
                        ('test set y balance:', y_test)]:
    print(caption, sum(labels) / len(labels))

In [None]:
# Accuracy of the plain logistic regression on both splits.
lr_train_accuracy = lr.score(X_train_std, y_train)
lr_test_accuracy = lr.score(X_test_std, y_test)
print('LR, Training accuracy:', lr_train_accuracy)
print('LR, Test accuracy:', lr_test_accuracy)

###### Logistic Regression with SMOTE

In [None]:
# Same model settings as the plain LR, trained on the SMOTE-resampled data.
# solver='liblinear' is required for the L1 penalty: scikit-learn's default
# solver (lbfgs since 0.22) rejects penalty='l1'.
lr_sm = LogisticRegression(penalty='l1', C=0.1, solver='liblinear')
lr_sm.fit(X_train_sm, y_train_sm)

Performance metrics for the training data with LR & SMOTE

In [None]:
# Report for the SMOTE-trained logistic regression, scored on the
# standardized (not resampled) training split.
pm_tra_lr_sm = PerfMatrics(
    model=lr_sm,
    X=X_train_std,
    y_actual=y_train,
    class_labels=class_labels,
)
pm_tra_lr_sm.print_all_matrics()

Performance metrics for the test data with LR & SMOTE

In [None]:
# Report for the SMOTE-trained logistic regression on the standardized test split.
pm_tst_lr_sm = PerfMatrics(
    model=lr_sm,
    X=X_test_std,
    y_actual=y_test,
    class_labels=class_labels,
)
pm_tst_lr_sm.print_all_matrics()

###### Random Forest

- How to tune the hyper-parameters of an RF model?
  - https://stackoverflow.com/questions/36107820/how-to-tune-parameters-in-random-forest-using-scikit-learn
  - https://www.fabienplisson.com/random-forest-and-grid-search/

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 50 trees trained on the standardized training split.
rf = RandomForestClassifier(random_state=12, n_estimators=50)
rf.fit(X_train_std, y_train)

rf_test_accuracy = rf.score(X_test_std, y_test)
rf_test_recall = recall_score(y_test, rf.predict(X_test_std))
print('RF, Test accuracy:', rf_test_accuracy)
print('RF, Test recall:', rf_test_recall)

In [None]:
# Report for the random forest on the standardized test split.
pm_tst_rf = PerfMatrics(
    model=rf,
    X=X_test_std,
    y_actual=y_test,
    class_labels=class_labels,
)
pm_tst_rf.print_all_matrics()

###### RF with SMOTE

In [None]:
# Same forest settings, trained on the SMOTE-resampled training data.
rf_sm = RandomForestClassifier(random_state=12, n_estimators=50)
rf_sm.fit(X_train_sm, y_train_sm)

rf_sm_test_accuracy = rf_sm.score(X_test_std, y_test)
rf_sm_test_recall = recall_score(y_test, rf_sm.predict(X_test_std))
print('RF, Test accuracy with SMOTE:', rf_sm_test_accuracy)
print('RF, Test recall with SMOTE:', rf_sm_test_recall)

In [None]:
# Report for the SMOTE-trained random forest on the standardized test split.
pm_tst_rf_sm = PerfMatrics(
    model=rf_sm,
    X=X_test_std,
    y_actual=y_test,
    class_labels=class_labels,
)
pm_tst_rf_sm.print_all_matrics()

###### SVM

In [None]:
# Raises ZeroDivisionError, which halts a "Run All" at this cell.
# NOTE(review): presumably a deliberate guard so the SVM cells below are only
# run by hand — confirm, and consider replacing it with an explicit flag.
1/0

In [None]:
# Interactive lookup: prints the full help text of sklearn's SVC class.
from sklearn import svm
help(svm.SVC)

In [None]:
# Number of rows in the SMOTE-resampled training set.
X_train_sm.shape[0]

In [None]:
from sklearn import svm

# Linear-kernel SVM trained on the SMOTE-resampled data.
# The model is bound to `sv_sm` (matching `lr_sm` / `rf_sm`) because the
# evaluation cell that follows reads it under that name; binding it only to
# `sv` left `sv_sm` undefined and raised NameError there.
sv_sm = svm.SVC(kernel='linear', C=1.0, random_state=0, cache_size=7000)
sv = sv_sm  # keep the old name available for any code that still refers to it
# NOTE: SVC only offers predict_proba when built with probability=True;
# without it, continuous scores come from decision_function.
sv_sm.fit(X_train_sm, y_train_sm)

In [None]:
# Report for the SMOTE-trained SVM on the standardized test split.
pm_tst_svm_sm = PerfMatrics(
    model=sv_sm,
    X=X_test_std,
    y_actual=y_test,
    class_labels=class_labels,
)
pm_tst_svm_sm.print_all_matrics()