# Repeat Call Classifier

## Import packages

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
import sys
sys.path.insert(0, '../scripts')

In [None]:
from clustering import *

## Read data

In [None]:
df = pd.read_csv('../datasets/data.csv')
df.head()

In [None]:
import string

# Translation table that deletes every ASCII punctuation character.
translator = str.maketrans('', '', string.punctuation)

def tokenize(doc):
    """Split a document into lower-cased, punctuation-free tokens.

    Parameters
    ----------
    doc : str or missing value
        Raw document text. Anything that is not a ``str`` (for example the
        ``NaN``/``None`` pandas uses for an empty cell) is treated as an
        empty document.

    Returns
    -------
    list of str
        Whitespace-separated tokens; ``[]`` for empty or missing input.
    """
    if not isinstance(doc, str):
        # A missing cell would otherwise fail on ``.translate``.
        return []
    return doc.translate(translator).lower().split()

In [None]:
# Pre-process 
# df['tokens'] = preproc_driver(df['doc'])
df['tokens'] = df['doc'].apply(tokenize)

## Split data into train and test

In [None]:
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction import DictVectorizer
from scipy import interp
from xgboost import XGBClassifier, plot_importance
from sklearn.preprocessing import Imputer
from sklearn.metrics import classification_report
from sklearn_pandas import DataFrameMapper, gen_features, CategoricalImputer
from sklearn.decomposition import PCA

In [None]:
df_train, df_test = train_test_split(df, stratify=df['Call Order'], test_size=0.2, random_state=42)

In [None]:
df_train.shape

In [None]:
df_test.shape

In [None]:
df_train['Call Order'].value_counts()

In [None]:
df_test['Call Order'].value_counts()

## Pipelines

In [None]:
from gensim.models import FastText

# trained fasttext embeddings
embeddings = FastText.load_fasttext_format('./vectors.bin')

In [None]:
# !wget https://s3-us-west-1.amazonaws.com/fasttext-vectors/wiki.en.zip

In [None]:
# !unzip wiki.en.zip

In [None]:
embeddings_pretrained = FastText.load_fasttext_format('./wiki.en.bin')

In [None]:
# !wget http://nlp.stanford.edu/data/glove.42B.300d.zip

In [None]:
# !unzip glove.42B.300d.zip

In [None]:
# from gensim.scripts.glove2word2vec import glove2word2vec
# glove2word2vec(glove_input_file="glove.42B.300d.txt", word2vec_output_file="gensim_glove_vectors.txt")

In [None]:
from gensim.models.keyedvectors import KeyedVectors
embeddings_glove_pretrained = KeyedVectors.load_word2vec_format("gensim_glove_vectors.txt", binary=False)

In [None]:
from sklearn.base import BaseEstimator, TransformerMixin
from gensim.corpora import Dictionary
from gensim.models.tfidfmodel import TfidfModel

class TfidfWeightedEmbeddings(BaseEstimator, TransformerMixin):
    """Turn tokenized documents into TF-IDF weighted word-embedding vectors.

    ``fit`` and ``transform`` expect an iterable of token lists (one list per
    document). Each document becomes the sum of its tokens' embedding vectors,
    every vector scaled by ``tfidf_weight * 100``, divided by the number of
    tokens in the document.

    Parameters
    ----------
    embeddings : embedding model
        Either a model exposing its vectors under ``.wv`` or the vectors
        object itself; it must support ``token in vectors``,
        ``vectors[token]`` and ``vectors.vector_size``.
    """

    def __init__(self, embeddings):
        # Stored under the constructor argument's own name so that
        # BaseEstimator.get_params() can find it.
        self.embeddings = embeddings
        # Kept for backward compatibility with code that reads ``.model``.
        self.model = embeddings
        self.embedding_dim = self._keyed_vectors().vector_size

    def _keyed_vectors(self):
        """Return the token -> vector lookup (``model.wv`` when present)."""
        return getattr(self.model, 'wv', self.model)

    def fit(self, X, y=None, **fit_params):
        """Build the vocabulary and the TF-IDF model from tokenized docs ``X``."""
        self.dictionary = Dictionary(X)
        self.corpus = [self.dictionary.doc2bow(doc) for doc in X]
        self.tfidf = TfidfModel(self.corpus, id2word=self.dictionary)
        return self

    def transform(self, X, **transform_params):
        """Return an array with one ``embedding_dim``-sized row per document."""
        doc_vectors = [
            self.tfidf_word_vectors(self.tfidf[self.dictionary.doc2bow(sent)], sent)
            for sent in X
        ]
        return np.array(doc_vectors)

    def tfidf_word_vectors(self, tfidf_scores, tokens):
        """Combine the embeddings of ``tokens`` using their TF-IDF weights.

        Parameters
        ----------
        tfidf_scores : iterable of (token_id, weight)
            TF-IDF vector of the document.
        tokens : list of str
            The document's tokens (repeated tokens contribute repeatedly).

        Returns
        -------
        numpy.ndarray
            Vector of length ``embedding_dim``; all zeros for an empty
            document or when no token is known.
        """
        res = np.zeros(self.embedding_dim)
        n = len(tokens)
        if n == 0:
            return res
        weights = dict(tfidf_scores)
        vectors = self._keyed_vectors()
        token2id = self.dictionary.token2id
        for token in tokens:
            token_id = token2id.get(token)
            if token_id is None or token not in vectors:
                continue
            # The TF-IDF vector can omit a term that is in the dictionary
            # (gensim drops near-zero weights, e.g. a token that occurs in
            # every training document), so a missing id counts as weight 0.
            weight = weights.get(token_id, 0.0)
            if weight:
                res += weight * 100 * vectors[token]
        return res / n

    def get_feature_names(self):
        """Return one name per output dimension."""
        # Follows the real embedding size instead of a hard-coded 100.
        return ['embeddingDim{}'.format(i) for i in range(self.embedding_dim)]

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

def identity_fun(doc):
    """Return ``doc`` unchanged."""
    return doc

# Both the tokenizer and the preprocessor are replaced by the identity
# function, so the vectorizer consumes its input as-is (the 'tokens' column
# it is applied to already holds token lists).
tfidf = TfidfVectorizer(
    analyzer='word',
    tokenizer=identity_fun,
    preprocessor=identity_fun,
    token_pattern=None) 

In [None]:
# One CategoricalImputer(missing_values=-1) definition per categorical column.
feature_def = gen_features(
        columns=['PLAN', 'PRODUCT_LINE', 'Location', 'REGIONCODE', 'nvcDialedNumber'],
        classes=[{'class': CategoricalImputer, 'missing_values': -1}]
    )

# Text tokens go through the TF-IDF vectorizer; the numeric tenure column is
# imputed with Imputer's default settings.
feature_def.extend([
    ('tokens', tfidf),
    (['CSR_Tenure_months'], Imputer())
])

# NOTE(review): default=None presumably passes every column not listed above
# through untouched — confirm against the sklearn-pandas documentation.
mapper = DataFrameMapper(feature_def, df_out=True, default=None)

In [None]:
# Same categorical imputation as the TF-IDF mapper, one definition per column.
feature_def2 = gen_features(
        columns=['PLAN', 'PRODUCT_LINE', 'Location', 'REGIONCODE', 'nvcDialedNumber'],
        classes=[{'class': CategoricalImputer, 'missing_values': -1}]
    )

# Text tokens become TF-IDF weighted GloVe embeddings; the numeric tenure
# column is imputed with Imputer's default settings.
feature_def2.extend([
    ('tokens', TfidfWeightedEmbeddings(embeddings_glove_pretrained)),
    (['CSR_Tenure_months'], Imputer())
])

# NOTE(review): default=None presumably passes every column not listed above
# through untouched — confirm against the sklearn-pandas documentation.
mapper2 = DataFrameMapper(feature_def2, df_out=True, default=None)

## Under-sampling

In [None]:
def do_undersample(X, y):
    """Balance the classes by randomly dropping rows of class 0.

    Class 0 is sampled (without replacement) down to the number of class 1
    rows; all class 1 rows are kept. Sampling and the final shuffle both use
    a fixed seed, so repeated calls on the same data return the same result.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Binary labels (0 / 1) aligned with ``X``. An unnamed series is
        treated as 'Call Order'.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The resampled, shuffled features and their labels.
    """
    target = y.name if y.name is not None else 'Call Order'
    df = pd.concat([X, y.rename(target)], axis=1)
    n_minority = y.value_counts()[1]
    df_call0 = df[df[target] == 0].sample(n=n_minority, random_state=42)
    df_call1 = df[df[target] == 1]
    # Shuffle with a fixed seed; an unseeded shuffle made every run differ.
    df_undersample = pd.concat([df_call0, df_call1]).sample(frac=1, random_state=42)
    counts = df_undersample[target].value_counts()
    assert counts[0] == counts[1]
    # The label was concatenated last, so it is the final column.
    new_X = df_undersample.iloc[:, :-1]
    new_y = df_undersample.iloc[:, -1]
    return new_X, new_y

## Over-sampling

In [None]:
def do_oversample(X, y):
    """Balance the classes by randomly repeating rows of class 1.

    Class 1 is sampled with replacement up to the number of class 0 rows;
    all class 0 rows are kept. Sampling and the final shuffle both use a
    fixed seed, so repeated calls on the same data return the same result.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : pandas.Series
        Binary labels (0 / 1) aligned with ``X``. An unnamed series is
        treated as 'Call Order'.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The resampled, shuffled features and their labels.
    """
    target = y.name if y.name is not None else 'Call Order'
    df = pd.concat([X, y.rename(target)], axis=1)
    n_majority = y.value_counts()[0]
    df_call1 = df[df[target] == 1].sample(n=n_majority, replace=True, random_state=42)
    df_call0 = df[df[target] == 0]
    # Shuffle with a fixed seed; an unseeded shuffle made every run differ.
    df_oversample = pd.concat([df_call0, df_call1]).sample(frac=1, random_state=42)
    counts = df_oversample[target].value_counts()
    assert counts[0] == counts[1]
    # The label was concatenated last, so it is the final column.
    new_X = df_oversample.iloc[:, :-1]
    new_y = df_oversample.iloc[:, -1]
    return new_X, new_y

## Helper Functions

In [None]:
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score

def plot_roc_pr_curves(y_test, prob, xgb_model=False):
    """Plot the ROC and precision-recall curves side by side.

    ``prob`` is used as-is when ``xgb_model`` is true; otherwise its second
    column (``prob[:, 1]``) is taken as the positive-class score.

    Returns the pair ``(roc_auc, average_precision)``.
    """
    scores = prob if xgb_model else prob[:, 1]

    fpr, tpr, _ = roc_curve(y_test, scores)
    roc_auc = auc(fpr, tpr)
    precision, recall, _ = precision_recall_curve(y_test, scores)
    average_p = average_precision_score(y_test, scores)

    # (x label, y label, title, x values, y values, legend text) per panel.
    panels = [
        ('False Positive Rate', 'True Positive Rate', 'ROC Curve',
         fpr, tpr, 'Area under ROC Curve = %0.2f'%roc_auc),
        ('Recall', 'Precision', 'PR Curve',
         recall, precision, 'Area under PR Curve = %0.2f'%average_p),
    ]

    fig = plt.figure(figsize=(10,4))
    for position, (xlabel, ylabel, title, xs, ys, legend_text) in enumerate(panels, start=1):
        axis = fig.add_subplot(1, 2, position)
        axis.set_xlim([-0.05,1.05])
        axis.set_ylim([-0.05,1.05])
        axis.set_xlabel(xlabel)
        axis.set_ylabel(ylabel)
        axis.set_title(title)
        axis.plot(xs, ys, lw=1, label=legend_text)
        axis.legend(loc='lower right')
    plt.show()

    return roc_auc, average_p

In [None]:
import seaborn as sns

def plot_cm(y_test, pred):
    """Draw the confusion matrix of ``pred`` against ``y_test`` as a heatmap.

    Rows and columns are labelled with the module-level ``target_names``.
    """
    labelled_counts = pd.DataFrame(
        confusion_matrix(y_test, pred),
        index=target_names,
        columns=target_names,
    )
    _, axis = plt.subplots(figsize=(4,3))
    sns.heatmap(labelled_counts, annot=True, fmt="d", ax=axis)
    axis.set_ylabel('True label')
    axis.set_xlabel('Predicted label')

In [None]:
target_names = ['Non-Repeat', 'Repeat']

In [None]:
# Copy of the training frame with one zero-filled float column per base model.
train_meta = df_train.copy().assign(xgb=0.0, lr=0.0, rf=0.0)

In [None]:
test_meta = df_test.copy()

# For every base model: one averaged column plus one column per CV fold,
# all zero-filled. Generating the names in a loop avoids the earlier
# copy-paste slip where 'lr_fold_0' was created five times and
# 'lr_fold_1'..'lr_fold_4' were never initialised.
for _model in ('xgb', 'lr', 'rf'):
    test_meta[_model + '_avg'] = np.zeros(test_meta.shape[0])
    for _fold in range(5):
        test_meta['{}_fold_{}'.format(_model, _fold)] = np.zeros(test_meta.shape[0])


In [None]:
def do_kfold_cv(df_train, df_test, mapper, clf, n_splits=5, mode='original', feat_imp=False, name='model_1'):
    """Run stratified k-fold cross-validation for a scikit-learn style classifier.

    For every fold, ``mapper`` and ``clf`` are re-fitted on the training part,
    evaluated on the validation part, and used to predict ``df_test``.
    Per-fold metrics are printed and plotted, followed by a summary.

    Side effects on the module-level frames:
      * ``train_meta[name]`` receives the out-of-fold positive-class
        probabilities.
      * ``test_meta[name + '_fold_<i>']`` receives each fold's test
        probabilities and ``test_meta[name + '_avg']`` their mean.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that contain the 'Call Order' label column.
    mapper : transformer
        Object exposing ``fit_transform`` and ``transform``.
    clf : classifier
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    n_splits : int
        Number of stratified folds.
    mode : str
        'undersample' or 'oversample' rebalances the training part of each
        fold; any other value leaves it untouched.
    feat_imp : bool
        When true, ``plot_importance`` is called on the fitted ``clf``.
    name : str
        Column name / prefix used in ``train_meta`` and ``test_meta``.
    """
    # No random_state: without shuffle=True the seed has no effect on the
    # splits, and newer scikit-learn versions reject that combination.
    kf = StratifiedKFold(n_splits=n_splits)
    X_train = df_train.drop(['Call Order'], axis=1)
    y_train = df_train['Call Order']

    X_test = df_test.drop(['Call Order'], axis=1)

    test_pred = np.zeros((X_test.shape[0]))

    accuracies = []
    rocs = []
    prs = []

    for i, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        print('_'*100)
        print('#'*26)
        print('###### Doing Fold {} ######'.format(i))
        print('#'*26)
        X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
        # Resample only the training part; validation keeps its distribution.
        if mode == 'undersample':
            X_train_kf, y_train_kf = do_undersample(X_train_kf, y_train_kf)
        elif mode == 'oversample':
            X_train_kf, y_train_kf = do_oversample(X_train_kf, y_train_kf)
        print('X_train_kf shape: ', X_train_kf.shape)
        print('X_val_kf shape: ', X_val_kf.shape)
        # Fit the feature mapper and classifier on this fold's training part.
        clf.fit(mapper.fit_transform(X_train_kf), y_train_kf)
        # Transform the validation rows once and reuse the result.
        val_features = mapper.transform(X_val_kf)
        val_pred = clf.predict(val_features)
        val_probas = clf.predict_proba(val_features)
        # Out-of-fold probabilities, addressed by row label
        # (assumes train_meta shares df_train's index — TODO confirm).
        train_meta.loc[X_val_kf.index, name] = val_probas[:, 1]

        test_probas = clf.predict_proba(mapper.transform(X_test))[:, 1]
        test_meta[name + '_fold_{}'.format(i)] = test_probas
        test_pred += test_probas
        # Print model metrics
        fold_accuracy = accuracy_score(y_val_kf, val_pred)
        print('Accuracy: ', fold_accuracy)
        accuracies.append(fold_accuracy)
        print('Classification Report: ')
        print(classification_report(y_val_kf, val_pred))
        # Plot confusion matrix
        plot_cm(y_val_kf, val_pred)
        # Plot the ROC / PR curves and collect the areas under them
        roc_auc_fold, pr_fold = plot_roc_pr_curves(y_val_kf, val_probas)
        rocs.append(roc_auc_fold)
        prs.append(pr_fold)

        if feat_imp:
            fig, ax = plt.subplots(figsize=(7,5))
            plot_importance(clf, ax=ax, max_num_features=30)

    test_pred /= n_splits
    test_meta[name + '_avg'] = test_pred
    print('_'*100)
    print('#'*24)
    print('####### RESULTS #######')
    print('#'*24)
    # Report the real fold count rather than a hard-coded 5.
    print('Mean Accuracy over {0} folds: {1:.4f}, Std: {2:.4f}'.format(n_splits, np.mean(accuracies), np.std(accuracies)))
    print('Mean AUCROC over {0} folds: {1:.4f}, Std: {2:.4f}'.format(n_splits, np.mean(rocs), np.std(rocs)))
    print('Mean AUPRC over {0} folds: {1:.4f}, Std: {2:.4f}'.format(n_splits, np.mean(prs), np.std(prs)))

In [None]:
def do_total_train_test(df_train, df_test, mapper, clf, mode='original', name='model_1'):
    """Fit on all of ``df_train`` and evaluate once on ``df_test``.

    Prints accuracy and the classification report, plots the confusion
    matrix and the ROC / PR curves, and stores the positive-class test
    probabilities in the module-level ``test_meta[name]``.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that contain the 'Call Order' label column.
    mapper : transformer
        Object exposing ``fit_transform`` and ``transform``.
    clf : classifier
        Object exposing ``fit``, ``predict`` and ``predict_proba``.
    mode : str
        'undersample' or 'oversample' rebalances the training data before
        fitting; any other value leaves it untouched.
    name : str
        Column of ``test_meta`` that receives the test probabilities.
    """
    X_train = df_train.drop(['Call Order'], axis=1)
    y_train = df_train['Call Order']

    X_test = df_test.drop(['Call Order'], axis=1)
    y_test = df_test['Call Order']

    # Honour ``mode`` the same way do_kfold_cv does; it used to be accepted
    # but silently ignored here.
    if mode == 'undersample':
        X_train, y_train = do_undersample(X_train, y_train)
    elif mode == 'oversample':
        X_train, y_train = do_oversample(X_train, y_train)

    print('X_train shape: ', X_train.shape)
    print('X_test shape: ', X_test.shape)

    clf.fit(mapper.fit_transform(X_train), y_train)
    # Transform the test rows once and reuse the result.
    test_features = mapper.transform(X_test)
    pred = clf.predict(test_features)
    probas_ = clf.predict_proba(test_features)
    test_meta.loc[:, name] = probas_[:, 1]
    # Print model metrics
    print('Accuracy: ', accuracy_score(y_test, pred))
    print('Classification Report: ')
    print(classification_report(y_test, pred))
    # Plot confusion matrix
    plot_cm(y_test, pred)
    # Plot the ROC and PR curves
    plot_roc_pr_curves(y_test, probas_)

## LR Model

In [None]:
do_kfold_cv(df_train, df_test, mapper2, LogisticRegression(C=0.1, class_weight='balanced'), name='lr')

## RF Model

In [None]:
do_kfold_cv(df_train, df_test, mapper2, RandomForestClassifier(n_estimators=100, max_depth=3, min_samples_split=10, class_weight='balanced', n_jobs=-1), name='rf')

## XGB Model

In [None]:
# TFIDF encoding
mapper3 = DataFrameMapper([
                ('tokens', tfidf),
            ], df_out=True, default=None)

In [None]:
# TFIDF weighted average of Fasttext trained embeddings
mapper4 = DataFrameMapper([
                ('tokens', TfidfWeightedEmbeddings(embeddings)),
            ], df_out=True, default=None)

In [None]:
# TFIDF weighted average of Fasttext pre-trained embeddings
mapper5 = DataFrameMapper([
                ('tokens', TfidfWeightedEmbeddings(embeddings_pretrained)),
            ], df_out=True, default=None)

In [None]:
# TFIDF weighted average of GloVe pre-trained embeddings
mapper6 = DataFrameMapper([
                ('tokens', TfidfWeightedEmbeddings(embeddings_glove_pretrained)),
            ], df_out=True, default=None)

In [None]:
# TFIDF weighted average of Fasttext trained embeddings and PCA
mapper7 = DataFrameMapper([
                ('tokens', [TfidfWeightedEmbeddings(embeddings), PCA(n_components=20)]),
            ], df_out=True, default=None)

In [None]:
# TFIDF weighted average of GloVe pre-trained embeddings and PCA
mapper8 = DataFrameMapper([
                ('tokens', [TfidfWeightedEmbeddings(embeddings_glove_pretrained), PCA(n_components=20)]),
            ], df_out=True, default=None)

In [None]:
import xgboost as xgb
from sklearn.metrics import roc_auc_score

def do_kfold_cv_xgb(df_train, df_test, mapper, params, num_boost_round, early_stopping_rounds, n_splits=5, 
                    mode='original', verbose_eval=True, feat_imp=True, name='model_xgb'):
    """Run stratified k-fold cross-validation with the native XGBoost API.

    For every fold a booster is trained with early stopping on the validation
    part, then used to predict the validation part and ``df_test``.

    Side effects on the module-level frames:
      * ``train_meta[name]`` receives the out-of-fold predictions.
      * ``test_meta[name + '_fold_<i>']`` receives each fold's test
        predictions and ``test_meta[name + '_avg']`` their mean.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames that contain the 'Call Order' label column.
    mapper : transformer
        Object exposing ``fit_transform`` and ``transform``.
    params : dict
        Booster parameters handed to ``xgb.train``.
    num_boost_round : int
        Upper bound on the number of boosting rounds.
    early_stopping_rounds : int
        Passed to ``xgb.train`` together with the validation set.
    n_splits : int
        Number of stratified folds.
    mode : str
        'undersample' or 'oversample' rebalances the training part of each
        fold; any other value leaves it untouched.
    verbose_eval : bool
        Passed to ``xgb.train``; when false, a short per-fold summary is
        printed instead.
    feat_imp : bool
        When true, plot the booster's feature importance for every fold.
    name : str
        Column name / prefix used in ``train_meta`` and ``test_meta``.

    Returns
    -------
    int
        Mean of the folds' ``best_ntree_limit``, rounded to an integer, for
        use as the round count of a final fit. (The function used to return
        nothing.)
    """
    # No random_state: without shuffle=True the seed has no effect on the
    # splits, and newer scikit-learn versions reject that combination.
    kf = StratifiedKFold(n_splits=n_splits)
    X_train = df_train.drop(['Call Order'], axis=1)
    y_train = df_train['Call Order']

    X_test = df_test.drop(['Call Order'], axis=1)
    y_test = df_test['Call Order']

    test_pred = np.zeros((X_test.shape[0]))

    rocs = []
    prs = []
    best_ntrees = []
    for i, (train_index, val_index) in enumerate(kf.split(X_train, y_train)):
        print('_'*100)
        print('#'*26)
        print('###### Doing Fold {} ######'.format(i))
        print('#'*26)
        X_train_kf, X_val_kf = X_train.iloc[train_index], X_train.iloc[val_index]
        y_train_kf, y_val_kf = y_train.iloc[train_index], y_train.iloc[val_index]
        # Resample only the training part; validation keeps its distribution.
        if mode == 'undersample':
            X_train_kf, y_train_kf = do_undersample(X_train_kf, y_train_kf)
        elif mode == 'oversample':
            X_train_kf, y_train_kf = do_oversample(X_train_kf, y_train_kf)
        print('X_train_kf shape: ', X_train_kf.shape)
        print('X_val_kf shape: ', X_val_kf.shape)

        # The mapper is re-fitted per fold, so the test matrix has to be
        # rebuilt inside the loop as well.
        d_train_kf = xgb.DMatrix(mapper.fit_transform(X_train_kf), label=y_train_kf)
        d_val_kf = xgb.DMatrix(mapper.transform(X_val_kf), label=y_val_kf)
        d_test = xgb.DMatrix(mapper.transform(X_test), label=y_test)

        bst = xgb.train(params, d_train_kf, num_boost_round=num_boost_round,
                        evals=[(d_train_kf, 'train'), (d_val_kf, 'val')],
                        verbose_eval=verbose_eval,
                        early_stopping_rounds=early_stopping_rounds,
                        )

        # Predict with the trees up to the best early-stopping iteration.
        train_pred = bst.predict(d_train_kf, ntree_limit=bst.best_ntree_limit)
        val_pred = bst.predict(d_val_kf, ntree_limit=bst.best_ntree_limit)
        pred = bst.predict(d_test, ntree_limit=bst.best_ntree_limit)
        test_meta[name + '_fold_{}'.format(i)] = pred
        test_pred += pred

        # Out-of-fold predictions, addressed by row label
        # (assumes train_meta shares df_train's index — TODO confirm).
        train_meta.loc[X_val_kf.index, name] = val_pred
        # Print model metrics
        if not verbose_eval:
            print('Best tree limit: ', bst.best_ntree_limit)
            print('Train AUROC: {}'.format(roc_auc_score(y_train_kf, train_pred)),
                  'Val AUROC: {}'.format(roc_auc_score(y_val_kf, val_pred)))

        roc_auc_fold, pr_fold = plot_roc_pr_curves(y_val_kf, val_pred, xgb_model=True)
        rocs.append(roc_auc_fold)
        prs.append(pr_fold)
        best_ntrees.append(bst.best_ntree_limit)

        if feat_imp:
            fig, ax = plt.subplots(figsize=(7,5))
            plot_importance(bst, ax=ax, max_num_features=30)

    test_pred /= n_splits
    test_meta[name + '_avg'] = test_pred
    print('_'*100)
    print('#'*24)
    print('####### RESULTS #######')
    print('#'*24)
    # Report the real fold count rather than a hard-coded 5.
    print('Mean AUCROC over {0} folds: {1:.4f}, Std: {2:.4f}'.format(n_splits, np.mean(rocs), np.std(rocs)))
    print('Mean AUPRC over {0} folds: {1:.4f}, Std: {2:.4f}'.format(n_splits, np.mean(prs), np.std(prs)))
    print('Best tree limits: {}, Mean: {:.4f}'.format(best_ntrees, np.mean(best_ntrees)))

    return int(round(np.mean(best_ntrees)))

## FastText embeddings

In [None]:
# FastText trained and PCA
params = {
    'booster':'gbtree',
    'eta':0.000001,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread':4,
    'min_child_weight': 30,
    'subsample': 0.8,
    'max_depth':3,
    'lambda':2
#     'scale_pos_weight':7804.0/1150.0
}

num_boost_round = 2000
early_stopping_rounds = 50

# df_test was missing from this call: every positional argument was shifted by
# one and the call failed with a TypeError (early_stopping_rounds unfilled).
do_kfold_cv_xgb(df_train, df_test, mapper7, params, num_boost_round, early_stopping_rounds, name='xgb_fasttext_pca')

In [None]:
# GloVe and PCA
# Booster settings handed to xgb.train by do_kfold_cv_xgb.
params = {
    'booster':'gbtree',
    'eta':0.000001,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread':4,
    'min_child_weight': 30,
    'subsample': 0.8,
    'max_depth':3,
    'lambda':2
#     'scale_pos_weight':7804.0/1150.0
}

num_boost_round = 2000
early_stopping_rounds = 50
    
# name='xgb' makes this run fill the 'xgb' / 'xgb_fold_<i>' / 'xgb_avg' columns
# that the stacking cells read.
do_kfold_cv_xgb(df_train, df_test, mapper8, params, num_boost_round, early_stopping_rounds, name='xgb')

In [None]:
# NOTE(review): do_total_train_test_xgb is not defined in the visible part of
# this notebook (presumably expected from `from clustering import *` — confirm),
# and best_num_boost_round is only assigned by a later cell.
do_total_train_test_xgb(df_train, df_test, mapper8, params, best_num_boost_round, name='xgb')

## FastText pre-trained embeddings

In [None]:
params = {
    'booster':'gbtree',
    'eta':0.000001,
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'nthread':4,
    'min_child_weight': 30,
    'subsample': 0.8,
    'max_depth':3,
    'lambda':2
#     'scale_pos_weight':7804.0/1150.0
}

num_boost_round = 2000
early_stopping_rounds = 50

# df_test was missing from this call: every positional argument was shifted by
# one and the call failed with a TypeError (early_stopping_rounds unfilled).
best_num_boost_round = do_kfold_cv_xgb(df_train, df_test, mapper5, params, num_boost_round, early_stopping_rounds)

In [None]:
# NOTE(review): do_total_train_test takes (df_train, df_test, mapper, clf, mode, name).
# Here the XGBoost `params` dict lands in `clf` and best_num_boost_round in `mode`,
# so clf.fit cannot work. Presumably an XGBoost-specific helper was intended — confirm.
do_total_train_test(df_train, df_test, mapper5, params, best_num_boost_round, name='xgb_model_2')

# Stacking

In [None]:
df_train_stacked = train_meta[['xgb', 'lr', 'rf', 'Call Order']]

In [None]:
df_train_stacked.head()

In [None]:
df_test_stacked = test_meta[['xgb_avg', 'lr_avg', 'rf_avg', 'Call Order']]

In [None]:
df_test_stacked.columns = ['xgb', 'lr', 'rf', 'Call Order']

In [None]:
df_test_stacked.head()

In [None]:
mapper_identity = DataFrameMapper([], df_out=True, default=None)

In [None]:
roc_auc_score(df_test_stacked['Call Order'], df_test_stacked['lr'])

In [None]:
roc_auc_score(df_test_stacked['Call Order'], df_test_stacked['rf'])

In [None]:
roc_auc_score(df_test_stacked['Call Order'], df_test_stacked['xgb'])

In [None]:
do_total_train_test(df_train_stacked, df_test_stacked, mapper_identity, LogisticRegression(), name='stacked_lr_xgb')

In [None]:
# Pick the averaged and per-fold test predictions of the three base models by
# name. The previous positional slice (iloc[:, -19:-1]) silently selected
# different columns whenever another cell had appended columns to test_meta.
stacked_test_columns = [
    '{}_{}'.format(model, suffix)
    for model in ('xgb', 'lr', 'rf')
    for suffix in ['avg'] + ['fold_{}'.format(i) for i in range(5)]
]
all_test_stacked = test_meta[stacked_test_columns]

In [None]:
df_train_stacked.to_csv('../datasets/df_train_stacking_orig.csv', index=False)
all_test_stacked.to_csv('../datasets/df_test_stacking_orig.csv', index=False)

### Developed by Data Science Elite Team, IBM Analytics:
- Vinay Rao Dandin, Data Scientist

#### Copyright (c) 2018 IBM Corporation