# Ensemble models

## Feature variables:

predictions from:

    - XGBoost (review metadata + historical performance)
    - Random Forest (review metadata + historical performance)
    - BoW
    - BERT
    
## Target variable:

    - whether the product is successful after 1 year
    
## Models:

    - logistic regression 
    - decision tree 
    


In [None]:
import boto3
import pandas as pd
import numpy as np
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score, f1_score, roc_auc_score, precision_score, recall_score, roc_curve, accuracy_score
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [None]:
# Build the S3 client from boto3's default credential chain (environment
# variables, the shared ~/.aws/credentials file, or an attached IAM role) so
# that no secret is stored in the notebook.
# SECURITY: an access key pair used to be hard-coded in this cell. Any key that
# was ever committed must be treated as compromised and rotated in AWS IAM.
# The old chained assignment also overwrote the `boto3.session` module
# attribute with a Session instance; that side effect is dropped here.
current_session = boto3.Session()
s3_client = current_session.client('s3')

def download_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Download one S3 object into a local file.

    Args:
        file_path_on_s3_bucket: Key of the object inside the bucket.
        path_to_file_on_local: Destination path. It is opened in binary write
            mode, so an existing file at that path is overwritten.
        bucket_name: Name of the bucket to read from.
        s3_client: Object exposing ``download_fileobj``; defaults to the
            client created earlier in the notebook.

    Returns:
        True once the download has completed.

    Raises:
        Any exception raised by ``open`` or ``s3_client.download_fileobj``.
        When the download itself fails, the partially written local file is
        removed before the exception propagates.
    """
    import os  # local import keeps this block self-contained

    with open(path_to_file_on_local, 'wb') as f:
        try:
            s3_client.download_fileobj(bucket_name, file_path_on_s3_bucket, f)
        except Exception:
            # Do not leave a truncated file behind: a later pd.read_pickle on
            # it would fail with a misleading unpickling error.
            f.close()
            os.remove(path_to_file_on_local)
            raise
    return True

def upload_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Upload a local file to S3.

    Args:
        file_path_on_s3_bucket: Key under which the object is stored.
        path_to_file_on_local: Path of the file to send.
        bucket_name: Name of the destination bucket.
        s3_client: Object exposing ``upload_file``; defaults to the client
            created earlier in the notebook.

    Returns:
        True once ``upload_file`` has returned.
    """
    # positional order expected by upload_file: local file, bucket, key
    transfer_args = (path_to_file_on_local, bucket_name, file_path_on_s3_bucket)
    s3_client.upload_file(*transfer_args)
    return True

def get_object(file_path_on_s3_bucket, bucket_name="ac297r", s3_client=s3_client):
    """Fetch an S3 object and return the client's response unchanged.

    Args:
        file_path_on_s3_bucket: Key of the object inside the bucket.
        bucket_name: Name of the bucket to read from.
        s3_client: Object exposing ``get_object``; defaults to the client
            created earlier in the notebook.

    Returns:
        Whatever ``s3_client.get_object`` returns for the bucket/key pair.
    """
    request = {'Bucket': bucket_name, 'Key': file_path_on_s3_bucket}
    return s3_client.get_object(**request)

In [None]:
! rm /home/ubuntu/data/*

data = "/home/ubuntu/data"
download_object('clean/product_sample_long_term.pickle', 
                '/home/ubuntu/data/product_sample_long_term.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('Predictions/nontext_res_df.pickle', 
                f'{data}/nontext_res_df.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('Predictions/bow_res_df.pickle', 
                f'{data}/bow_res_df.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('Predictions/bert_res_df.pickle', 
                f'{data}/bert_res_df.pickle', bucket_name='ac297r', s3_client=s3_client)

In [None]:
# load predictions from the four base models
# (XGBoost and Random Forest come from the same results file)
nontext = pd.read_pickle(f'{data}/nontext_res_df.pickle')
xgboost = nontext[['asin','label_after_1_yr_period_12_mo_min_bsr',
                   'y_xgboost_predict','y_xgboost_predict_probas']].copy()
rforest = nontext[['asin','label_after_1_yr_period_12_mo_min_bsr',
                   'y_forest_predict','y_forest_predict_probas']].copy()
del nontext
xgboost.rename(columns={'label_after_1_yr_period_12_mo_min_bsr': 'true_label',
                         'y_xgboost_predict': 'xgb_preds',
                         'y_xgboost_predict_probas': 'xgb_probas'}, inplace=True)
print(xgboost.shape)
rforest.rename(columns={'label_after_1_yr_period_12_mo_min_bsr': 'true_label',
                         'y_forest_predict': 'rfr_preds',
                         'y_forest_predict_probas': 'rfr_probas'}, inplace=True)
# fixed: this used to print xgboost.shape a second time
print(rforest.shape)


bow = pd.read_pickle(f'{data}/bow_res_df.pickle')
bow.rename(columns={'label_after_1_yr_period_12_mo_min_bsr': 'true_label',
                         'preds': 'bow_preds',
                         'probas': 'bow_probas'}, inplace=True)
print(bow.shape)

ber = pd.read_pickle(f'{data}/bert_res_df.pickle')
ber.rename(columns={'y_true': 'true_label',
                         'prediction': 'ber_probas'}, inplace=True)
# the BERT file only carries a score, so derive the hard label with a 0.5 cut-off
ber['ber_preds'] = np.where(ber['ber_probas'] >= 0.5, 1, 0)
print(ber.shape)

# combine the four models; inner joins keep only products scored by every
# model with the same true label
df = (rforest
      .merge(xgboost, how='inner', on=['asin', 'true_label'])
      .merge(bow, how='inner', on=['asin', 'true_label'])
      .merge(ber, how='inner', on=['asin', 'true_label']))
print(df.shape)

In [None]:
# preview the first five rows of the merged prediction table
df.head(n=5)

In [None]:
# label observations as training, validation and testing set
sample_prod = pd.read_pickle(f'{data}/product_sample_long_term.pickle')
df['in_train'] = df['asin'].isin(sample_prod['train'])
df['in_validation'] = df['asin'].isin(sample_prod['val'])
df['in_test'] = df['asin'].isin(sample_prod['test'])
del sample_prod
# Count split memberships as integers. Adding boolean Series with `+` is a
# logical OR, so the old boolean sum could never exceed 1 and silently
# accepted products that appear in more than one split.
n_splits = df[['in_train', 'in_validation', 'in_test']].sum(axis=1)
assert (n_splits == 1).all(), 'every asin must be in exactly one of train / val / test'
df['test_train'] = np.where(df['in_train'], 'train',
                            np.where(df['in_validation'], 'validation', 'test'))

### ensemble model

1. Take the average of the predicted probabilities of the three models as the predicted probability of the ensemble model. The prediction is 1 when the average predicted probability is >= 0.5.

2. Generate the prediction by majority rule: if two of the three models say 1, then the ensemble prediction is 1. Then generate the predicted probability as the average of the two "correct" models.

3. Fit a linear regression to get weights on the three predicted probabilities.

In [None]:
# Only the validation split is needed to fit the stacking models; the
# train/test frames are built further down, once the ensemble columns exist.
val_df = df.query('test_train == "validation"').copy()

# fit logit regression on the predicted probas
print('''
Logistic Regression
''')
clf = linear_model.LogisticRegression(max_iter=10000)
model_cols = ['bow_probas', 'ber_probas', 'xgb_probas', 'rfr_probas']
clf.fit(X=val_df[model_cols], y=val_df['true_label'])
for model, coef in zip(model_cols, clf.coef_.flatten()):
    print('weight on', model, ':\t',  coef)
# score every row (all splits) with the fitted logistic ensemble
df['proba_lgt'] = clf.predict_proba(df[model_cols])[:,1]
df['pred_lgt'] = np.where(df['proba_lgt'] >=0.5, 1, 0)

print('''
Decision Tree
''')
# fit decision tree on the predicted probas
# NOTE(review): no random_state is set, so tie-breaking between equally good
# splits may differ from run to run
clf = DecisionTreeClassifier(max_depth=5)
clf.fit(X=val_df[model_cols], y=val_df['true_label'])
# for the tree the printed "weight" is the feature importance, not a coefficient
for model, coef in zip(model_cols, clf.feature_importances_.flatten()):
    print('weight on', model, ':\t',  coef)
df['proba_tre'] = clf.predict_proba(df[model_cols])[:,1]
df['pred_tre'] = np.where(df['proba_tre'] >=0.5, 1, 0)

# split into train / validation / test now that df carries the ensemble
# predictions; full_train_df stacks train and validation together
train_df = df.query('test_train == "train"').copy()
val_df = df.query('test_train == "validation"').copy()
full_train_df = pd.concat([train_df, val_df], axis=0).reset_index(drop=True)
test_df = df.query('test_train == "test"').copy()

In [None]:
# performance of the two ensembles and the four base models on each split

def get_name(key):
    """Return the display name used in reports and plot titles for a prediction column.

    Raises KeyError when `key` is not one of the known prediction columns.
    """
    display_names = dict(
        pred_lgt='Ensemble (logistics)',
        pred_tre='Ensemble (tree)',
        ber_preds='Bert',
        bow_preds='BoW',
        xgb_preds='XGB',
        rfr_preds='RF',
    )
    return display_names[key]

print('''
Training set
''')
# classification metrics of every model on the training split
y_true = train_df['true_label']
for prediction in ('pred_lgt', 'pred_tre', 'ber_preds', 'bow_preds', 'xgb_preds', 'rfr_preds'):
    y_hat = train_df[prediction]
    # the probability column shares the prediction column's name with 'pred' -> 'proba'
    y_score = train_df[prediction.replace('pred', 'proba')]

    f1, acc, precision, recall = (
        np.round(metric(y_true, y_hat), 4)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    auc = np.round(roc_auc_score(y_true, y_score), 4)

    print('-----------------------------------', get_name(prediction))
    report = [('f1\t\t', f1),
              ('\naccuracy\t', acc),
              ('\nprecision\t', precision),
              ('\nrecall\t\t', recall),
              ('\nauc\t\t', auc)]
    print(*(item for pair in report for item in pair))


In [None]:
print('''
Validation set
''')
# classification metrics of every model on the validation split
y_true = val_df['true_label']
for prediction in ('pred_lgt', 'pred_tre', 'ber_preds', 'bow_preds', 'xgb_preds', 'rfr_preds'):
    y_hat = val_df[prediction]
    # the probability column shares the prediction column's name with 'pred' -> 'proba'
    y_score = val_df[prediction.replace('pred', 'proba')]

    f1, acc, precision, recall = (
        np.round(metric(y_true, y_hat), 4)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    auc = np.round(roc_auc_score(y_true, y_score), 4)

    print('-----------------------------------', get_name(prediction))
    report = [('f1\t\t', f1),
              ('\naccuracy\t', acc),
              ('\nprecision\t', precision),
              ('\nrecall\t\t', recall),
              ('\nauc\t\t', auc)]
    print(*(item for pair in report for item in pair))


In [None]:
print('''
Test set
''')
# classification metrics of every model on the test split
y_true = test_df['true_label']
for prediction in ('pred_lgt', 'pred_tre', 'ber_preds', 'bow_preds', 'xgb_preds', 'rfr_preds'):
    y_hat = test_df[prediction]
    # the probability column shares the prediction column's name with 'pred' -> 'proba'
    y_score = test_df[prediction.replace('pred', 'proba')]

    f1, acc, precision, recall = (
        np.round(metric(y_true, y_hat), 4)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    auc = np.round(roc_auc_score(y_true, y_score), 4)

    print('-----------------------------------', get_name(prediction))
    report = [('f1\t\t', f1),
              ('\naccuracy\t', acc),
              ('\nprecision\t', precision),
              ('\nrecall\t\t', recall),
              ('\nauc\t\t', auc)]
    print(*(item for pair in report for item in pair))


In [None]:
# number of positive predictions of ensemble vs. BERT (test split)
lgt_is_1 = test_df['pred_lgt'] == 1
tre_is_1 = test_df['pred_tre'] == 1
ber_is_1 = test_df['ber_preds'] == 1
label_is_1 = test_df['true_label'] == 1
label_is_0 = test_df['true_label'] == 0

print('# positive predictions of ensemble (lgt)', test_df['pred_lgt'].sum())
print('# positive predictions of ensemble (tree)', test_df['pred_tre'].sum())
print('# positive predictions of bert', test_df['ber_preds'].sum())

# true vs. false positives of the tree ensemble
print('\nAmong ensemble tree 1s:')
print('# correct 1s:', (tre_is_1 & label_is_1).sum())
print('# incorrect 1s:', (tre_is_1 & label_is_0).sum())


# true vs. false positives of the logistic ensemble
print('\nAmong ensemble logistics 1s:')
print('# correct 1s:', (lgt_is_1 & label_is_1).sum())
print('# incorrect 1s:', (lgt_is_1 & label_is_0).sum())

# overlap between BERT positives and each ensemble's positives
print('\nAmong the bert 1s:')
print('# ensemble lgt 1s:', (lgt_is_1 & ber_is_1).sum())
print('# ensemble tree 1s:', (tre_is_1 & ber_is_1).sum())


In [None]:
def plot_roc(df, probas, axs, title='', legend_label='ROC curve', legend=True,):
    """Draw one ROC curve on `axs`.

    Args:
        df: Frame holding a 'true_label' column and the score column.
        probas: Name of the column with the predicted scores.
        axs: Matplotlib axes to draw on.
        title: Title set on the axes.
        legend_label: Legend entry for this curve.
        legend: When true, (re)draw the legend to the right of the axes.
    """
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(df['true_label'], df[probas])

    axs.plot(false_pos_rate, true_pos_rate, label=legend_label)
    # dashed diagonal as the chance-level reference
    axs.plot([0, 1], [0, 1], 'k--')
    axs.set_ylabel('True Positive Rate')
    axs.set_xlabel('False Positive Rate')
    axs.set_title(title)
    if not legend:
        return
    axs.legend(loc='center left', bbox_to_anchor=(1, 0.3))


In [None]:
# sizes of the training and test splits
for split_frame in (train_df, test_df):
    print(split_frame.shape)

In [None]:
# (score column, legend label) for every curve, in drawing order
roc_series = [('proba_lgt', 'ensemble (logistic)'),
              ('proba_tre', 'ensemble (tree)'),
              ('bow_probas', 'BoW'),
              ('ber_probas', 'BERT'),
              ('xgb_probas', 'XGB'),
              ('rfr_probas', 'RF')]

# one figure per data set; 'Train ROC' is drawn on full_train_df (train + validation)
for roc_frame, roc_title in ((test_df, 'Test ROC'), (full_train_df, 'Train ROC')):
    f, axs = plt.subplots(1, 1, figsize=(4, 3))
    for probas_col, curve_label in roc_series:
        plot_roc(roc_frame, probas_col, axs=axs, title=roc_title,
                 legend_label=curve_label, legend=True)

## Compare different models

In [None]:
# overlay the test-set score distributions of three base models
f, axs = plt.subplots(1, 1, figsize=(4, 3))
for column, n_bins, colour, model_label in (
    ('bow_probas', 30, 'maroon', 'BoW'),
    ('ber_probas', 20, 'forestgreen', 'Bert'),
    ('xgb_probas', 30, 'royalblue', 'XGB'),
):
    axs.hist(test_df[column], bins=n_bins, alpha=0.4, color=colour, label=model_label)
axs.legend()
axs.set(title='Histogram of Predicted Probabilities',
        xlabel='predicted probability',
        ylabel='frequency');

In [None]:
def binsreg_plot(df, x, y, nbins=10, title='Binscatter', xlabel='x', ylabel='y', ax=None):
    """Draw a binned scatter of `y` against `x` with a 45-degree reference line.

    Args:
        df: Frame holding the `x` and `y` columns.
        x: Name of the column on the horizontal axis.
        y: Name of the column on the vertical axis.
        nbins: Passed to seaborn's ``regplot`` as ``x_bins``.
        title: Axes title.
        xlabel: Label of the horizontal axis.
        ylabel: Label of the vertical axis.
        ax: Matplotlib axes to draw on. Defaults to the current axes, i.e. the
            one created by the most recent ``plt.subplots`` call. The function
            used to depend on a global named ``axs`` instead.
    """
    if ax is None:
        ax = plt.gca()
    sns.regplot(data=df, x=x, y=y,
                fit_reg=False, x_bins=nbins, label='binscatter',
                scatter_kws={"s": 40}, ci=95,
                ax=ax)
    # both axes are clipped to [0, 1], so the diagonal marks where x == y
    ax.plot([0, 1], [0, 1], color='k', label='45 degree line')
    ax.legend(loc=2)
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.set(title=title, xlabel=xlabel, ylabel=ylabel)

In [None]:
# pairwise binscatters of the base models' test-set probabilities
for x_col, y_col, x_name, y_name, plot_title in (
    ('bow_probas', 'ber_probas', 'BoW', 'Bert', 'Bert vs. BoW'),
    ('bow_probas', 'xgb_probas', 'BoW', 'XGBoost', 'XGBoost vs. BoW'),
    ('ber_probas', 'xgb_probas', 'BERT', 'XGBoost', 'XGBoost vs. BERT'),
):
    # a fresh figure per pair, bound to `axs` as in the rest of the notebook
    f, axs = plt.subplots(1, 1, figsize=(4, 3))
    binsreg_plot(df=test_df, x=x_col, y=y_col,
                 nbins=20, title=plot_title, xlabel=x_name, ylabel=y_name)

In [None]:
# binscatters of the logistic ensemble against each base model (test set)
for base_col, base_name, plot_title in (
    ('xgb_probas', 'XGBoost', 'Ensemble vs. XGBoost'),
    ('bow_probas', 'BoW', 'Ensemble vs. BoW'),
    ('ber_probas', 'Bert', 'Ensemble vs. Bert'),
):
    # a fresh figure per model, bound to `axs` as in the rest of the notebook
    f, axs = plt.subplots(1, 1, figsize=(4, 3))
    binsreg_plot(df=test_df, x=base_col, y='proba_lgt',
                 nbins=20, title=plot_title, xlabel=base_name, ylabel='Ensemble')

## Look at reviews that are misclassified

In [None]:
# fetch the table that carries the BSR and review-text columns merged in below;
# the destination is derived from `data` instead of a second hard-coded path
download_object('clean/prod_level_bsr_rev.pickle',
                f'{data}/prod_level_bsr_rev.pickle', bucket_name='ac297r', s3_client=s3_client)
raw = pd.read_pickle(f'{data}/prod_level_bsr_rev.pickle')

In [None]:
# attach the raw product columns to every prediction row and store the result
ensemble_path = f'{data}/ensemble_res_df.pickle'
full_df = df.merge(right=raw, on='asin', how='left')
full_df.to_pickle(ensemble_path)

In [None]:
# upload to S3 for downstream analysis
upload_object(file_path_on_s3_bucket='Predictions/ensemble_res_df.pickle',
              path_to_file_on_local=f'{data}/ensemble_res_df.pickle',
              bucket_name='ac297r',
              s3_client=s3_client)

In [None]:
# keep only the columns needed for the error analysis, then free the big table
review_cols = ['asin', 'after_1_yr_period_12_mo_min_bsr', 'review_text_3_mo']
test_df_w_rev = test_df.merge(right=raw[review_cols], on='asin', how='left')
del raw

In [None]:
def dist_of_incorret_vs_correct_preds(df, prediction):
    """Describe and plot the min-BSR distribution of wrong vs. right predictions.

    Prints the largest 'after_1_yr_period_12_mo_min_bsr' among rows predicted
    as 1, then summary statistics of that column for the incorrectly and the
    correctly predicted rows, and draws one histogram for each group.

    Args:
        df: Frame with 'true_label', the `prediction` column and
            'after_1_yr_period_12_mo_min_bsr'.
        prediction: Name of the prediction column; must be known to `get_name`.
    """
    model = get_name(prediction)
    bsr_col = 'after_1_yr_period_12_mo_min_bsr'
    # largest min BSR among the rows predicted positive
    print((df.query(f'{prediction}==1')[bsr_col]).max())

    # fixed: the two selections were swapped, so matching rows were reported as
    # "incorrect" and mismatching rows as "correct"
    is_correct = df['true_label'] == df[prediction]
    inc_min_bsr = df.loc[~is_correct, bsr_col]
    print('incorrect min bsr:\n\n', inc_min_bsr.describe())
    cor_min_bsr = df.loc[is_correct, bsr_col]
    print('\n\ncorrect min bsr:\n\n', cor_min_bsr.describe())

    f, axs = plt.subplots(1, 2, figsize=(8, 3))
    axs[0].hist(inc_min_bsr, bins=30, alpha=0.4, color='maroon')
    axs[0].set(title=f'incorrect ({model})')
    axs[1].hist(cor_min_bsr, bins=20, alpha=0.4, color='forestgreen')
    axs[1].set(title=f'correct ({model})')

In [None]:
# min-BSR distributions by prediction outcome: logistic ensemble
dist_of_incorret_vs_correct_preds(df=test_df_w_rev, prediction='pred_lgt')

In [None]:
# min-BSR distributions by prediction outcome: BERT
dist_of_incorret_vs_correct_preds(df=test_df_w_rev, prediction='ber_preds')

In [None]:
# min-BSR distributions by prediction outcome: BoW
dist_of_incorret_vs_correct_preds(df=test_df_w_rev, prediction='bow_preds')

In [None]:
# min-BSR distributions by prediction outcome: XGBoost
dist_of_incorret_vs_correct_preds(df=test_df_w_rev, prediction='xgb_preds')

## Look at reviews that are misclassified


In [None]:
# among truly successful products, compare the min BSR of those BERT missed
# with those it caught
bsr_col = 'after_1_yr_period_12_mo_min_bsr'
bert_missed = test_df_w_rev.query('ber_preds==0 & true_label==1')[bsr_col]
bert_caught = test_df_w_rev.query('ber_preds==1 & true_label==1')[bsr_col]

plt.hist(bert_missed, bins=10, alpha=0.8, label='BERT = 0, true label = 1')
plt.hist(bert_caught, color='purple', bins=10, alpha=0.5, label='BERT = 1, true label = 1')
plt.legend()
plt.xlabel('min BSR in the 1 year period after initial year')
plt.ylabel('frequency');