# BoW interpretation of BERT

This notebook uses a bag-of-words (BoW) model to analyze the reviews of products that BERT predicts to be successful or unsuccessful.

In [None]:
import boto3
import pandas as pd
import numpy as np
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import r2_score, f1_score, roc_auc_score, precision_score, recall_score, roc_curve
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [None]:
# Credentials are intentionally not embedded in the notebook. boto3 resolves
# them through its default chain (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY
# environment variables, the shared ~/.aws/credentials file, or an instance role).
# NOTE(security): any access key that has ever been committed alongside this
# notebook must be treated as leaked and rotated.
# The session is bound to its own name only; assigning it to `boto3.session`
# would clobber the `boto3.session` submodule attribute.
current_session = boto3.Session()
s3_client = current_session.client('s3')

def download_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Download one S3 object into a local file.

    Args:
        file_path_on_s3_bucket: Key of the object inside the bucket.
        path_to_file_on_local: Destination path; opened in binary write mode.
        bucket_name: Bucket to read from.
        s3_client: Client object used to perform the download.

    Returns:
        True once the download call has returned.
    """
    with open(path_to_file_on_local, 'wb') as local_file:
        s3_client.download_fileobj(
            Bucket=bucket_name,
            Key=file_path_on_s3_bucket,
            Fileobj=local_file,
        )
    return True

def upload_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Upload a local file to S3.

    Args:
        file_path_on_s3_bucket: Key under which the object is stored.
        path_to_file_on_local: Path of the local file to send.
        bucket_name: Bucket to write to.
        s3_client: Client object used to perform the upload.

    Returns:
        True once the upload call has returned.
    """
    s3_client.upload_file(
        Filename=path_to_file_on_local,
        Bucket=bucket_name,
        Key=file_path_on_s3_bucket,
    )
    return True

def get_object(file_path_on_s3_bucket, bucket_name="ac297r", s3_client=s3_client):
    """Fetch an S3 object and return the client's raw ``get_object`` response.

    Args:
        file_path_on_s3_bucket: Key of the object inside the bucket.
        bucket_name: Bucket to read from.
        s3_client: Client object used to perform the request.
    """
    response = s3_client.get_object(
        Key=file_path_on_s3_bucket,
        Bucket=bucket_name,
    )
    return response

In [None]:
# Local directory that receives the downloaded artifacts.
data = "/home/ubuntu/data"

# Pull the prediction frame from S3, then load it from the local copy.
# NOTE(review): unpickling can execute arbitrary code -- only load pickles
# that come from a trusted bucket.
download_object(
    file_path_on_s3_bucket='Predictions/ensemble_res_df.pickle',
    path_to_file_on_local=f'{data}/ensemble_res_df.pickle',
    bucket_name='ac297r',
    s3_client=s3_client,
)
df = pd.read_pickle(f'{data}/ensemble_res_df.pickle')

In [None]:
def get_word_freq(subdf, review_col):
    """Return the average per-row count of the most frequent words in a text column.

    Parameters
    ----------
    subdf : pandas.DataFrame
        Rows to analyse. The frame is only read; no column is added to it.
    review_col : str
        Column holding the text. ``'product_name'`` is used as-is; any other
        column is first passed through ``.str.join(" ")`` (presumably each cell
        is a list of review strings -- confirm against the data).

    Returns
    -------
    pandas.Series or None
        Mean count per row of each retained unigram (at most 100, English stop
        words removed), indexed by word and sorted from most to least frequent.
        ``None`` is returned, after printing ``'empty selection'``, when
        ``subdf`` has no rows.
    """
    if subdf.shape[0] == 0:
        print('empty selection')
        return None

    # Build the documents in a local Series instead of writing a
    # 'review_text' column back into the caller's frame.
    if review_col == 'product_name':
        texts = subdf[review_col]
    else:
        texts = subdf[review_col].str.join(" ")

    vectorizer = CountVectorizer(ngram_range=(1, 1), stop_words='english', max_features=100)
    # Single pass over the corpus; the result stays sparse.
    counts = vectorizer.fit_transform(texts)

    # Column sums of the integer counts divided by the number of rows gives
    # the column mean without materialising a dense matrix.
    ave_counts = pd.Series(
        np.asarray(counts.sum(axis=0)).ravel() / counts.shape[0],
        index=vectorizer.get_feature_names_out(),
    )
    return ave_counts.sort_values(ascending=False)

## Look at BERT's positive/negative predictions

In [None]:
# Word frequencies for the rows BERT labels 1 and the rows it labels 0.
ber_pos = get_word_freq(subdf=df.query('ber_preds==1').copy(), review_col='review_text_3_mo')
ber_pos_words = ber_pos.index
ber_neg = get_word_freq(subdf=df.query('ber_preds==0').copy(), review_col='review_text_3_mo')
ber_neg_words = ber_neg.index

# Vocabulary that appears on one side only.
neg_minus_pos = list(set(ber_neg_words).difference(ber_pos_words))
pos_minus_neg = list(set(ber_pos_words).difference(ber_neg_words))

print('words in negative predictions but not in positive predictions:')
print(neg_minus_pos)

print('\n\nwords in posituve predictions but not in negative predictions:')
print(pos_minus_neg)

In [None]:
# Order each one-sided vocabulary from most to least frequent. A stable
# descending sort keeps tied words in their original relative order.
temp = sorted(neg_minus_pos, key=lambda word: ber_neg[word], reverse=True)
neg_minus_pos_freq = {word: ber_neg[word] for word in temp}

temp = sorted(pos_minus_neg, key=lambda word: ber_pos[word], reverse=True)
pos_minus_neg_freq = {word: ber_pos[word] for word in temp}

# Bar chart: words seen only in the negative predictions.
f, axs = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
axs.bar(x=list(neg_minus_pos_freq), height=list(neg_minus_pos_freq.values()))
plt.xticks(rotation=45)
axs.set_title('Words in Negative but not Positive Predcitions')
axs.set_ylabel('average frequency');

# Bar chart: words seen only in the positive predictions.
f, axs = plt.subplots(nrows=1, ncols=1, figsize=(4, 3))
axs.bar(x=list(pos_minus_neg_freq), height=list(pos_minus_neg_freq.values()))
plt.xticks(rotation=30)
axs.set_title('Words in Positive but not Negative Predcitions')
axs.set_ylabel('average frequency');

## Looking at TP, TN, FP, FN

### review text

In [None]:
# Word frequencies of the review text per confusion-matrix cell, evaluated in
# the order TP, TN, FN, FP.
tp, tn, fn, fp = (
    get_word_freq(df.query(cell_filter).copy(), review_col='review_text_3_mo')
    for cell_filter in (
        'ber_preds==1 & true_label==1',
        'ber_preds==0 & true_label==0',
        'ber_preds==0 & true_label==1',
        'ber_preds==1 & true_label==0',
    )
)

In [None]:
print('\nreviews that BERT thinks is negative\n')

# Among the rows predicted negative: words found for one outcome only.
fn_minus_tn = set(fn.index).difference(tn.index)
print('Words in Bert FN but not in TN (BERT got wrong):', fn_minus_tn, sep='\n')

tn_minus_fn = set(tn.index).difference(fn.index)
print('Words in Bert TN but not in FN (BERT got right):', tn_minus_fn, sep='\n')

In [None]:
print('\nreviews of positive products\n')

# Among the rows whose true label is 1: words found for one outcome only.
tp_minus_fn = set(tp.index).difference(fn.index)
print('Words in Bert TP but not in FN (BERT got right):', tp_minus_fn, sep='\n')

fn_minus_tp = set(fn.index).difference(tp.index)
print('\n\nWords in Bert FN but not in TP (BERT got wrong):', fn_minus_tp, sep='\n')

### product names

In [None]:
# Word frequencies of the product names per confusion-matrix cell, evaluated
# in the order TP, TN, FN, FP.
tp, tn, fn, fp = (
    get_word_freq(df.query(cell_filter).copy(), review_col='product_name')
    for cell_filter in (
        'ber_preds==1 & true_label==1',
        'ber_preds==0 & true_label==0',
        'ber_preds==0 & true_label==1',
        'ber_preds==1 & true_label==0',
    )
)

In [None]:
print('\nproducts names that BERT thinks is negative\n')

# Among the rows predicted negative: words found for one outcome only.
fn_minus_tn = set(fn.index).difference(tn.index)
print('Words in Bert FN but not in TN:', fn_minus_tn, sep='\n')

tn_minus_fn = set(tn.index).difference(fn.index)
print('\nWords in Bert TN but not in FN:', tn_minus_fn, sep='\n')

In [None]:
# Header corrected: tp / fn in this section are built from the 'product_name'
# column, so the output describes product names rather than reviews.
print('''
product names of positive products
''')

# Among the rows whose true label is 1: words found for one outcome only.
tp_minus_fn = set(tp.index).difference(fn.index)
print('Words in Bert TP but not in FN:')
print(tp_minus_fn)

fn_minus_tp = set(fn.index).difference(tp.index)
print('\n\nWords in Bert FN but not in TP:')
print(fn_minus_tp)