### Model description 

Three versions of models:

1. Target Variable: 
    - change in monthly BSR
   Feature Variable:
    - word count of reviews in the previous month
    
2. Target Variable:
    - monthly BSR
   Feature Variables:
    - word counts averaged over *all* review months up to and including the previous month
    
3. Target Variable:
    - monthly sales
   Feature Variables:
    - word counts averaged over *all* review months up to and including the previous month
    
In all three cases, 

- build bag-of-words features (raw counts or TF-IDF) from the 500 most common tri-grams/bi-grams in the training set.
- run LASSO/Ridge regressions on those 500 features

    
Training set:

    - 2836 products (1/3 of all products in the dataset)
    - 68559 month-product pairs
    
Testing set:

    - 945 products (1/3 of the size of training set)
    - 24340 month-product pairs

In [1]:
import boto3
import pandas as pd
import numpy as np
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from sklearn.metrics import r2_score
from tqdm.auto import tqdm
from matplotlib import pyplot as plt
import seaborn as sns
import pickle

In [2]:
# Build the S3 client from boto3's default credential provider chain
# (AWS_ACCESS_KEY_ID / AWS_SECRET_ACCESS_KEY environment variables, the shared
# ~/.aws/credentials file, or an attached IAM role) instead of embedding a key
# pair in the notebook.
# SECURITY: the access key that used to be hardcoded in this cell has been
# exposed to everyone with access to the notebook and must be rotated/revoked.
# The session is bound to a plain variable only; the earlier chained assignment
# also overwrote the `boto3.session` module attribute with a Session instance.
current_session = boto3.Session()
s3_client = current_session.client('s3')

def download_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Download one S3 object into a local file.

    Args:
        file_path_on_s3_bucket: key of the object inside the bucket.
        path_to_file_on_local: destination path; opened in binary write mode,
            so an existing file is truncated before the transfer starts.
        bucket_name: bucket to read from.
        s3_client: boto3 S3 client used for the transfer.

    Returns:
        True once the transfer call has returned.
    """
    with open(path_to_file_on_local, 'wb') as local_file:
        s3_client.download_fileobj(Bucket=bucket_name,
                                   Key=file_path_on_s3_bucket,
                                   Fileobj=local_file)
    return True

def upload_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    """Upload a local file to S3.

    Args:
        file_path_on_s3_bucket: key the object is stored under in the bucket.
        path_to_file_on_local: path of the local file to send.
        bucket_name: bucket to write to.
        s3_client: boto3 S3 client used for the transfer.

    Returns:
        True once the upload call has returned.
    """
    s3_client.upload_file(Filename=path_to_file_on_local,
                          Bucket=bucket_name,
                          Key=file_path_on_s3_bucket)
    return True

def get_object(file_path_on_s3_bucket, bucket_name="ac297r", s3_client=s3_client):
    """Return the raw `get_object` response of the S3 client for one key.

    Args:
        file_path_on_s3_bucket: key of the object inside the bucket.
        bucket_name: bucket to read from.
        s3_client: boto3 S3 client used for the request.
    """
    response = s3_client.get_object(Key=file_path_on_s3_bucket, Bucket=bucket_name)
    return response

In [3]:
# Fetch every input file from the `ac297r` bucket into /home/ubuntu/data,
# keeping each object's base name. download_object returns True per file, so
# the cell's displayed value is True once all five transfers have returned.
all(
    download_object(s3_key, '/home/ubuntu/data/' + s3_key.split('/')[-1],
                    bucket_name='ac297r', s3_client=s3_client)
    for s3_key in (
        'clean/month_level_rank.pickle',
        'clean/month_level_review.pickle',
        'clean/product_sample.pickle',
        'raw/rank_sales.csv',
        'clean/month_level_rank_sales_price.pickle',
    )
)

True

In [23]:
# Input folder: local directory that holds the files read (and the results
# pickle written) by the cells below. The download cell above writes into the
# same directory through a hardcoded '/home/ubuntu/data/...' prefix.
data = "/home/ubuntu/data"

In [24]:
# Load the month-level reviews and the train/test product sample.
# NOTE: pickle files execute code on load; only read files from a trusted bucket.
review, sample_prod = (
    pd.read_pickle(f'{data}/{file_name}')
    for file_name in ('month_level_review.pickle', 'product_sample.pickle')
)

In [25]:
# Product ids assigned to each split.
train_prod, test_prod = sample_prod['train'], sample_prod['test']

# Keep only reviews of sampled products, and only the columns used downstream.
rev = (
    review
    .query('asin in @train_prod | asin in @test_prod')
    .reset_index(drop=True)
    [['asin', 'year_month', 'review_text']]
    .copy()
)
print('review size:', rev.shape)

# Free the large source objects; this cell cannot be re-run without reloading them.
del sample_prod, review

review size: (108375, 3)


In [26]:
# Collapse the reviews of each product-month into one space-separated text blob.
# NOTE(review): presumably `review_text` holds a list of strings per row; running
# this cell twice would then re-join the already joined strings - confirm.
rev = rev.assign(review_text=rev['review_text'].str.join(" "))

In [27]:
# Load monthly rank / estimated sales and derive, per product (asin):
#   - the previous row's rank and the change in rank relative to it,
#   - the previous row's year_month, i.e. the review month used to predict this row.
# `shift` works on row order, so this relies on rows being stored in
# chronological order within each asin (as the displayed sample suggests).
bsr = (
    pd.read_pickle(f'{data}/month_level_rank_sales_price.pickle')
    [['asin', 'year_month', 'median_month_rank', 'median_month_est_sales']]
    .assign(
        median_month_rank_prev=lambda d: d.groupby(['asin'])['median_month_rank'].shift(1),
        median_month_rank_diff=lambda d: d['median_month_rank'] - d['median_month_rank_prev'],
        predict_using_year_month=lambda d: d.groupby(['asin'])['year_month'].shift(1),
    )
)
bsr

Unnamed: 0,asin,year_month,median_month_rank,median_month_est_sales,median_month_rank_prev,median_month_rank_diff,predict_using_year_month
0,B000052XB5,08-2017,0.088999,396.000000,,,
1,B000052XB5,09-2017,0.087918,397.125000,0.088999,-0.001081,08-2017
2,B000052XB5,10-2017,0.050594,448.500000,0.087918,-0.037324,09-2017
3,B000052XB5,11-2017,0.052829,439.625000,0.050594,0.002235,10-2017
4,B000052XB5,12-2017,0.084528,392.250000,0.052829,0.031699,11-2017
...,...,...,...,...,...,...,...
289970,B08QBXMHRT,01-2021,0.515792,5.006836,1.000000,-0.484208,12-2020
289971,B08QBXMHRT,02-2021,0.183434,13.577881,0.515792,-0.332358,01-2021
289972,B08QBXMHRT,03-2021,0.046404,30.356250,0.183434,-0.137030,02-2021
289973,B08QBXMHRT,04-2021,0.013760,38.770508,0.046404,-0.032643,03-2021


In [28]:
# Attach to each rank/sales row the review text of the month it is predicted
# from (`predict_using_year_month`); rows without a matching review month are
# dropped by the inner join.
df = pd.merge(
    bsr[['asin', 'predict_using_year_month',
         'median_month_rank', 'median_month_rank_diff',
         'median_month_est_sales']],
    rev,
    how='inner',
    left_on=['asin', 'predict_using_year_month'],
    right_on=['asin', 'year_month'],
).drop(columns='predict_using_year_month')
print(df.shape)
del bsr, rev

(83012, 6)


In [10]:
# Split the merged rows by product membership in the train / test samples.
train_df = df.loc[df['asin'].isin(train_prod)].reset_index(drop=True)
test_df = df.loc[df['asin'].isin(test_prod)].reset_index(drop=True)
del df

In [11]:
def _cumulative_group_mean(X, groups):
    """Return the per-group running mean of the rows of sparse matrix X as CSR.

    Row i of the result is the mean of all rows j <= i of X that share row i's
    group key, so the result depends on the row order of the input.
    """
    keys = pd.Series(groups).reset_index(drop=True)
    # Integer column labels (0..n_features-1) mean a vocabulary term can never
    # collide with a helper column name such as 'asin' or 'n_days'.
    dense = pd.DataFrame(X.toarray())
    running_sum = dense.groupby(keys, sort=False).cumsum()
    rows_so_far = keys.groupby(keys, sort=False).cumcount() + 1
    return scipy.sparse.csr_matrix(running_sum.div(rows_so_far, axis=0).values)


def bow_vectorizer(vectorizer, train_df, test_df, target, cumulative=False):
    """Fit a bag-of-words vectorizer on the training reviews and build X / y.

    Args:
        vectorizer: object with `fit`, `transform` and `get_feature_names_out`
            (the notebook passes CountVectorizer / TfidfVectorizer instances).
        train_df, test_df: frames with `asin`, `review_text` and `target` columns.
        target: name of the column used as the regression target.
        cumulative: if True, replace each row's features by the running mean of
            the features of all rows of the same `asin` up to and including that
            row, following the row order of the frame.

    Returns:
        (X_train, X_test, y_train, y_test, vocab). X_* are whatever
        `vectorizer.transform` returns when `cumulative` is False and
        scipy CSR matrices when it is True; y_* are the target columns; vocab is
        the array returned by `vectorizer.get_feature_names_out()`.
    """
    # The vocabulary is learned from the training reviews only.
    vectorizer.fit(train_df['review_text'])
    vocab = vectorizer.get_feature_names_out()

    X_train = vectorizer.transform(train_df['review_text'])
    X_test = vectorizer.transform(test_df['review_text'])
    y_train = train_df[target]
    y_test = test_df[target]

    if cumulative:
        print('''Compute cumulative mean:''')
        # One vectorized group-wise cumsum per split replaces the previous
        # per-term Python loop (two groupby passes for each vocabulary term).
        X_train = _cumulative_group_mean(X_train, train_df['asin'])
        X_test = _cumulative_group_mean(X_test, test_df['asin'])

    print('training size:', X_train.shape)
    print('testing size:', X_test.shape)

    return X_train, X_test, y_train, y_test, vocab

# LASSO
def run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True, alphas=None):
    """Fit one LASSO model per alpha and return the best one with all scores.

    Args:
        X_train, y_train: training features / target.
        X_test, y_test: features / target the R^2 of every alpha is computed on.
        vocab: feature names. Not read here: `get_words` is called without it
            and looks the vocabulary up itself.
        print_words: if True, print the terms with the 10 largest positive and
            the 10 most negative coefficients of the selected model.
        alphas: regularisation strengths to try; defaults to
            [0.5, 0.1, 0.01, 0.001].

    Returns:
        (model, results): the fitted model with the highest R^2 and a dict
        mapping each alpha to its R^2.

    Raises:
        ValueError: if `alphas` is empty.

    NOTE(review): alpha is selected by its score on (X_test, y_test), so the
    best R^2 is an optimistic estimate; a separate validation split would be
    needed for an unbiased one.
    """
    alphas = [0.5, 0.1, 0.01, 0.001] if alphas is None else list(alphas)
    if not alphas:
        raise ValueError('alphas must contain at least one value')

    print(f'''
    Running LASSO regression with alphas in {alphas}
    ''')

    models = []
    r2_list = []
    for alpha in alphas:
        clf = linear_model.Lasso(alpha=alpha, max_iter=100000)
        clf.fit(X=X_train, y=y_train)

        r2 = clf.score(X=X_test, y=y_test)
        models.append(clf)
        r2_list.append(r2)
        print(alpha, '\t', r2)

    print('-------------------------')
    best_idx = int(np.argmax(np.array(r2_list)))
    best_alpha = alphas[best_idx]
    print('best alpha', best_alpha)
    # Reuse the model that produced the best score instead of fitting it again.
    clf = models[best_idx]

    # Number of negative / positive coefficients of the selected model.
    print(np.sum(clf.coef_ < 0), np.sum(clf.coef_ > 0))

    if print_words:
        print('good words:')
        print(get_words(clf, words='best', n_words = 10))

        print('bad words:')
        print(get_words(clf, words='worst', n_words = 10))

    results = dict(zip(alphas, r2_list))
    return clf, results
    
    
def run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True, alphas=None):
    """Fit one ridge model per alpha and return the best one with all scores.

    Args:
        X_train, y_train: training features / target.
        X_test, y_test: features / target the R^2 of every alpha is computed on.
        vocab: feature names. Not read here: `get_words` is called without it
            and looks the vocabulary up itself.
        print_words: if True, print the terms with the 10 largest positive and
            the 10 most negative coefficients of the selected model.
        alphas: regularisation strengths to try; defaults to
            [0.5, 0.1, 0.01, 0.001].

    Returns:
        (model, results): the fitted model with the highest R^2 and a dict
        mapping each alpha to its R^2.

    Raises:
        ValueError: if `alphas` is empty.

    NOTE(review): alpha is selected by its score on (X_test, y_test), so the
    best R^2 is an optimistic estimate; a separate validation split would be
    needed for an unbiased one.
    """
    alphas = [0.5, 0.1, 0.01, 0.001] if alphas is None else list(alphas)
    if not alphas:
        raise ValueError('alphas must contain at least one value')

    print(f'''
    Running ridge regression with alphas in {alphas}
    ''')

    models = []
    r2_list = []
    for alpha in alphas:
        # Fit with the alpha of this iteration. The loop used to reference
        # `best_alpha`, which is only assigned after the loop, so every call
        # failed with UnboundLocalError.
        clf = linear_model.Ridge(alpha=alpha, max_iter=100000)
        clf.fit(X=X_train, y=y_train)

        r2 = clf.score(X=X_test, y=y_test)
        models.append(clf)
        r2_list.append(r2)
        print(alpha, '\t', r2)

    print('-------------------------')
    best_idx = int(np.argmax(np.array(r2_list)))
    best_alpha = alphas[best_idx]
    print('best alpha', best_alpha)
    # Return the model of the best alpha; the final fit used to take the last
    # alpha of the loop rather than the best one.
    clf = models[best_idx]

    if print_words:
        print('good words:')
        print(get_words(clf, words='best', n_words = 10))

        print('bad words:')
        print(get_words(clf, words='worst', n_words = 10))

    results = dict(zip(alphas, r2_list))
    return clf, results


def get_words(trained_model, words='best', n_words = 10, vocabulary=None):
    """Return the terms with the strongest coefficients of a fitted linear model.

    Args:
        trained_model: object exposing a 1-D `coef_` array aligned with the vocabulary.
        words: 'best' for the largest positive coefficients, 'worst' for the
            most negative ones.
        n_words: maximum number of terms to return.
        vocabulary: feature names aligned with `coef_`. When omitted, the
            module-level `vocab` (set by the most recent `bow_vectorizer` call
            in the notebook) is used, as before.

    Returns:
        Array of at most `n_words` terms ordered from strongest to weakest;
        terms with a zero coefficient are never included.

    Raises:
        ValueError: if `words` is neither 'best' nor 'worst' (the function used
            to return None silently in that case).
    """
    terms = np.asarray(vocab if vocabulary is None else vocabulary)
    coefs = np.asarray(trained_model.coef_)

    if words == 'best':
        selected = coefs > 0
        order = np.argsort(-coefs[selected])   # largest positive first
    elif words == 'worst':
        selected = coefs < 0
        order = np.argsort(coefs[selected])    # most negative first
    else:
        raise ValueError(f"words must be 'best' or 'worst', got {words!r}")

    return terms[selected][order][:n_words]
    


## Use Monthly Reviews to Predict Change

### CountVectorizer (Tri-gram)

In [None]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

### TF-IDF (Tri-gram)

In [None]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


### CountVectorizer (Bi-gram)

In [None]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

### TF-IDF (Bi-gram) 

In [None]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


## Cumulative Reviews to Predict Monthly Rank

### CountVectorizer (Tri-gram) 

In [None]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

### TF-IDF (Tri-gram) 

In [None]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


### CountVectorizer (Bi-gram) 

In [None]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

### TF-IDF (Bi-gram)

In [None]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lasso regression; run_lasso returns (fitted model, {alpha: test R^2}),
# so unpack it instead of binding the whole tuple to `lasso`
lasso, lasso_r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression (same return shape)
ridge, ridge_r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


## Cumulative Reviews to Predict Monthly Sales Volumes

In [None]:
# Collects {model label: {alpha: test R^2}} for the sales-volume experiments below.
r2_dict = dict()

### CountVectorizer (Tri-gram)

In [None]:
# Raw-count tri-gram features, cumulative mean per product, sales target.
vectorizer = CountVectorizer(max_features=500, stop_words='english', ngram_range=(3, 3))

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(
    vectorizer, train_df, test_df,
    target='median_month_est_sales', cumulative=True,
)

# LASSO: keep the best model and record the R^2 of every alpha.
lasso, r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'trigram + count + lasso': r2})

# Ridge: same procedure.
ridge, r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'trigram + count + ridge': r2})

### TF-IDF (Tri-gram)

In [None]:
# TF-IDF tri-gram features, cumulative mean per product, sales target.
vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(3, 3))

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(
    vectorizer, train_df, test_df,
    target='median_month_est_sales', cumulative=True,
)

# LASSO: keep the best model and record the R^2 of every alpha.
lasso, r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'trigram + tfidf + lasso': r2})

# Ridge: same procedure.
ridge, r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'trigram + tfidf + ridge': r2})


### CountVectorizer (Bi-gram)

In [None]:
# Raw-count bi-gram features, cumulative mean per product, sales target.
vectorizer = CountVectorizer(max_features=500, stop_words='english', ngram_range=(2, 2))
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(
    vectorizer, train_df, test_df,
    target='median_month_est_sales', cumulative=True,
)

# LASSO: keep the best model and record the R^2 of every alpha.
lasso, r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'bigram + count + lasso': r2})

# Ridge: same procedure.
ridge, r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'bigram + count + ridge': r2})

In [None]:
# Checkpoint the scores collected so far before the last experiment runs.
with open(f'{data}/results_dict.pickle', 'wb') as fp:
    fp.write(pickle.dumps(r2_dict))

### TF-IDF (Bi-gram)

In [None]:
# TF-IDF bi-gram features, cumulative mean per product, sales target.
vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(2, 2))
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(
    vectorizer, train_df, test_df,
    target='median_month_est_sales', cumulative=True,
)

# LASSO: keep the best model and record the R^2 of every alpha.
lasso, r2 = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'bigram + tfidf + lasso': r2})

# Ridge: same procedure.
ridge, r2 = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)
r2_dict.update({'bigram + tfidf + ridge': r2})


In [None]:
# Write the complete score dictionary to the local data folder.
with open(f'{data}/results_dict.pickle', 'wb') as fp:
    fp.write(pickle.dumps(r2_dict))

In [None]:
# Push the saved score dictionary to the `ac297r` bucket.
upload_object(
    file_path_on_s3_bucket='models/bow/results_dict.pickle',
    path_to_file_on_local=f'{data}/results_dict.pickle',
    bucket_name='ac297r',
    s3_client=s3_client,
)

## Heatmap of $R^2$

In [None]:
# One column per model label, one row per alpha.
r2_dict_df = pd.DataFrame(r2_dict)

# Separate the TF-IDF runs from the raw-count runs.
tfidf_df = r2_dict_df.loc[:, ['trigram + tfidf + lasso', 'trigram + tfidf + ridge',
                              'bigram + tfidf + lasso', 'bigram + tfidf + ridge']]
count_df = r2_dict_df.loc[:, ['trigram + count + lasso', 'trigram + count + ridge',
                              'bigram + count + lasso', 'bigram + count + ridge']]

In [None]:
# Drop the vectorizer name from the labels, e.g. 'trigram + tfidf + lasso' -> 'trigram + lasso'.
tfidf_df = tfidf_df.rename(columns=lambda label: label.replace('+ tfidf +', '+'))
count_df = count_df.rename(columns=lambda label: label.replace('+ count +', '+'))

In [None]:
# R^2 per model (rows) and alpha (columns) for the raw-count features.
f, axs = plt.subplots(figsize=(4, 4))
sns.heatmap(count_df.transpose(), annot=True, square=True, cmap='Blues', ax=axs)
axs.set_title('Bag of Word')
axs.set_xlabel('alpha')
axs.set_ylabel('model');

In [None]:
# R^2 per model (rows) and alpha (columns) for the TF-IDF features.
f, axs = plt.subplots(figsize=(4, 4))
sns.heatmap(tfidf_df.transpose(), annot=True, square=True, cmap='Blues', ax=axs)
axs.set_title('TF-IDF')
axs.set_xlabel('alpha')
axs.set_ylabel('model');

## Look at best performing model

In [12]:
# Refit the selected configuration on its own: TF-IDF bi-grams, cumulative mean
# per product, LASSO with alpha = 0.1, monthly estimated sales as target.
vectorizer = TfidfVectorizer(max_features=500, stop_words='english', ngram_range=(2, 2))
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(
    vectorizer, train_df, test_df,
    target='median_month_est_sales', cumulative=True,
)
alpha = 0.1

print('\nRunning LASSO regression with alpha = 0.1\n')

clf = linear_model.Lasso(alpha=alpha, max_iter=100000).fit(X=X_train, y=y_train)

# R^2 on the test rows.
r2 = clf.score(X=X_test, y=y_test)
print(alpha, '\t', r2)

Compute cumulative mean:
training size: (61199, 500)
testing size: (21813, 500)

Running LASSO regression with alpha = 0.1

0.1 	 0.13943561913065272


In [13]:
# Intercept of the fitted LASSO model, i.e. its prediction when every n-gram feature is zero.
clf.intercept_

-26.672374544409536

In [16]:
# Number of the 500 n-gram coefficients that LASSO set to exactly zero.
np.sum(clf.coef_ == 0)

400

In [None]:
# Baseline: R^2 on the test rows when every prediction is the training-set mean.
mean_r2 = r2_score(y_test, np.full(len(y_test), np.mean(y_train)))
print('r^2 of training set average', '\t', mean_r2)

In [None]:
# Terms with the 10 largest positive and the 10 most negative coefficients.
for heading, which in (('good words:', 'best'), ('bad words:', 'worst')):
    print(heading)
    print(get_words(clf, words=which, n_words = 10))

In [None]:
# Store the model's predicted sales next to each test row, together with the
# row's position (`orig_index`) so rows can be mapped back to X_test / y_test.
test_df = test_df.assign(pred_sales=clf.predict(X_test), orig_index=test_df.index)

In [None]:
# test_df = test_df.groupby('year_month').apply(lambda x: x.sort_values(["pred_sales"])).reset_index(drop=True)
# test_df['pred_sales_order'] = test_df.groupby('year_month').cumcount('asin') + 1
# convert sales volumes into rank data
# test_df = test_df.groupby('year_month').apply(lambda x: x.sort_values(["median_month_est_sales"])).reset_index(drop=True)
# test_df['median_month_est_sales_order'] = test_df.groupby('year_month').cumcount('asin') + 1

# sp_rho = scipy.stats.spearmanr(test_df['pred_sales_order'], test_df['median_month_est_sales_order'])
# print('spearman rho:', sp_rho)

In [None]:
# Split the test rows (product-months) into terciles of observed sales:
# lowest third = bad, middle third = mediocre, highest third = good.
# Each array holds the `orig_index` values of its rows.
ranked_rows = test_df.sort_values('median_month_est_sales')['orig_index'].values
tercile = ranked_rows.shape[0] // 3
bad_products, medi_products, good_products = (
    ranked_rows[:tercile],
    ranked_rows[tercile:tercile * 2],
    ranked_rows[tercile * 2:],
)
print('good', good_products.shape, '\nbad', bad_products.shape, '\nmediocre', medi_products.shape)
del ranked_rows, tercile

In [None]:
# R^2 and Pearson correlation between target and prediction within each sales tercile.
for cat, members in (('good_products', good_products),
                     ('medi_products', medi_products),
                     ('bad_products', bad_products)):
    row_id = test_df.loc[test_df['orig_index'].isin(members), 'orig_index'].values
    # Index the sparse matrix by row directly; the previous version converted
    # the whole test matrix to dense on every iteration just to re-sparsify
    # the selected rows.
    preds = clf.predict(X_test[row_id])
    actual = y_test.loc[row_id].values
    r2 = r2_score(actual, preds)
    cor = np.corrcoef(actual, preds)[0,1]
    print(cat)
    print('r^2 : ', r2)
    print('correlation:', cor)
    print('\n')

In [None]:
# Residual = observed sales minus predicted sales, plotted against the prediction.
test_df['residual'] = test_df.eval('median_month_est_sales - pred_sales')
f, axs = plt.subplots(figsize=(4, 4))
axs.scatter(x='pred_sales', y='residual', data=test_df, s=4, alpha=0.2)
axs.set_title('Residual vs. Prediction')
axs.set_xlabel('predicted sales volumes')
axs.set_ylabel('target - prediction');

In [None]:
# Overlay the density of predictions and observed sales on one axis.
f, axs = plt.subplots(figsize=(4, 3))
for column, n_bins, legend_label in (('pred_sales', 40, 'prediction'),
                                     ('median_month_est_sales', 200, 'target')):
    axs.hist(test_df[column], bins=n_bins, density=True, alpha=0.4, label=legend_label)
axs.legend()
axs.set_xlim(-100, 1000)
axs.set_title('Histogram of Target and Prediction')
axs.set_xlabel('(predicted) sales volumes')
axs.set_ylabel('density');

In [None]:
# Histogram of the predictions within each sales tercile (one figure per group).
for cat, members in (('good_products', good_products),
                     ('medi_products', medi_products),
                     ('bad_products', bad_products)):
    row_id = test_df.loc[test_df['orig_index'].isin(members), 'orig_index'].values
    # Row-index the sparse matrix directly instead of densifying the whole
    # test matrix on every iteration.
    preds = clf.predict(X_test[row_id])
    f, axs = plt.subplots(1,1,figsize=(4,3))
    axs.hist(preds,bins=40)
    # Label each figure so the three plots can be told apart.
    axs.set(title=f'Predictions: {cat}',
            xlabel='predicted sales volumes',
            ylabel='count');

In [None]:
# Binscatter: mean observed sales within 30 bins of predicted sales, compared
# with the 45-degree line of a perfectly calibrated model.
f, axs = plt.subplots(1,1,figsize=(4,3))
# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.regplot(x=test_df['pred_sales'], y=test_df['median_month_est_sales'],
            fit_reg=False, x_bins=30, label='binscatter',
            scatter_kws={"s": 40}, ci=95,
            ax=axs);
axs.plot([-10,450], [-10,450], color='k', label='45 degree line')
axs.legend(loc=4)
axs.set_xlim(-10,450)
axs.set_ylim(-10,450)
# y-label typo ('averga') corrected.
axs.set(title='Binscatter Plot of Predictions', 
        xlabel='average prediction', ylabel='average target value');