### Model description 

Two versions of models:

1. Target Variable: 
    - change in monthly BSR
   Feature Variable:
    - word count of reviews in the previous month
    
2. Target Variable:
    - monthly BSR
   Feature Variables:
    - word count of reviews in *all* reviews in and before the previous month
    
    
In either case, 

- use a Bag of Word (TF-IDF) model on the 500 most common tri-grams/bi-grams from the training set.
- run LASSO/Ridge using the 500 features

    
Training set:

    - 2836 products (1/3 of all products in the dataset)
    - 68559 month-product pairs
    
Testing set:

    - 945 products (1/3 of the size of training set)
    - 24340 month-product pairs

In [1]:
import boto3
import pandas as pd
import numpy as np
import scipy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import linear_model
from tqdm.auto import tqdm

In [2]:
current_session=boto3.session=boto3.Session(
    aws_access_key_id='AKIAQF74TYKWB5URILW2',
    aws_secret_access_key='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')
s3_client= current_session.client('s3')

def download_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    with open(path_to_file_on_local, 'wb') as f:
        s3_client.download_fileobj(bucket_name, file_path_on_s3_bucket, f)
    return True

def upload_object(file_path_on_s3_bucket, path_to_file_on_local, bucket_name="ac297r", s3_client=s3_client):
    s3_client.upload_file(path_to_file_on_local, bucket_name, file_path_on_s3_bucket)
    return True

def get_object(file_path_on_s3_bucket, bucket_name="ac297r", s3_client=s3_client):
    return s3_client.get_object(Bucket=bucket_name, Key=file_path_on_s3_bucket)

In [3]:
download_object('clean/month_level_rank.pickle', 
                '/home/ubuntu/data/month_level_rank.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('clean/month_level_review.pickle', 
                '/home/ubuntu/data/month_level_review.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('clean/product_sample.pickle', 
                '/home/ubuntu/data/product_sample.pickle', bucket_name='ac297r', s3_client=s3_client)
download_object('raw/rank_sales.csv', 
                '/home/ubuntu/data/rank_sales.csv', bucket_name='ac297r', s3_client=s3_client)
download_object('raw/rank_sales.csv', 
                '/home/ubuntu/data/rank_sales.csv', bucket_name='ac297r', s3_client=s3_client)

True

In [4]:
# input folders
data = "/home/ubuntu/data"

In [5]:
review = pd.read_pickle(f'{data}/month_level_review.pickle')
sample_prod = pd.read_pickle(f'{data}/product_sample.pickle')

In [6]:
train_prod = sample_prod['train']
test_prod = sample_prod['test']
rev = review.query('asin in @train_prod | asin in @test_prod').copy().reset_index(drop=True)
rev = rev[['asin', 'year_month', 'review_text']].copy()
print('size:', rev.shape)
del sample_prod

size: (108375, 3)


In [7]:
# concat all reviews in a prod-month into a big blob of text
rev['review_text'] = rev['review_text'].str.join(" ")

In [8]:
# Load the rank data
bsr = pd.read_pickle(f'{data}/month_level_rank.pickle')[['asin', 'year_month', 'median_month_rank']]
bsr['median_month_rank_prev'] = bsr.groupby(['asin'])['median_month_rank'].shift(1)
bsr['median_month_rank_diff'] = bsr['median_month_rank'] - bsr['median_month_rank_prev']
bsr['predict_using_year_month'] = bsr.groupby(['asin'])['year_month'].shift(1)
bsr

Unnamed: 0,asin,year_month,median_month_rank,median_month_rank_prev,median_month_rank_diff,predict_using_year_month
0,B000052XB5,07-2017,0.090908,,,
1,B000052XB5,08-2017,0.088999,0.090908,-0.001909,07-2017
2,B000052XB5,09-2017,0.087918,0.088999,-0.001081,08-2017
3,B000052XB5,10-2017,0.050594,0.087918,-0.037324,09-2017
4,B000052XB5,11-2017,0.052829,0.050594,0.002235,10-2017
...,...,...,...,...,...,...
323776,B08QBXMHRT,03-2021,0.046404,0.183434,-0.137030,02-2021
323777,B08QBXMHRT,04-2021,0.013760,0.046404,-0.032643,03-2021
323778,B08QBXMHRT,05-2021,0.027598,0.013760,0.013838,04-2021
323779,B08QBXMHRT,06-2021,0.021938,0.027598,-0.005661,05-2021


In [9]:
# merge rank and text
df = bsr[['asin', 'predict_using_year_month', 'median_month_rank', 'median_month_rank_diff']].merge(rev, how='inner', 
                                               left_on=['asin', 'predict_using_year_month'], 
                                               right_on=['asin', 'year_month']).drop('predict_using_year_month', 
                                                                                     axis=1)
del bsr, rev

In [10]:
# separate into train and test
train_df = df.query('asin in @train_prod')
test_df = df.query('asin in @test_prod')
del df

In [11]:
train_df.head()

Unnamed: 0,asin,median_month_rank,median_month_rank_diff,year_month,review_text
0,B00005313T,0.005682,0.005682,07-2017,Bought this for my brother...and he seems like...
1,B00005313T,0.01502,0.009337,08-2017,Years ago my GP recommended this particular vi...
2,B00005313T,0.015266,0.000247,09-2017,I ned a multi with no iron and this does the t...
3,B00005313T,0.02044,0.005173,10-2017,Contains potentially dangerous levels of B6 an...
4,B00005313T,0.027367,0.006927,11-2017,"Great Excellent. If you cant afford Dualtabs, ..."


In [12]:
def bow_vectorizer(vectorizer, train_df, test_df, target, cumulative=False):

    vectorizer.fit(train_df['review_text'])
    vocab = vectorizer.get_feature_names_out() # get vocab
    
    # transform training/test reviews
    X_train = vectorizer.transform(train_df['review_text'])
    X_test = vectorizer.transform(test_df['review_text'])
    y_train = train_df[target]
    y_test = test_df[target]
    
    # if we want to compute cumulative mean
    if cumulative: 
        
        print('''Compute cumulative mean:''')
        
        # X_train 
        vocab_df = pd.DataFrame(X_train.toarray(),columns=vectorizer.get_feature_names_out())
        X_train = pd.concat([train_df['asin'].reset_index(drop=True), 
                              vocab_df.reset_index()], axis=1)

        X_train['n_days'] = X_train.groupby('asin')['asin'].cumcount() + 1
        for word in vocab:
            X_train[word] = X_train.groupby('asin')[word].cumsum()
            X_train[word] = X_train[word]/X_train['n_days']

        X_train = scipy.sparse.csr_matrix(X_train[vocab].values) # get back to sparse matrix
        
        # X_test
        vocab_df = pd.DataFrame(X_test.toarray(),columns=vectorizer.get_feature_names_out())
        X_test = pd.concat([test_df['asin'].reset_index(drop=True), 
                              vocab_df.reset_index()], axis=1)

        X_test['n_days'] = X_test.groupby('asin')['asin'].cumcount() + 1
        for word in vocab:
            X_test[word] = X_test.groupby('asin')[word].cumsum()
            X_test[word] = X_test[word]/X_test['n_days']

        X_test = scipy.sparse.csr_matrix(X_test[vocab].values) # get back to sparse matrix

    print('training size:', X_train.shape)
    print('testing size:', X_test.shape)
    
    return X_train, X_test, y_train, y_test, vocab

# LASSO
def run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True):
    alphas = [0.5, 0.1, 0.01, 0.001, 0.0001]
    r2_list = []
    
    print('''
    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    ''')
    
    for alpha in alphas:
        clf = linear_model.Lasso(alpha=alpha, max_iter=100000)
        clf.fit(X=X_train, y=y_train)

        r2 = clf.score(X=X_test, y=y_test)
        r2_list.append(r2)
        print(alpha, '\t', r2)

    print('-------------------------')
    best_alpha = alphas[np.argmax(np.array(r2_list))]
    print('best alpha', best_alpha)
    clf = linear_model.Lasso(alpha=alpha, max_iter=100000)
    clf.fit(X=X_train, y=y_train)
    
    if print_words:
        print('good words:')
        print(get_words(clf, words='best', n_words = 10))

        print('bad words:')
        print(get_words(clf, words='worst', n_words = 10))
    
    return clf
    
    
def run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True):
    
    alphas = [0.5, 0.1, 0.01, 0.001, 0.0001]
    r2_list = []
    
    print('''
    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    ''')
    
    for alpha in alphas:
        clf = linear_model.Ridge(alpha=alpha, max_iter=100000)
        clf.fit(X=X_train, y=y_train)

        r2 = clf.score(X=X_test, y=y_test)
        r2_list.append(r2)
        print(alpha, '\t', r2)

    print('-------------------------')
    best_alpha = alphas[np.argmax(np.array(r2_list))]
    print('best alpha', best_alpha)
    clf = linear_model.Ridge(alpha=alpha, max_iter=100000)
    clf.fit(X=X_train, y=y_train)
    
    if print_words:
        print('good words:')
        print(get_words(clf, words='best', n_words = 10))

        print('bad words:')
        print(get_words(clf, words='worst', n_words = 10))

    return clf


def get_words(trained_model, words='best', n_words = 10):
    if words == 'best':
        good_words = vocab[trained_model.coef_ > 0] 
        pos_coef = trained_model.coef_[trained_model.coef_ > 0]
        best_words = good_words[np.argsort(-pos_coef)][:n_words]
        return best_words
    elif words == 'worst':
        bad_words = vocab[trained_model.coef_ < 0] 
        neg_coef = trained_model.coef_[trained_model.coef_ < 0]
        worst_words = bad_words[np.argsort(neg_coef)][:n_words]
        return worst_words
    


## Use Monthly Reviews

### CountVectorizer (Tri-gram) Monthly Reviews to Predict Change

In [13]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression 
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -3.8135903040137364e-05
0.1 	 -3.8135903040137364e-05
0.01 	 -3.8135903040137364e-05
0.001 	 -3.8135903040137364e-05
0.0001 	 9.968012484473654e-05
-------------------------
best alpha 0.0001
good words:
['product did work' 'great product really' 'makes feel good'
 'don waste money' 'feel better taking' 'love product great'
 'increase milk supply' 'great product fast' 'gummies taste great'
 've using product']
bad words:
['horny goat weed' 'hair skin nails' 'capsules easy swallow'
 'apple cider vinegar' 'omega fish oil']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.009061896933330482
0.1 	 -0.009092307271288291
0.01 	 -0.009098775122230673
0.001 	 -0.009099452904677507
0.0001 	 -0.009099742915217535
-------------------------
best alpha 0.5
good words:
['makes feel good' 'great product fast' 've using

### TF-IDF (Tri-gram) Monthly Review to Predict Change

In [14]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -3.8135903040137364e-05
0.1 	 -3.8135903040137364e-05
0.01 	 -3.8135903040137364e-05
0.001 	 -3.8135903040137364e-05
0.0001 	 -3.8135903040137364e-05
-------------------------
best alpha 0.5
good words:
[]
bad words:
[]

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.007615621226081615
0.1 	 -0.007770046991758317
0.01 	 -0.007805721201099569
0.001 	 -0.007809308086804023
0.0001 	 -0.007809666971462681
-------------------------
best alpha 0.5
good words:
['great product fast' 'scoop coffee morning' 'tell huge difference'
 've using week' 'difference skin hair' 'brain octane oil'
 'feel like helping' 'product did work' 'super easy swallow'
 'day great product']
bad words:
['far great product' 'product fast delivery' 'like maple syrup'
 'product definitely recommend' 'day highly recommend'
 'just ordered s

### CountVectorizer (Bi-gram) Monthly Reviews to Predict Change

In [15]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -3.8135903040137364e-05
0.1 	 -3.8135903040137364e-05
0.01 	 -3.8135903040137364e-05
0.001 	 3.322045441145338e-05
0.0001 	 -0.0010862679736725056
-------------------------
best alpha 0.001
good words:
['changed life' 'great flavor' 'looks like' 'hard swallow' 'did work'
 'really bad' 'works great' 'product ve' 'just ordered' 'acid reflux']
bad words:
['product buy' 'energy boost' 'stuff works' 'taking probiotics' 'don need'
 'pleasantly surprised' 'urinary tract' 'amazing product' 'happy purchase'
 'difference energy']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.008090672923934505
0.1 	 -0.008095853638971473
0.01 	 -0.00809719024355493
0.001 	 -0.008096276127890434
0.0001 	 -0.008102003654164136
-------------------------
best alpha 0.5
good words:
['changed life' 'looks like' 'great flavor' 'produc

### TF-IDF (Bi-gram) Monthly Reviews to Predict Change

In [16]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank_diff', cumulative=False)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -3.8135903040137364e-05
0.1 	 -3.8135903040137364e-05
0.01 	 -3.8135903040137364e-05
0.001 	 -3.8135903040137364e-05
0.0001 	 1.1366950833147094e-05
-------------------------
best alpha 0.0001
good words:
['works great' 'easy swallow' 'good product' 'great price' 'great product']
bad words:
[]

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.004203763710061148
0.1 	 -0.0042823548994910254
0.01 	 -0.004300498826595778
0.001 	 -0.004302319315065306
0.0001 	 -0.0043025024328700034
-------------------------
best alpha 0.5
good words:
['cider vinegar' 'changed life' 'great good' 'began taking' 'free bottle'
 'super easy' 'seeing results' 'looks like' 'read reviews' 'really bad']
bad words:
['days feel' 'apple cider' 'loved product' 'taking probiotics'
 'pleasantly surprised' 'like energy' 'swallow good' 'prod

## Use cumulative reviews

### CountVectorizer (Tri-gram) Cumulative Reviews to Predict Monthly Rank

In [20]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# run ridge regression 
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

Compute cumulative mean:
training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.0014111716562019705
0.1 	 -0.0014111716562019705
0.01 	 -0.0014111716562019705
0.001 	 0.0014565364632549427
0.0001 	 0.004652895452083916
-------------------------
best alpha 0.0001
good words:
['gummies taste great' 'cod liver oil' 've using months'
 'using product years' 'great product helps' 'omega fish oil'
 'low carb diet' 'lost 10 pounds' 'taking product months'
 'swallow easy swallow']
bad words:
['good price good' 'year old loves' 'taste really good'
 'great product definitely' 'product good product' 'good product works'
 'great product fast' 'good value money' 'drink plenty water'
 'hard time swallowing']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.05068311652960067
0.1 	 -0.05364920200316914
0.01 	 -0.054495339060786074
0.001 	 -0.054575408763182676
0.0001 	 -0

### TF-IDF (Tri-gram) Cumulative Review to Predict Monthly Rank

In [21]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(3,3), stop_words='english', max_features = 500)

X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


Compute cumulative mean:
training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.0014111716562019705
0.1 	 -0.0014111716562019705
0.01 	 -0.0014111716562019705
0.001 	 -0.0014111716562019705
0.0001 	 0.004357645532868881
-------------------------
best alpha 0.0001
good words:
['cod liver oil']
bad words:
['year old loves' 'taste really good' 'product great product'
 'good price good' 'easy swallow taste' 'taste like candy'
 'just started using' 'definitely recommend product'
 'noticed increase energy' 'great product great']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.017195329081929778
0.1 	 -0.02305854104229721
0.01 	 -0.02474910883816106
0.001 	 -0.024927540652847746
0.0001 	 -0.024945490665843284
-------------------------
best alpha 0.5
good words:
['feel great taking' 'swallow easy swallow' 'super easy swallow'
 'great product highly' 'hair nails 

### CountVectorizer (Bi-gram) Cumulative Reviews to Predict Monthly Rank

In [22]:
# define vectorizer
vectorizer = CountVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)

Compute cumulative mean:
training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.0014111716562019705
0.1 	 -0.0014111716562019705
0.01 	 0.0019572586787154345
0.001 	 0.005984637128910397
0.0001 	 0.0010322056244562727
-------------------------
best alpha 0.001
good words:
['great flavor' 'free bottle' 'garden life' 'product years'
 'gummies taste' 'product highly' 'liked product' 'product really'
 'changed life' 'lose weight']
bad words:
['don want' 'fish burps' 'days taking' 'product just' 'good value'
 'using month' 'price good' 'definitely buy' 'don think'
 'noticed improvement']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.0750569762260116
0.1 	 -0.07571047405746745
0.01 	 -0.07588872189523133
0.001 	 -0.07589967594586788
0.0001 	 -0.07589694562124016
-------------------------
best alpha 0.5
good words:
['great reviews' 'great flavor' 'didn help' 

### TF-IDF (Bi-gram) Cumulative Reviews to Predict Monthly Rank

In [23]:
# define vectorizer
vectorizer = TfidfVectorizer(ngram_range=(2,2), stop_words='english', max_features = 500)
X_train, X_test, y_train, y_test, vocab = bow_vectorizer(vectorizer, train_df, test_df, 
                                                         target='median_month_rank', cumulative=True)

# run lass regression 
lasso = run_lasso(X_train, y_train, X_test, y_test, vocab, print_words=True)

# ridge regression
ridge = run_ridge(X_train, y_train, X_test, y_test, vocab, print_words=True)


Compute cumulative mean:
training size: (68559, 500)
testing size: (24340, 500)

    Running LASSO regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.0014111716562019705
0.1 	 -0.0014111716562019705
0.01 	 -0.0014111716562019705
0.001 	 -0.0014111716562019705
0.0001 	 0.009898174835077711
-------------------------
best alpha 0.0001
good words:
['garden life' 'fish oil' 'excellent product']
bad words:
['ve noticed' 'far good' 'taste good' 'easy swallow' 'good value'
 'taste great' 'great value' 'product just' 'quality product' 'year old']

    Running ridge regression with alphas in [0.5, 0.1, 0.01, 0.001, 0.0001]
    
0.5 	 -0.02202767138989259
0.1 	 -0.025882010129543165
0.01 	 -0.02692167074808549
0.001 	 -0.02703008853661304
0.0001 	 -0.02704096895815189
-------------------------
best alpha 0.5
good words:
['free bottle' 'purchase product' 'ancestral supplements' 'began taking'
 'immune support' 'day day' 'product didn' 'great reviews' 'didn help'
 'nails strong