In [None]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import glob
from tqdm.auto import tqdm

import tensorflow as tf
import tensorflow_hub as hub

In [None]:
# Mount Google Drive under /content/drive so the data folder below is reachable
# (this import is only available inside the Google Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Root folder of the project data on the mounted Google Drive; the loading cells
# read their pickles from its `clean/` subfolder.
data = "/content/drive/My Drive/295R-Caps-Pattern/Data"

## Comments

We consider linear regression, gradient boosting (scikit-learn's `GradientBoostingRegressor`, stored in the variable `xgboost`), and random forest models to predict the future change in rank of a product, using most of the data we have available apart from the text of the reviews and their titles. For every product, we predict the change in rank from a given month to the next month using the following features:
- rolling_median_month_rank: the daily rank of the product over the given month (standardized to 30 days per month), with a rolling median applied to the data to smooth it out.
- mean_month_rank: the mean of the product’s ranks over the given month.
- revrating_mean: the mean of the product’s review ratings over the given month.
- revrating_weightmean: the mean of the product’s review ratings over the given month, weighted by the number of upvotes of each review (set to 0 when no review of that month has any upvote).
- reviewrating_meanverified: the mean of the product’s review ratings over the given month, taken over verified reviews only, i.e. each review gets weight one if it is verified and weight zero if it is not (set to 0 when the month has no verified review).
- cumulative_rating: the running average of revrating_mean, i.e. the mean of the product’s monthly revrating_mean values over all months up to and including the given month.
- cumulative_weightrating: the running average of revrating_weightmean, i.e. the mean of the product’s monthly upvote-weighted ratings over all months up to and including the given month.
- cumulative_rating_verified: the running average of reviewrating_meanverified, i.e. the mean of the product’s monthly verified-only ratings over all months up to and including the given month.

In total this gives 37 features: the 30 daily values of rolling_median_month_rank plus the 7 scalar features listed above. Most of the notebook deals with preprocessing the data into these features so that they can be fed into scikit-learn models.

**TODO:** include the price data and the rating(s) of the most influential review(s), i.e. the rating(s) of the review(s) with the most upvotes.

## Data processing

This section is copied from Sehaj's BERT regression notebook.

In [None]:
# Loading the month-level review metadata (votes, ratings, verification flags,
# countries). The review texts themselves are not selected here.
# NOTE: pd.read_pickle can execute arbitrary code -- only load trusted files.
review_df = pd.read_pickle(f'{data}/clean/month_level_review.pickle')
# .copy() turns the column subset into an independent frame, so that assigning
# columns on it later does not raise SettingWithCopyWarning.
review_df = review_df[['asin', 'year_month', 'reviewvotes_num','reviewrating','reviewverifiedpurchase','country_name']].copy()

In [None]:
review_df.head()

Unnamed: 0,asin,year_month,reviewvotes_num,reviewrating,reviewverifiedpurchase,country_name
0,B000052XB5,01-2017,"[0, 14, 0, 0, 5, 2]","[5.0, 3.0, 5.0, 1.0, 2.0, 1.0]","[True, True, True, True, True, True]","[ United States, United States, United State..."
1,B000052XB5,01-2018,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5]","[5.0, 4.0, 5.0, 1.0, 2.0, 5.0, 5.0, 5.0, 4.0, ...","[True, True, True, True, True, True, True, Tru...","[ United States, United States, United State..."
2,B000052XB5,01-2019,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[5.0, 5.0, 5.0, 1.0, 5.0, 1.0, 5.0, 5.0, 5.0, ...","[True, True, True, True, True, False, True, Tr...","[ United States, United States, United State..."
3,B000052XB5,01-2020,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 1.0, 5.0, 5.0, ...","[True, True, True, True, True, True, True, Tru...","[ United States, United States, United State..."
4,B000052XB5,01-2021,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.0, 5.0, 1.0, 5.0, 4.0, 5.0, 5.0, 1.0, 4.0, ...","[True, True, True, True, True, True, True, Fal...","[ United States, United States, United State..."


In [None]:
# Loading the month-level rank data
# NOTE: pd.read_pickle can execute arbitrary code -- only load trusted files.
bsr_df = pd.read_pickle(f'{data}/clean/month_level_rank.pickle')
# .copy() turns the column subset into an independent frame, so that the column
# assignments performed on it later do not raise SettingWithCopyWarning.
bsr_df = bsr_df[['asin', 'year_month', 'mean_month_rank', 'median_month_rank', 'rolling_median_month_rank']].copy()

In [None]:
# Number of rank rows whose median monthly rank is exactly zero
int((bsr_df['median_month_rank'] == 0).sum())

8510

In [None]:
# processing rank df for the merge
# NOTE(review): no explicit `format=` is passed, so pandas infers the date layout;
# review_df shows values such as '01-2017' -- confirm bsr_df uses the same layout.
bsr_df['year_month'] = pd.to_datetime(bsr_df['year_month'])
# sort chronologically within each product so that shift(-1) picks the following row
bsr_df = bsr_df.sort_values(['asin', 'year_month'])
# target_rank = median rank found in the product's next row (NaN for its last row)
# NOTE(review): if a product has missing months, the next row is not the next
# calendar month -- TODO confirm the rank data has no gaps.
bsr_df['target_rank'] = bsr_df.groupby(['asin'])["median_month_rank"].shift(-1)

# processing review df for the merge
# convert to datetime so that both frames share the same key dtype
review_df['year_month'] = pd.to_datetime(review_df['year_month'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
# Inner-join reviews and ranks on (product, month), then discard every row that
# still contains a missing value (e.g. rows without a target_rank).
products_df = review_df.merge(bsr_df, on=["asin", "year_month"]).dropna()

In [None]:
# Number of merged rows whose median monthly rank is exactly zero
int((products_df['median_month_rank'] == 0).sum())

6472

In [None]:
# creating the target variable
# A percentage change, (target_rank - median_month_rank) / median_month_rank, was
# considered, but many median month ranks are zero, which would produce infinite
# values. We therefore predict the absolute rank change instead.
# .assign returns a new frame, which avoids the SettingWithCopyWarning raised by
# an in-place column assignment on the frame returned by dropna().
products_df = products_df.assign(
  rank_change=products_df['target_rank'] - products_df['median_month_rank']
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [None]:
def process_moving_median_months(row, window=30, fallback_value=0.24191888901999709):
  """Return the row's rolling-median ranks as a clean float array of length `window`.

  Args:
    row: mapping / DataFrame row with a 'rolling_median_month_rank' entry
      holding a sequence of numbers (NaN or None mark missing values).
    window: length of the returned array (default 30, as in the original code).
    fallback_value: constant used to fill the whole array when the row has no
      usable value at all (default is the constant hard-coded in the original
      notebook; its origin is not documented there).

  Returns:
    np.ndarray of `window` floats without NaN:
      * missing values are replaced by the mean of the valid values,
      * shorter series are left-padded with that mean,
      * longer series keep only their last `window` values,
      * non-numeric, empty or all-NaN series yield `fallback_value` everywhere
        (the original returned NaNs for the empty / all-NaN cases).
  """
  try:
    # dtype=float always copies (the row is never mutated) and maps None to NaN
    rolling_median = np.atleast_1d(np.array(row['rolling_median_month_rank'], dtype=float))
  except (TypeError, ValueError):
    # the stored value cannot be interpreted as numbers
    return np.full(window, fallback_value)

  valid = ~np.isnan(rolling_median)
  if not valid.any():
    # nothing to impute from (empty or all-NaN series)
    return np.full(window, fallback_value)

  # impute the NaNs with the mean of the observed values
  impute_value = rolling_median[valid].mean()
  rolling_median[~valid] = impute_value

  # make sure all are of length `window` (prioritising the last values)
  if len(rolling_median) < window:
    padding = np.full(window - len(rolling_median), impute_value)
    rolling_median = np.concatenate([padding, rolling_median])
  else:
    rolling_median = rolling_median[-window:]
  return rolling_median

# Replace each row's stored rolling-median series by the output of
# process_moving_median_months (applied row by row, hence axis=1).
products_df['rolling_median_month_rank'] = products_df.apply(process_moving_median_months, axis=1)

In [None]:
asins=products_df["asin"].unique()

In [None]:
# Restrict the data set to the first 70 entries of `asins`
products_df = products_df.loc[products_df['asin'].isin(asins[:70])]

In [None]:
#We will then test on the last five products of products_df
# (entries 65..69 of `asins`): the hold-out is made by product, not by date.
# NOTE(review): the same selection is rebuilt later as aux_test_df, once the
# feature columns exist.
test_df=products_df[products_df['asin'].isin(asins[65:70])]
test_df.shape

(153, 11)

In [None]:
# The two source frames are not needed after the merge: drop the names to free RAM
del bsr_df, review_df

In [None]:
#Upvote-weighted mean of the review ratings of one row
def weight_mean(x):
  """Average x['reviewrating'] using x['reviewvotes_num'] as weights.

  Returns 0 when the votes sum to zero (no weighting possible).
  """
  votes = x['reviewvotes_num']
  if sum(votes) != 0:
    return np.average(x['reviewrating'], weights=votes)
  return 0

In [None]:
# Upvote-weighted mean rating of every row, computed by weight_mean (axis=1: one call per row)
products_df['revrating_weightmean']=products_df.apply(weight_mean,axis=1)

In [None]:
# Plain (unweighted) mean of the review ratings of every row
products_df['revrating_mean'] = products_df['reviewrating'].map(np.mean)

In [None]:
#Mean of the review ratings of one row, weighted by the verified-purchase flags
def weight_mean_verified(x):
  """Average x['reviewrating'] using x['reviewverifiedpurchase'] as weights.

  Returns 0 when the flags sum to zero.
  """
  verified = x['reviewverifiedpurchase']
  if sum(verified) != 0:
    return np.average(x['reviewrating'], weights=verified)
  return 0

In [None]:
# Verified-purchase-weighted mean rating of every row, computed by weight_mean_verified
products_df['reviewrating_meanverified']=products_df.apply(weight_mean_verified,axis=1)

In [None]:
#creating cumulative weighted ratings column
products_df['cumulative_weightrating']=np.nan

In [None]:
#Example to access an element in dataframe which has the correct asin and the correct date associated
products_df[(products_df["asin"]=='B000052XB5')&(products_df["year_month"]=='2018-01-01')].iloc[0]['cumulative_weightrating']

nan

In [None]:
#Filling cumulative weighted rating column
# For every product, the value of a month is the mean of revrating_weightmean over
# all of that product's months up to and including it (running mean in date order).
# This replaces a nested loop that indexed `.at` with a boolean mask (`.at` only
# accepts a single row/column label) and re-filtered the whole frame for every row.
# The running mean equals the original "<= month" average only if each
# (asin, year_month) pair occurs once, so that is checked explicitly.
assert not products_df.duplicated(['asin', 'year_month']).any(), 'duplicate (asin, year_month) rows'
# The result is aligned back on the row index, so the row order is left unchanged.
products_df['cumulative_weightrating'] = (
  products_df
  .sort_values(['asin', 'year_month'])
  .groupby('asin')['revrating_weightmean']
  .transform(lambda monthly: monthly.expanding().mean())
)

In [None]:
#creating cumulative rating without weights
products_df['cumulative_rating']=np.nan

In [None]:
#Filling cumulative rating column
# For every product, the value of a month is the mean of revrating_mean over all of
# that product's months up to and including it (running mean in date order).
# This replaces a nested loop that indexed `.at` with a boolean mask (`.at` only
# accepts a single row/column label) and re-filtered the whole frame for every row.
# The running mean equals the original "<= month" average only if each
# (asin, year_month) pair occurs once, so that is checked explicitly.
assert not products_df.duplicated(['asin', 'year_month']).any(), 'duplicate (asin, year_month) rows'
# The result is aligned back on the row index, so the row order is left unchanged.
products_df['cumulative_rating'] = (
  products_df
  .sort_values(['asin', 'year_month'])
  .groupby('asin')['revrating_mean']
  .transform(lambda monthly: monthly.expanding().mean())
)

In [None]:
#creating cumulative rating weighted by verified
products_df['cumulative_rating_verified']=np.nan

In [None]:
#Filling cumulative weighted by verification rating column
# For every product, the value of a month is the mean of reviewrating_meanverified
# over all of that product's months up to and including it (running mean in date order).
# This replaces a nested loop that indexed `.at` with a boolean mask (`.at` only
# accepts a single row/column label) and re-filtered the whole frame for every row.
# The running mean equals the original "<= month" average only if each
# (asin, year_month) pair occurs once, so that is checked explicitly.
assert not products_df.duplicated(['asin', 'year_month']).any(), 'duplicate (asin, year_month) rows'
# The result is aligned back on the row index, so the row order is left unchanged.
products_df['cumulative_rating_verified'] = (
  products_df
  .sort_values(['asin', 'year_month'])
  .groupby('asin')['reviewrating_meanverified']
  .transform(lambda monthly: monthly.expanding().mean())
)

In [None]:
products_df.head()

Unnamed: 0,asin,year_month,reviewvotes_num,reviewrating,reviewverifiedpurchase,country_name,mean_month_rank,median_month_rank,rolling_median_month_rank,target_rank,rank_change,revrating_weightmean,revrating_mean,reviewrating_meanverified,cumulative_weightrating,cumulative_rating,cumulative_rating_verified
0,B000052XB5,2018-01-01,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5]","[5.0, 4.0, 5.0, 1.0, 2.0, 5.0, 5.0, 5.0, 4.0, ...","[True, True, True, True, True, True, True, Tru...","[ United States, United States, United State...",0.097395,0.097395,"[0.0431385783, 0.0420857428, 0.0420857428, 0.0...",0.120613,0.023217,4.0,4.1875,4.1875,4.110947,4.263801,4.346088
1,B000052XB5,2019-01-01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[5.0, 5.0, 5.0, 1.0, 5.0, 1.0, 5.0, 5.0, 5.0, ...","[True, True, True, True, True, False, True, Tr...","[ United States, United States, United State...",0.08539,0.08539,"[0.0402410374, 0.039752259000000005, 0.0384139...",0.070835,-0.014555,0.0,4.1,4.444444,2.836406,4.141136,4.246595
2,B000052XB5,2020-01-01,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]","[5.0, 5.0, 5.0, 4.0, 5.0, 5.0, 1.0, 5.0, 5.0, ...","[True, True, True, True, True, True, True, Tru...","[ United States, United States, United State...",0.045969,0.045969,"[0.0251247952, 0.0244153516, 0.0244153516, 0.0...",0.061649,0.01568,1.0,4.0,3.9,2.381842,4.058516,4.11177
3,B000052XB5,2021-01-01,"[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[5.0, 5.0, 1.0, 5.0, 4.0, 5.0, 5.0, 1.0, 4.0, ...","[True, True, True, True, True, True, True, Fal...","[ United States, United States, United State...",0.023476,0.023476,"[0.0045402867, 0.0046973071, 0.005080543500000...",0.007223,-0.016253,1.0,3.315789,3.75,2.418894,3.981058,4.031309
4,B000052XB5,2018-02-01,"[0, 0, 0, 3, 1, 0, 0, 0, 1, 0, 0, 1, 0, 5, 1, 0]","[3.0, 5.0, 5.0, 5.0, 1.0, 2.0, 5.0, 5.0, 5.0, ...","[True, True, True, True, True, True, True, Tru...","[ United States, United States, United State...",0.120613,0.120613,"[0.05059336710714286, 0.05059336710714286, 0.0...",0.088528,-0.032085,4.5,4.0625,4.0625,4.159579,4.238638,4.31064


In [None]:
products_df.shape

(2495, 17)

In [None]:
# Split by product: entries 0..64 of `asins` are used for training, 65..69 for testing
aux_train_df = products_df.loc[products_df['asin'].isin(asins[:65])]
aux_test_df = products_df.loc[products_df['asin'].isin(asins[65:70])]
print(aux_test_df.shape, aux_train_df.shape)

(153, 17) (2342, 17)


In [None]:
#Final frames from which the regression inputs are built, for the train and test phases.
#The non-cumulative monthly ratings are kept next to the cumulative ones: a month with
#many positive reviews is probably a sign that the product is getting better.
finalx_train_df, finalx_test_df = (
  frame[['mean_month_rank','revrating_weightmean','revrating_mean','reviewrating_meanverified','cumulative_weightrating','cumulative_rating','cumulative_rating_verified']]
  for frame in (aux_train_df, aux_test_df)
)
#Target: the rank change, kept as a one-column frame
finaly_train_df, finaly_test_df = (
  frame[['rank_change']]
  for frame in (aux_train_df, aux_test_df)
)

In [None]:
#Need to rearrange the data in final_x to then use to_numpy to get numpy array for input to regression
# Each rolling-median array is spread over its own columns (0, 1, 2, ...) and placed
# in front of the scalar features. Building the frame from the list of arrays in one
# call gives the same columns as `.apply(pd.Series)` without constructing one Series
# per row; the original row index is kept so that concat aligns the two parts.
finalx_train_df=pd.concat([pd.DataFrame(aux_train_df['rolling_median_month_rank'].tolist(),index=aux_train_df.index),finalx_train_df],axis=1)
finalx_test_df=pd.concat([pd.DataFrame(aux_test_df['rolling_median_month_rank'].tolist(),index=aux_test_df.index),finalx_test_df],axis=1)

In [None]:
# Flatten the one-column target frame into a 1-D vector
y_train = finaly_train_df.to_numpy()[:, 0]
print(y_train.shape)
y_train

(2342,)


array([ 0.02321747, -0.01455517,  0.01568033, ..., -0.08686477,
       -0.01746772, -0.00677623])

In [None]:
# Flatten the one-column test target frame into a 1-D vector
y_test = finaly_test_df.to_numpy()[:, 0]
print(y_test.shape)

(153,)


In [None]:
# Training feature matrix as a plain numpy array (one row per row of finalx_train_df)
X_train=finalx_train_df.to_numpy()
print(X_train.shape)
X_train

(2342, 37)


array([[0.04313858, 0.04208574, 0.04208574, ..., 4.11094747, 4.26380052,
        4.34608844],
       [0.04024104, 0.03975226, 0.03841399, ..., 2.83640588, 4.14113614,
        4.24659539],
       [0.0251248 , 0.02441535, 0.02441535, ..., 2.38184161, 4.0585165 ,
        4.11176983],
       ...,
       [0.29721047, 0.29721047, 0.29721047, ..., 3.50837743, 4.70449383,
        4.70449383],
       [0.80482312, 0.80482312, 0.80482312, ..., 3.09002646, 4.6597165 ,
        4.66693872],
       [0.01648098, 0.01648098, 0.01648098, ..., 2.56430461, 4.57901828,
        4.59449447]])

In [None]:
# Test feature matrix as a plain numpy array (one row per row of finalx_test_df)
X_test=finalx_test_df.to_numpy()
print(X_test.shape)
X_test

(153, 37)


array([[0.47401537, 0.47401537, 0.46152876, ..., 2.36      , 3.5       ,
        3.3       ],
       [0.72153839, 0.72153839, 0.65330542, ..., 2.29230769, 3.84615385,
        3.07692308],
       [0.02927165, 0.02927165, 0.02807813, ..., 2.41117424, 3.97916667,
        3.5625    ],
       ...,
       [0.07976259, 0.07938752, 0.07938752, ..., 3.65300253, 4.68783069,
        4.40740741],
       [0.2510193 , 0.23869121, 0.23869121, ..., 2.64669122, 4.74876847,
        4.57471264],
       [0.48743887, 0.49656043, 0.48743887, ..., 2.27094571, 4.78373016,
        4.65740741]])

In [None]:
#Checking if no nan and no infinities in the data
# Every array fed to the models is checked (features and targets, train and test)
# and every count is printed. Previously the NaN count of y_train was computed in
# the middle of the cell and therefore never displayed, and the test arrays were
# not checked at all.
for _name, _arr in (('X_train', X_train), ('y_train', y_train), ('X_test', X_test), ('y_test', y_test)):
  _n_nan = np.count_nonzero(np.isnan(_arr))
  _n_inf = np.count_nonzero(np.isinf(_arr))
  print(f'{_name}: {_n_nan} NaN, {_n_inf} inf')
  # fail early with a clear message instead of inside the estimators
  assert _n_nan == 0 and _n_inf == 0, f'{_name} contains non-finite values'

0
0


0

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
regressor=LinearRegression()

In [None]:
regressor.fit(X_train,y_train)

LinearRegression()

In [None]:
#Making prediction
y_pred = regressor.predict(X_test)

In [None]:
from sklearn import metrics
# Errors of the linear model on the test set (the MSE is computed once and reused for the RMSE)
mse_linear = metrics.mean_squared_error(y_test, y_pred)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mse_linear)
print('Root Mean Squared Error:', np.sqrt(mse_linear))

Mean Absolute Error: 0.07748883029934157
Mean Squared Error: 0.015532399984786427
Root Mean Squared Error: 0.12462904952211754


In [None]:
metrics.r2_score(y_test, y_pred)

0.2221632573070046

### Gradient boosting (scikit-learn `GradientBoostingRegressor`)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

In [None]:
# NOTE: despite the variable name, this is scikit-learn's GradientBoostingRegressor,
# not a model from the XGBoost library.
# A fixed random_state makes the fitted model, and therefore the reported metrics,
# reproducible from one run to the next.
xgboost=GradientBoostingRegressor(random_state=0)
xgboost.fit(X_train,y_train)
y_pred_boost = xgboost.predict(X_test)

In [None]:
from sklearn import metrics
# Errors of the gradient boosting model on the test set (MSE computed once, reused for the RMSE)
mse_boost = metrics.mean_squared_error(y_test, y_pred_boost)
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_boost))
print('Mean Squared Error:', mse_boost)
print('Root Mean Squared Error:', np.sqrt(mse_boost))
metrics.r2_score(y_test, y_pred_boost)

Mean Absolute Error: 0.07280185456877979
Mean Squared Error: 0.01375802977236744
Root Mean Squared Error: 0.11729462806270133


0.3110207646922961

### Random Forest


In [None]:
# Grid over the number of trees and the maximum depth (None = unlimited depth).
# Results are appended in loop order (ntrees outer, tree_depth inner), so entry k of
# every list in `metric` belongs to ntrees[k // len(tree_depth)] and
# tree_depth[k % len(tree_depth)].
# NOTE(review): the configurations are compared on the test set itself, so the best
# score is an optimistic estimate; a separate validation split would be needed for
# an unbiased comparison.
ntrees = [50,100,150,200,500]
tree_depth = [3,5,8,None]
metric={'mae': [], 'mse': [], 'r2': []}
for tree in ntrees:
  for depth in tree_depth:
    # fixed random_state: the forests, and therefore the scores, are reproducible
    forest=RandomForestRegressor(max_depth=depth, n_estimators=tree, random_state=0)
    forest.fit(X_train,y_train)
    y_pred_forest = forest.predict(X_test)
    metric['mse'].append(metrics.mean_squared_error(y_test, y_pred_forest))
    metric['mae'].append(metrics.mean_absolute_error(y_test, y_pred_forest))
    metric['r2'].append(metrics.r2_score(y_test, y_pred_forest))

In [None]:
print(metric['r2'])

[0.22849691514999704, 0.2277066381672027, 0.30612140249298614, 0.31911607858965374, 0.21175627903021355, 0.25681289769333615, 0.28467463661918657, 0.30878950177892295, 0.22052081940442225, 0.2517192981449795, 0.26916441228998156, 0.32439196970348894, 0.21704549230816517, 0.25788440773997323, 0.26223934024327844, 0.3155025129579764, 0.21677208073686682, 0.258121994108203, 0.2932744266421913, 0.3215133682745208]


In [None]:
from sklearn import metrics
# NOTE: y_pred_forest is overwritten on every iteration of the grid loop, so the
# figures below describe only the last forest fitted there (the final
# ntrees / tree_depth combination), not the best-scoring configuration.
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred_forest))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred_forest))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred_forest)))
metrics.r2_score(y_test, y_pred_forest)

Mean Absolute Error: 0.0683894866506928
Mean Squared Error: 0.01336567922387425
Root Mean Squared Error: 0.11561003080993557


0.33066902722304936