In [1]:
import csv
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pickle
import glob

In [2]:
# Load the month-level review table and keep only the columns used further down.
# NOTE(review): read_pickle can execute arbitrary code - only load pickles you trust.
review_df = pd.read_pickle('./month_level_review.pickle')[
    ['asin', 'year_month', 'reviewvotes_num', 'reviewrating', 'reviewverifiedpurchase', 'country_name']
]

In [4]:
# Load the month-level rank table and keep only the columns used further down.
# NOTE(review): read_pickle can execute arbitrary code - only load pickles you trust.
bsr_df = pd.read_pickle('./month_level_rank.pickle')[
    [
        'asin', 'year_month',
        'mean_month_rank', 'median_month_rank',
        'mean_month_est_sales', 'median_month_est_sales',
        'rolling_median_month_rank',
        'mean_month_price', 'median_month_price',
    ]
]

In [11]:
# Preview the first rows of the rank table.
# NOTE(review): the saved output below already contains `target_est_sales`, a column
# that is only created in the later "processing rank df" cell - this cell was
# apparently executed out of order; re-check with Restart & Run All.
bsr_df.head()

Unnamed: 0,asin,year_month,mean_month_rank,median_month_rank,mean_month_est_sales,median_month_est_sales,rolling_median_month_rank,mean_month_price,median_month_price,target_est_sales
0,B000052XB5,2017-08-01,0.088999,0.088999,389.705645,396.0,"[0.03545335800240161, 0.03545335800240161, 0.0...",15.29,15.29,397.125
1,B000052XB5,2017-09-01,0.087918,0.087918,395.304167,397.125,"[0.04527607163235586, 0.04999352908399204, 0.0...",13.556,13.46,448.5
2,B000052XB5,2017-10-01,0.050594,0.050594,452.091935,448.5,"[0.02781543396350373, 0.026825406625974667, 0....",11.97,11.97,439.625
3,B000052XB5,2017-11-01,0.052829,0.052829,449.233333,439.625,"[0.017099642677690898, 0.017099642677690898, 0...",11.968667,11.97,392.25
4,B000052XB5,2017-12-01,0.084528,0.084528,396.672581,392.25,"[0.031774022534299666, 0.031351070408713426, 0...",11.040968,11.96,379.75


In [5]:
# Prepare the rank table for the merge: parse the month key into datetimes and
# order the rows by product, then chronologically within each product.
bsr_df = (
    bsr_df
    .assign(year_month=pd.to_datetime(bsr_df['year_month']))
    .sort_values(['asin', 'year_month'])
)
# Target = the same product's median estimated sales taken from its next row.
# NOTE(review): "next row" is the next month only if no month is missing - confirm.
bsr_df['target_est_sales'] = bsr_df.groupby(['asin'])['median_month_est_sales'].shift(-1)

# Prepare the review table for the merge: parse its month key the same way.
review_df = review_df.assign(year_month=pd.to_datetime(review_df['year_month']))

In [6]:
# Standard deviation of the target column, displayed as a quick scale reference.
bsr_df['target_est_sales'].std()

188.13104996238476

In [9]:
# Join reviews with ranks on product and month (inner join, the merge default),
# then discard every row that has a missing value in any column.
products_df = review_df.merge(bsr_df, on=["asin", "year_month"]).dropna()

In [11]:
def process_moving_median_months(row, window=30, fallback=0.24191888901999709):
  """Return the row's rolling median ranks as a clean, fixed-length float array.

  Steps:
    1. NaN entries are replaced by the mean of the non-NaN entries.
    2. Sequences shorter than `window` are left-padded with that same mean.
    3. Longer sequences keep only their last `window` values.

  If the sequence cannot be read as numbers, is empty, or contains no usable
  value at all, an array filled with `fallback` is returned, so the result
  never contains NaN.

  Args:
    row: mapping / pandas row exposing the 'rolling_median_month_rank' field.
    window: length of the returned array (default 30).
    fallback: fill value used when nothing usable is available.
      NOTE(review): presumably a dataset-wide average rank - confirm its origin.

  Returns:
    1-D numpy float array of length `window`.
  """
  try:
    rolling_median = np.asarray(row['rolling_median_month_rank'], dtype=float).ravel()
  except (TypeError, ValueError):
    # Non-numeric or ragged content: nothing sensible to impute from.
    return np.full(window, fallback)

  usable = ~np.isnan(rolling_median)
  if not usable.any():
    # Empty or all-NaN input: a mean would itself be NaN, so use the fallback.
    return np.full(window, fallback)

  # impute the NaNs
  impute_value = rolling_median[usable].mean()
  rolling_median = np.where(usable, rolling_median, impute_value)

  # make sure all are of length `window` (prioritising the last values)
  if len(rolling_median) < window:
    padding = np.full(window - len(rolling_median), impute_value)
    rolling_median = np.concatenate([padding, rolling_median])
  else:
    rolling_median = rolling_median[-window:]
  return rolling_median

# Replace each row's raw rolling-median sequence with its cleaned version.
products_df = products_df.assign(
    rolling_median_month_rank=products_df.apply(process_moving_median_months, axis='columns')
)

In [None]:
# Distinct product identifiers, in order of first appearance in the merged frame.
asins = pd.unique(products_df["asin"])

In [None]:
# The source tables are no longer needed once merged; drop the names to free RAM.
del bsr_df, review_df

In [None]:
def num_rev(x):
  """Return how many entries the row's 'reviewrating' field holds."""
  ratings = x['reviewrating']
  return len(ratings)
# Creating column of number of reviews: one count per row, taken as the length
# of that row's 'reviewrating' collection. Mapping `len` over the single column
# avoids building a full row object for every record (as apply(axis=1) does) and
# also behaves sensibly on an empty frame.
products_df['num_reviews_month'] = products_df['reviewrating'].map(len)

In [None]:
#Creating cumulative number of reviews
products_df['cumulative_num_reviews']=0
cumulative_numrev_list=[]

In [None]:
# Filling the cumulative number-of-reviews column.
# For every row, gather this product's monthly review counts for all of its rows
# whose month is <= the row's month, and store their sum.
#
# Rows are addressed by integer position instead of `.at[mask, col]`: `.at` is a
# scalar accessor and must not be given a boolean mask. Working by position also
# puts the history list in the frame's own row order, which is what the later
# positional assignment of `cumulative_numrev_list` to a column requires.
month_values = products_df['year_month'].to_numpy()
count_values = products_df['num_reviews_month'].to_numpy()
history_by_row = [None] * len(products_df)
totals_by_row = np.zeros(len(products_df), dtype=int)

# groupby(...).indices maps each asin to the integer positions of its rows.
for row_positions in products_df.groupby('asin').indices.values():
  row_positions = np.sort(row_positions)  # keep the original row order inside the product
  group_months = month_values[row_positions]
  group_counts = count_values[row_positions]
  for position, ym in zip(row_positions, group_months):
    aux = group_counts[group_months <= ym]
    history_by_row[position] = np.array(aux)
    totals_by_row[position] = np.sum(aux, dtype=int)

products_df['cumulative_num_reviews'] = totals_by_row
# Refill in place so that re-running this cell never duplicates entries.
cumulative_numrev_list[:] = history_by_row

In [None]:
# Store the per-row histories as a column. A plain list is assigned positionally
# (entry i -> row i), so it must hold exactly one entry per row, in row order.
products_df['cumulative_numrev_list']=cumulative_numrev_list

In [None]:
#Creating weighted mean review ratings based on votes
def weight_mean(x):
  """Average the row's 'reviewrating' values, weighted by 'reviewvotes_num'.

  Returns 0 when the votes sum to zero, since no weighted average is defined then.
  """
  votes = x['reviewvotes_num']
  if sum(votes) != 0:
    return np.average(x['reviewrating'], weights=votes)
  return 0

# Vote-weighted mean rating of each row's reviews.
products_df['revrating_weightmean'] = products_df.apply(weight_mean, axis='columns')

In [None]:
# Plain (unweighted) mean rating of each row's reviews.
products_df['revrating_mean'] = products_df['reviewrating'].map(
    lambda ratings: np.mean(ratings)
)

In [None]:
#Creating weighted mean review ratings based on verification of review
def weight_mean_verified(x):
  """Average the row's 'reviewrating' values, weighted by 'reviewverifiedpurchase'.

  Returns 0 when the verification weights sum to zero.
  """
  verified = x['reviewverifiedpurchase']
  if sum(verified) != 0:
    return np.average(x['reviewrating'], weights=verified)
  return 0

# Verification-weighted mean rating of each row's reviews.
products_df['reviewrating_meanverified'] = products_df.apply(weight_mean_verified, axis='columns')

In [None]:
#creating cumulative weighted ratings column
products_df['cumulative_weightrating']=np.nan
cumulative_weightrating_list=[]

In [None]:
#Filling cumulative weighted rating column
for asin in asins:
  yearmonthlist=products_df[products_df["asin"]==asin]['year_month'].tolist()
  for i,ym in enumerate(yearmonthlist):
    aux=products_df[(products_df['asin']==asin) & (products_df['year_month']<=ym)]['revrating_weightmean'].tolist()
    cumulative_weightrating_list.append(np.array(aux))
    products_df.at[(products_df["asin"]==asin)&(products_df["year_month"]==ym),'cumulative_weightrating']=np.average(aux)

In [None]:
# Store the per-row histories as a column. A plain list is assigned positionally
# (entry i -> row i), so it must hold exactly one entry per row, in row order.
products_df['cumulative_weightrating_list']=cumulative_weightrating_list

In [None]:
#creating cumulative rating without weights
products_df['cumulative_rating']=np.nan
cumulative_rating_list=[]

In [None]:
#Filling cumulative rating column
for asin in asins:
  yearmonthlist=products_df[products_df["asin"]==asin]['year_month'].tolist()
  for i,ym in enumerate(yearmonthlist):
    aux=products_df[(products_df['asin']==asin) & (products_df['year_month']<=ym)]['revrating_mean'].tolist()
    cumulative_rating_list.append(aux)
    products_df.at[(products_df["asin"]==asin)&(products_df["year_month"]==ym),'cumulative_rating']=np.average(aux)

products_df['cumulative_rating_list']=cumulative_rating_list

In [None]:
#creating cumulative rating weighted by verified
products_df['cumulative_rating_verified']=np.nan
cumulative_rating_verified_list=[]

In [None]:
#Filling cumulative weighted by verification rating column
for asin in asins:
  yearmonthlist=products_df[products_df["asin"]==asin]['year_month'].tolist()
  for i,ym in enumerate(yearmonthlist):
    aux=products_df[(products_df['asin']==asin) & (products_df['year_month']<=ym)]['reviewrating_meanverified'].tolist()
    cumulative_rating_verified_list.append(aux)
    products_df.at[(products_df["asin"]==asin)&(products_df["year_month"]==ym),'cumulative_rating_verified']=np.average(aux)

products_df['cumulative_rating_verified_list']=cumulative_rating_verified_list

In [None]:
# Persist the merged frame, including the cumulative feature columns, to disk.
products_df.to_pickle(path='./merged_bsr_review_cumulative.pickle')