In [1]:
!pip install scikit-surprise
!pip install xgboost
!pip install category_encoders



## Importing Libraries

In [53]:
import numpy as np
import pandas as pd
from sklearn.metrics import r2_score,mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from math import sqrt
from matplotlib.pylab import rcParams
import category_encoders as ce
import datetime
from sklearn.metrics import classification_report
%matplotlib inline
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise import KNNWithMeans
from surprise import accuracy
import os
from surprise.model_selection import train_test_split as tts

# Data import and EDA

In [4]:
rating = pd.read_csv('C:/Users/Saad/Desktop/New folder (2)/Amazon Product Reviews/ratings_Electronics.csv',header=None)

In [5]:
rating.rename(columns={0:'user_id',1:'prod_id',2:'rating',3:'tstamp'}, inplace = True)


In [6]:
rating.describe

<bound method NDFrame.describe of                 user_id     prod_id  rating      tstamp
0         AKM1MP6P0OYPR  0132793040     5.0  1365811200
1        A2CX7LUOHB2NDG  0321732944     5.0  1341100800
2        A2NWSAGRHCP8N5  0439886341     1.0  1367193600
3        A2WNBOD3WNDNKT  0439886341     3.0  1374451200
4        A1GI0U4ZRJA8WN  0439886341     1.0  1334707200
...                 ...         ...     ...         ...
7824477  A2YZI3C9MOHC0L  BT008UKTMW     5.0  1396569600
7824478  A322MDK0M89RHN  BT008UKTMW     5.0  1313366400
7824479  A1MH90R0ADMIK0  BT008UKTMW     4.0  1404172800
7824480  A10M2KEFPEQDHN  BT008UKTMW     4.0  1297555200
7824481  A2G81TMIOIDEQQ  BT008V9J9U     5.0  1312675200

[7824482 rows x 4 columns]>

In [7]:
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 4 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  object 
 1   prod_id  object 
 2   rating   float64
 3   tstamp   int64  
dtypes: float64(1), int64(1), object(2)
memory usage: 238.8+ MB


### Converting timestamp to datetime format for use in modeling

In [8]:
rating['tstamp_new'] = pd.to_datetime(rating['tstamp'], utc=True, unit='s')
rating.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7824482 entries, 0 to 7824481
Data columns (total 5 columns):
 #   Column      Dtype              
---  ------      -----              
 0   user_id     object             
 1   prod_id     object             
 2   rating      float64            
 3   tstamp      int64              
 4   tstamp_new  datetime64[ns, UTC]
dtypes: datetime64[ns, UTC](1), float64(1), int64(1), object(2)
memory usage: 298.5+ MB


In [111]:
print(rating['tstamp_new'].min())
print(rating['tstamp_new'].max())

1998-12-04 00:00:00+00:00
2014-07-23 00:00:00+00:00


In [113]:
print(rating['user_id'].nunique())
print(rating['prod_id'].nunique())

4201696
476002


# Models for predicting product success and failure based on ratings in first 30 days
## Feature Generation


In [9]:
prod_grp = rating.groupby('prod_id').agg(date_min=('tstamp_new','min'),
                                         date_max=('tstamp_new','max'),
                                         uniq_cnt=('tstamp_new',pd.Series.nunique),
                                         tot_rating=('tstamp_new','count'),
                                         rtg_min=('rating','min'),
                                         rtg_max=('rating','max'),
                                         rtg_uniq=('rating',pd.Series.nunique),
                                         rtg_avg=('rating','mean')
                                        )

In [10]:
prod_grp['prod_id']=prod_grp.index
prod_grp.index = prod_grp.index.rename('index')

In [11]:
prod_grp['date_diff']=(prod_grp['date_max']-prod_grp['date_min'])
prod_grp['date_diff'] = prod_grp['date_diff']/ np.timedelta64(1, 'D')
prod_grp

Unnamed: 0_level_0,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,prod_id,date_diff
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0132793040,2013-04-13 00:00:00+00:00,2013-04-13 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0132793040,0.0
0321732944,2012-07-01 00:00:00+00:00,2012-07-01 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0321732944,0.0
0439886341,2012-04-18 00:00:00+00:00,2013-07-22 00:00:00+00:00,3,3,1.0,3.0,2,1.666667,0439886341,460.0
0511189877,2014-03-22 00:00:00+00:00,2014-05-29 00:00:00+00:00,5,6,2.0,5.0,2,4.500000,0511189877,68.0
0528881469,2010-06-21 00:00:00+00:00,2014-07-18 00:00:00+00:00,26,27,1.0,5.0,5,2.851852,0528881469,1488.0
...,...,...,...,...,...,...,...,...,...,...
BT008G3W52,2012-12-17 00:00:00+00:00,2012-12-17 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,BT008G3W52,0.0
BT008SXQ4C,2012-01-09 00:00:00+00:00,2012-01-09 00:00:00+00:00,1,1,1.0,1.0,1,1.000000,BT008SXQ4C,0.0
BT008T2BGK,2011-11-10 00:00:00+00:00,2011-11-10 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,BT008T2BGK,0.0
BT008UKTMW,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,BT008UKTMW,1299.0


In [114]:
len(prod_grp[prod_grp['rtg_avg']>=4.5])

186897

In [13]:
data_prep_1=rating.merge(prod_grp, on=["prod_id"], how='left').sort_values(by=["prod_id","tstamp_new"]).reset_index(drop=True)

In [14]:
data_prep_1['rating_5']=np.where(data_prep_1['rating']==5.0,1,0)
data_prep_1['rating_1']=np.where(data_prep_1['rating']==1.0,1,0)
data_prep_1

Unnamed: 0,user_id,prod_id,rating,tstamp,tstamp_new,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,date_diff,rating_5,rating_1
0,AKM1MP6P0OYPR,0132793040,5.0,1365811200,2013-04-13 00:00:00+00:00,2013-04-13 00:00:00+00:00,2013-04-13 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0.0,1,0
1,A2CX7LUOHB2NDG,0321732944,5.0,1341100800,2012-07-01 00:00:00+00:00,2012-07-01 00:00:00+00:00,2012-07-01 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0.0,1,0
2,A1GI0U4ZRJA8WN,0439886341,1.0,1334707200,2012-04-18 00:00:00+00:00,2012-04-18 00:00:00+00:00,2013-07-22 00:00:00+00:00,3,3,1.0,3.0,2,1.666667,460.0,0,1
3,A2NWSAGRHCP8N5,0439886341,1.0,1367193600,2013-04-29 00:00:00+00:00,2012-04-18 00:00:00+00:00,2013-07-22 00:00:00+00:00,3,3,1.0,3.0,2,1.666667,460.0,0,1
4,A2WNBOD3WNDNKT,0439886341,3.0,1374451200,2013-07-22 00:00:00+00:00,2012-04-18 00:00:00+00:00,2013-07-22 00:00:00+00:00,3,3,1.0,3.0,2,1.666667,460.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7824477,A7ETK9SC1HAMP,BT008UKTMW,5.0,1397088000,2014-04-10 00:00:00+00:00,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,1299.0,1,0
7824478,A84WRQR47OOKR,BT008UKTMW,1.0,1401753600,2014-06-03 00:00:00+00:00,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,1299.0,0,1
7824479,A1MH90R0ADMIK0,BT008UKTMW,4.0,1404172800,2014-07-01 00:00:00+00:00,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,1299.0,0,0
7824480,A5FUA9HEV2O42,BT008UKTMW,5.0,1405987200,2014-07-22 00:00:00+00:00,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,1299.0,1,0


In [15]:
prod_grp_7_days = data_prep_1[data_prep_1['tstamp_new']<=(data_prep_1['date_min']+datetime.timedelta(days=7))]


prod_grp_7_days_p1 = prod_grp_7_days.groupby(['prod_id','tstamp_new']).agg(rating_5_7days=('rating_5','sum'),
                                         rating_1_7days=('rating_1','sum'),
                                         tot_rtg_7days=('rating','count'),
                                         sum_rtg_7days=('rating','sum')
                                        )

prod_grp_7_days_p2 = prod_grp_7_days_p1.groupby('prod_id').agg(rating_5_7days=('rating_5_7days','sum'),
                                         rating_1_7days=('rating_1_7days','sum'),
                                         tot_rtg_7days=('tot_rtg_7days','sum'),
                                         sum_rtg_7days=('sum_rtg_7days','sum'),
                                         max_num_rtg_7days=('tot_rtg_7days','max')
                                        )


prod_grp_7_days_p2['rtg_avg_7days']=prod_grp_7_days_p2['sum_rtg_7days']/prod_grp_7_days_p2['tot_rtg_7days']
prod_grp_7_days_p2

Unnamed: 0_level_0,rating_5_7days,rating_1_7days,tot_rtg_7days,sum_rtg_7days,max_num_rtg_7days,rtg_avg_7days
prod_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0132793040,1,0,1,5.0,1,5.0
0321732944,1,0,1,5.0,1,5.0
0439886341,0,1,1,1.0,1,1.0
0511189877,3,0,3,15.0,1,5.0
0528881469,0,0,1,2.0,1,2.0
...,...,...,...,...,...,...
BT008G3W52,1,0,1,5.0,1,5.0
BT008SXQ4C,0,1,1,1.0,1,1.0
BT008T2BGK,1,0,1,5.0,1,5.0
BT008UKTMW,0,0,1,3.0,1,3.0


In [16]:
prod_grp_30_days = data_prep_1[data_prep_1['tstamp_new']<=(data_prep_1['date_min']+datetime.timedelta(days=30))]


prod_grp_30_days_p1 = prod_grp_30_days.groupby(['prod_id','tstamp_new']).agg(rating_5_30days=('rating_5','sum'),
                                         rating_1_30days=('rating_1','sum'),
                                         tot_rtg_30days=('rating','count'),
                                         sum_rtg_30days=('rating','sum')
                                        )

prod_grp_30_days_p2 = prod_grp_30_days_p1.groupby('prod_id').agg(rating_5_30days=('rating_5_30days','sum'),
                                         rating_1_30days=('rating_1_30days','sum'),
                                         tot_rtg_30days=('tot_rtg_30days','sum'),
                                         sum_rtg_30days=('sum_rtg_30days','sum'),
                                         max_num_rtg_30days=('tot_rtg_30days','max')
                                        )


prod_grp_30_days_p2['rtg_avg_30days']=prod_grp_30_days_p2['sum_rtg_30days']/prod_grp_30_days_p2['tot_rtg_30days']
prod_grp_30_days_p2

Unnamed: 0_level_0,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days
prod_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0132793040,1,0,1,5.0,1,5.0
0321732944,1,0,1,5.0,1,5.0
0439886341,0,1,1,1.0,1,1.0
0511189877,4,0,5,22.0,2,4.4
0528881469,0,0,2,6.0,1,3.0
...,...,...,...,...,...,...
BT008G3W52,1,0,1,5.0,1,5.0
BT008SXQ4C,0,1,1,1.0,1,1.0
BT008T2BGK,1,0,1,5.0,1,5.0
BT008UKTMW,0,0,1,3.0,1,3.0


## Trend Variables
We have created features for product performance in the first 7 days, and first 30 days since the first review.
The objective of these features is to understand if the model rating has improved or declined over the month, and get an understanding of how the product might perform beyond the first month

In [17]:
prod_grp_fin=prod_grp.merge(prod_grp_7_days_p2, on=["prod_id"], how='left').merge(prod_grp_30_days_p2, on=["prod_id"], how='left').reset_index(drop=True)

In [18]:
pot_good_prod = np.where((prod_grp_fin['rtg_avg_30days']>4.0)& (prod_grp_fin['tot_rtg_30days']>=2))
print("Potentially good prod: "+str(len(prod_grp_fin.loc[pot_good_prod])))

success_prod = np.where((prod_grp_fin['rtg_avg']>4.0) & (prod_grp_fin['rtg_avg_30days']>4.0)&(prod_grp_fin['tot_rating']>50))
print("Successful prod: "+str(len(prod_grp_fin.loc[success_prod])))

print("Fraction of potentially good products that succeeded after a month: ")
print(len(prod_grp_fin.loc[success_prod])/len(prod_grp_fin.loc[pot_good_prod]))


Potentially good prod: 47228
Successful prod: 10698
Fraction of potentially good products that succeeded after a month: 
0.2265181671889557


In [19]:
pot_bad_prod = np.where((prod_grp_fin['rtg_avg_30days']<=2.5)& (prod_grp_fin['tot_rtg_30days']>=2))
print("Potentially good prod: "+str(len(prod_grp_fin.loc[pot_bad_prod])))

hid_gem_prod = np.where((prod_grp_fin['rtg_avg']>4.0) & (prod_grp_fin['rtg_avg_30days']<=2.5)&(prod_grp_fin['tot_rating']>20))
print("Hidden Gem prod: "+str(len(prod_grp_fin.loc[hid_gem_prod])))

bad_prod = np.where((prod_grp_fin['rtg_avg']<=2.5) & (prod_grp_fin['rtg_avg_30days']<=2.5)&(prod_grp_fin['tot_rating']>10))
print("Bad prod: "+str(len(prod_grp_fin.loc[bad_prod])))

print("Fraction of potentially bad products that succeeded after a month: ")
print(len(prod_grp_fin.loc[hid_gem_prod])/len(prod_grp_fin.loc[pot_bad_prod]))

print("Fraction of potentially good products that failed after a month: ")
print(len(prod_grp_fin.loc[bad_prod])/len(prod_grp_fin.loc[pot_bad_prod]))


Potentially good prod: 11314
Hidden Gem prod: 921
Bad prod: 1439
Fraction of potentially bad products that succeeded after a month: 
0.08140357079724235
Fraction of potentially good products that failed after a month: 
0.12718755524129396


## Desired Outcome
The online platform will likely benefit from the following actions -
1. Identifying which products are likely to become successful, and promote it for higher overall sales on platform
2. Identify bad products which tarnish the platform's reputation and take down product before bad customer experience

In [20]:
prod_grp_pred1=prod_grp_30_days_p2.merge(prod_grp_7_days_p2, on=["prod_id"], how='left')

In [21]:
prod_grp_pred1['prod_id']=prod_grp_pred1.index
prod_grp_pred1.index = prod_grp_pred1.index.rename('index')
prod_grp_pred1

Unnamed: 0_level_0,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,rating_5_7days,rating_1_7days,tot_rtg_7days,sum_rtg_7days,max_num_rtg_7days,rtg_avg_7days,prod_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0132793040,1,0,1,5.0,1,5.0,1,0,1,5.0,1,5.0,0132793040
0321732944,1,0,1,5.0,1,5.0,1,0,1,5.0,1,5.0,0321732944
0439886341,0,1,1,1.0,1,1.0,0,1,1,1.0,1,1.0,0439886341
0511189877,4,0,5,22.0,2,4.4,3,0,3,15.0,1,5.0,0511189877
0528881469,0,0,2,6.0,1,3.0,0,0,1,2.0,1,2.0,0528881469
...,...,...,...,...,...,...,...,...,...,...,...,...,...
BT008G3W52,1,0,1,5.0,1,5.0,1,0,1,5.0,1,5.0,BT008G3W52
BT008SXQ4C,0,1,1,1.0,1,1.0,0,1,1,1.0,1,1.0,BT008SXQ4C
BT008T2BGK,1,0,1,5.0,1,5.0,1,0,1,5.0,1,5.0,BT008T2BGK
BT008UKTMW,0,0,1,3.0,1,3.0,0,0,1,3.0,1,3.0,BT008UKTMW


## Product Classification
A product is classified as successful if it has an average rating of >4.0 and a minimum of 50 total ratings
Similarly, a product is classified as a bad/failed product if it has an average rating of <=2.5 and and a minimum of 10 total ratings

I have also considered the average rating in the first 30 days so as to arrive at a list of potential good/bad products

In [22]:
prod_grp_fin['good_prod_flag']=np.where((prod_grp_fin['rtg_avg']>4.0)&(prod_grp_fin['rtg_avg_30days']>4.0)&(prod_grp_fin['tot_rating']>=50),1,0)
prod_grp_fin['bad_prod_flag']=np.where((prod_grp_fin['rtg_avg']<=2.5)&(prod_grp_fin['rtg_avg_30days']<=2.5)&(prod_grp_fin['tot_rating']>=10),1,0)


In [23]:
prod_grp_pred_target_var = prod_grp_fin[{'prod_id','bad_prod_flag','good_prod_flag'}]

  prod_grp_pred_target_var = prod_grp_fin[{'prod_id','bad_prod_flag','good_prod_flag'}]


In [24]:
prod_grp_pred_target_var

Unnamed: 0,bad_prod_flag,good_prod_flag,prod_id
0,0,0,0132793040
1,0,0,0321732944
2,0,0,0439886341
3,0,0,0511189877
4,0,0,0528881469
...,...,...,...
475997,0,0,BT008G3W52
475998,0,0,BT008SXQ4C
475999,0,0,BT008T2BGK
476000,0,0,BT008UKTMW


Creating a list of potential good and bad products for model training and testing

In [25]:
pred_good = prod_grp_pred1.loc[(prod_grp_pred1['rtg_avg_30days']>4.0)&(prod_grp_pred1['tot_rtg_30days']>=2)]
pred_good


Unnamed: 0_level_0,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,rating_5_7days,rating_1_7days,tot_rtg_7days,sum_rtg_7days,max_num_rtg_7days,rtg_avg_7days,prod_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0511189877,4,0,5,22.0,2,4.400000,3,0,3,15.0,1,5.000000,0511189877
0594033926,3,0,4,19.0,1,4.750000,1,0,2,9.0,1,4.500000,0594033926
0594451647,2,0,2,10.0,1,5.000000,2,0,2,10.0,1,5.000000,0594451647
0594511488,2,0,2,10.0,1,5.000000,1,0,1,5.0,1,5.000000,0594511488
0777700018,2,0,2,10.0,1,5.000000,2,0,2,10.0,1,5.000000,0777700018
...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00LGQ6HL8,6,0,6,30.0,2,5.000000,6,0,6,30.0,2,5.000000,B00LGQ6HL8
B00LH52WT4,3,0,3,15.0,2,5.000000,3,0,3,15.0,2,5.000000,B00LH52WT4
B00LI4ZZO8,2,0,3,14.0,1,4.666667,2,0,3,14.0,1,4.666667,B00LI4ZZO8
B00LJO86NE,2,0,2,10.0,1,5.000000,1,0,1,5.0,1,5.000000,B00LJO86NE


In [26]:
pred_bad = prod_grp_pred1.loc[(prod_grp_pred1['rtg_avg_30days']<=2.5)&(prod_grp_pred1['tot_rtg_30days']>=2)]
pred_bad


Unnamed: 0_level_0,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,rating_5_7days,rating_1_7days,tot_rtg_7days,sum_rtg_7days,max_num_rtg_7days,rtg_avg_7days,prod_id
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0594012015,1,3,4,8.0,1,2.000000,1,3,4,8.0,1,2.000000,0594012015
0899336795,0,1,3,7.0,1,2.333333,0,0,1,2.0,1,2.000000,0899336795
1400699894,0,1,2,4.0,1,2.000000,0,0,1,3.0,1,3.000000,1400699894
7227012573,0,0,2,5.0,1,2.500000,0,0,1,2.0,1,2.000000,7227012573
7538637400,0,1,2,4.0,1,2.000000,0,0,1,3.0,1,3.000000,7538637400
...,...,...,...,...,...,...,...,...,...,...,...,...,...
B00KYIRNWG,0,0,3,7.0,1,2.333333,0,0,1,2.0,1,2.000000,B00KYIRNWG
B00KZHP102,1,3,6,14.0,1,2.333333,0,1,1,1.0,1,1.000000,B00KZHP102
B00KZIZRIM,0,2,2,2.0,2,1.000000,0,2,2,2.0,2,1.000000,B00KZIZRIM
B00LBZ1Z7K,0,0,2,5.0,1,2.500000,0,0,2,5.0,1,2.500000,B00LBZ1Z7K


## Target Variables
We have now built two datasets with their respective target variables

In [27]:
pred_good_fin=pred_good.merge(prod_grp_pred_target_var, on=["prod_id"], how='left')

pred_bad_fin=pred_bad.merge(prod_grp_pred_target_var, on=["prod_id"], how='left')

In [28]:
pred_good_fin

Unnamed: 0,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,rating_5_7days,rating_1_7days,tot_rtg_7days,sum_rtg_7days,max_num_rtg_7days,rtg_avg_7days,prod_id,bad_prod_flag,good_prod_flag
0,4,0,5,22.0,2,4.400000,3,0,3,15.0,1,5.000000,0511189877,0,0
1,3,0,4,19.0,1,4.750000,1,0,2,9.0,1,4.500000,0594033926,0,0
2,2,0,2,10.0,1,5.000000,2,0,2,10.0,1,5.000000,0594451647,0,0
3,2,0,2,10.0,1,5.000000,1,0,1,5.0,1,5.000000,0594511488,0,0
4,2,0,2,10.0,1,5.000000,2,0,2,10.0,1,5.000000,0777700018,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47223,6,0,6,30.0,2,5.000000,6,0,6,30.0,2,5.000000,B00LGQ6HL8,0,0
47224,3,0,3,15.0,2,5.000000,3,0,3,15.0,2,5.000000,B00LH52WT4,0,0
47225,2,0,3,14.0,1,4.666667,2,0,3,14.0,1,4.666667,B00LI4ZZO8,0,0
47226,2,0,2,10.0,1,5.000000,1,0,1,5.0,1,5.000000,B00LJO86NE,0,0


# Model training for predicting successful and bad products

In [29]:
x_train,x_test,y_train,y_test = train_test_split(pred_good_fin.drop(['prod_id','good_prod_flag','bad_prod_flag'],axis=1),pred_good_fin['good_prod_flag'],test_size=0.2,random_state=75)

x_train_bad,x_test_bad,y_train_bad,y_test_bad = train_test_split(pred_bad_fin.drop(['prod_id','good_prod_flag','bad_prod_flag'],axis=1),pred_bad_fin['bad_prod_flag'],test_size=0.2,random_state=75)

In [30]:
xg = XGBClassifier()
xg2 = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
xg.fit(x_train, y_train)
xg2.fit(x_train, y_train)

y_pred_xgb = xg.predict(x_test)
y_pred_xgb2 = xg2.predict(x_test)

In [31]:
print(classification_report(y_test, y_pred_xgb))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8053
           1       0.56      0.17      0.26      1393

    accuracy                           0.86      9446
   macro avg       0.72      0.57      0.59      9446
weighted avg       0.83      0.86      0.82      9446



In [32]:
print(classification_report(y_test, y_pred_xgb2))

              precision    recall  f1-score   support

           0       0.87      0.98      0.92      8053
           1       0.56      0.18      0.27      1393

    accuracy                           0.86      9446
   macro avg       0.72      0.58      0.60      9446
weighted avg       0.83      0.86      0.83      9446



In [33]:
len(pred_good_fin[pred_good_fin['good_prod_flag']==1])/len(pred_good_fin)

0.14840772423138815

We have a precision of 0.56 when predicting successful products. The actual event rate is 0.148 -> implying that we have achieved a predicting power of more than 3.5 times over the event rate

In [34]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(x_train,y_train)
y_pred_rf = rf.predict(x_test)

print(classification_report(y_test, y_pred_rf))

              precision    recall  f1-score   support

           0       0.88      0.97      0.92      8053
           1       0.53      0.20      0.29      1393

    accuracy                           0.86      9446
   macro avg       0.70      0.59      0.61      9446
weighted avg       0.82      0.86      0.83      9446



Ability to identify successful products is increased by a magnitude of ~3.5 times

More than half the recomendations are likely to become successful products with over 50 total reviews and average rating > 4.0

## Model Training and testing for bad product classification

In [35]:
xg_bad = XGBClassifier()
xg2_bad = xgb.XGBClassifier(n_estimators=100, max_depth=8, learning_rate=0.1, subsample=0.5)
xg_bad.fit(x_train_bad, y_train_bad)
xg2_bad.fit(x_train_bad, y_train_bad)

y_pred_xgb_bad = xg_bad.predict(x_test_bad)
y_pred_xgb2_bad = xg2_bad.predict(x_test_bad)

print(classification_report(y_test_bad, y_pred_xgb_bad))
print(classification_report(y_test_bad, y_pred_xgb2_bad))


              precision    recall  f1-score   support

           0       0.92      0.99      0.95      2067
           1       0.48      0.10      0.17       196

    accuracy                           0.91      2263
   macro avg       0.70      0.55      0.56      2263
weighted avg       0.88      0.91      0.89      2263

              precision    recall  f1-score   support

           0       0.92      0.99      0.95      2067
           1       0.53      0.10      0.17       196

    accuracy                           0.91      2263
   macro avg       0.72      0.55      0.56      2263
weighted avg       0.89      0.91      0.89      2263



In [36]:
rf_bad = RandomForestClassifier(n_estimators=1000,min_samples_leaf=5,
                               min_samples_split=10,max_depth=20)
rf_bad.fit(x_train_bad,y_train_bad)
y_pred_rf_bad = rf.predict(x_test_bad)

print(classification_report(y_test_bad, y_pred_rf_bad))

print(len(pred_bad_fin[pred_bad_fin['bad_prod_flag']==1])/len(pred_bad_fin))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2067
           1       0.50      0.01      0.01       196

    accuracy                           0.91      2263
   macro avg       0.71      0.50      0.48      2263
weighted avg       0.88      0.91      0.87      2263

0.08202227328972954


We are able to predict bad products with over a 50% accuracy
However, the best model is able to capture only 10% of all the bad products

While the accuracy for the Random Forest model is 100%, the capture rate is VERY low, making it a poor model this use case. The 2nd XGBoost model has the best combination of precision and recall, thus finalising the model for prediction

# Product Recommendation System - Generic

This is a rule based product reommendation system.
We assign weights to each product rating based on the total number of ratings, and average ratings of the top 5 percentile products
This is a system similar to the methodology used by IMDB to rate its top movies

In [37]:
prod_grp_fin

Unnamed: 0,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,prod_id,date_diff,...,max_num_rtg_7days,rtg_avg_7days,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,good_prod_flag,bad_prod_flag
0,2013-04-13 00:00:00+00:00,2013-04-13 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0132793040,0.0,...,1,5.0,1,0,1,5.0,1,5.0,0,0
1,2012-07-01 00:00:00+00:00,2012-07-01 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,0321732944,0.0,...,1,5.0,1,0,1,5.0,1,5.0,0,0
2,2012-04-18 00:00:00+00:00,2013-07-22 00:00:00+00:00,3,3,1.0,3.0,2,1.666667,0439886341,460.0,...,1,1.0,0,1,1,1.0,1,1.0,0,0
3,2014-03-22 00:00:00+00:00,2014-05-29 00:00:00+00:00,5,6,2.0,5.0,2,4.500000,0511189877,68.0,...,1,5.0,4,0,5,22.0,2,4.4,0,0
4,2010-06-21 00:00:00+00:00,2014-07-18 00:00:00+00:00,26,27,1.0,5.0,5,2.851852,0528881469,1488.0,...,1,2.0,0,0,2,6.0,1,3.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
475997,2012-12-17 00:00:00+00:00,2012-12-17 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,BT008G3W52,0.0,...,1,5.0,1,0,1,5.0,1,5.0,0,0
475998,2012-01-09 00:00:00+00:00,2012-01-09 00:00:00+00:00,1,1,1.0,1.0,1,1.000000,BT008SXQ4C,0.0,...,1,1.0,0,1,1,1.0,1,1.0,0,0
475999,2011-11-10 00:00:00+00:00,2011-11-10 00:00:00+00:00,1,1,5.0,5.0,1,5.000000,BT008T2BGK,0.0,...,1,5.0,1,0,1,5.0,1,5.0,0,0
476000,2010-12-31 00:00:00+00:00,2014-07-22 00:00:00+00:00,15,15,1.0,5.0,5,4.000000,BT008UKTMW,1299.0,...,1,3.0,0,0,1,3.0,1,3.0,0,0


In [38]:
rating_avg = prod_grp_fin['rtg_avg'].mean()
rating_count_q_95 = prod_grp_fin['tot_rating'].quantile(0.95)

In [47]:
top_prod = prod_grp_fin[prod_grp_fin['tot_rating']>=rating_count_q_95]
top_prod.shape


(24014, 24)

## Creating a function for calculating the combined rating


In [48]:
def combined_rating(df):
    vc = df['tot_rating']
    v_avg = df['rtg_avg']
    return (vc/(vc+rating_count_q_95)*v_avg)+(rating_count_q_95/(rating_count_q_95+vc)*rating_avg)


In [49]:
top_prod['new_rating']=top_prod.apply(combined_rating,axis=1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_prod['new_rating']=top_prod.apply(combined_rating,axis=1)


In [50]:
top_prod=top_prod.sort_values('new_rating',ascending=False).head(500)

## List of top 50 products based on a generic rating system

In [52]:
top_prod.head(50)

Unnamed: 0,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,prod_id,date_diff,...,rtg_avg_7days,rating_5_30days,rating_1_30days,tot_rtg_30days,sum_rtg_30days,max_num_rtg_30days,rtg_avg_30days,good_prod_flag,bad_prod_flag,new_rating
207782,2011-08-18 00:00:00+00:00,2014-07-14 00:00:00+00:00,779,2512,1.0,5.0,5,4.881369,B0043WJRRS,1061.0,...,5.0,3,0,3,15.0,1,5.0,1,0,4.860122
180308,2010-07-07 00:00:00+00:00,2014-07-13 00:00:00+00:00,885,1876,1.0,5.0,5,4.853412,B003FVVMS0,1467.0,...,5.0,1,0,1,5.0,1,5.0,1,0,4.825962
323018,2012-10-03 00:00:00+00:00,2014-07-21 00:00:00+00:00,537,1306,1.0,5.0,5,4.836907,B007R5YGO2,656.0,...,3.777778,22,0,33,146.0,4,4.424242,1,0,4.798629
369568,2012-11-09 00:00:00+00:00,2014-07-16 00:00:00+00:00,528,1447,1.0,5.0,5,4.828611,B009NB8WR0,614.0,...,5.0,12,1,14,65.0,2,4.642857,1,0,4.79423
137519,2011-01-24 00:00:00+00:00,2014-07-12 00:00:00+00:00,425,658,1.0,5.0,5,4.869301,B0029N3U8K,1265.0,...,5.0,1,0,2,8.0,1,4.0,0,0,4.793736
324655,2012-04-28 00:00:00+00:00,2014-07-09 00:00:00+00:00,326,449,1.0,5.0,5,4.895323,B007SZ0E1K,802.0,...,4.666667,9,0,10,49.0,2,4.9,1,0,4.785582
131560,2009-07-26 00:00:00+00:00,2014-07-20 00:00:00+00:00,545,798,1.0,5.0,5,4.842105,B001W28L2Y,1820.0,...,5.0,1,0,1,5.0,1,5.0,1,0,4.780696
241489,2011-08-09 00:00:00+00:00,2014-07-20 00:00:00+00:00,711,1524,1.0,5.0,5,4.811024,B004S4R5CK,1076.0,...,4.888889,10,0,11,54.0,4,4.909091,1,0,4.778932
423495,2013-07-17 00:00:00+00:00,2014-07-22 00:00:00+00:00,360,2607,1.0,5.0,5,4.797085,B00D5Q75RC,370.0,...,4.875,71,1,84,403.0,8,4.797619,1,0,4.778337
131557,2009-08-21 00:00:00+00:00,2014-07-16 00:00:00+00:00,525,735,1.0,5.0,5,4.843537,B001W26TIW,1790.0,...,5.0,1,0,1,5.0,1,5.0,1,0,4.777131


# Product Recommendation System - Personalised for User


In [88]:
prod_recom_data = data_prep_1[data_prep_1['tot_rating']> 1000]

In [89]:
prod_recom_data

Unnamed: 0,user_id,prod_id,rating,tstamp,tstamp_new,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,date_diff,rating_5,rating_1
183,A3AKVALGT4Y02G,0972683275,4.0,1203379200,2008-02-19 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.470980,2345.0,0,0
184,A22TFR5ELFDG50,0972683275,5.0,1207526400,2008-04-07 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.470980,2345.0,1,0
185,AQCEXDZDNVJUG,0972683275,4.0,1212969600,2008-06-09 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.470980,2345.0,0,0
186,AE8R1JSMJYIU,0972683275,4.0,1228694400,2008-12-08 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.470980,2345.0,0,0
187,A2DEU0B3AUINV9,0972683275,5.0,1263168000,2010-01-11 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.470980,2345.0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7795930,A20T8JALJ2MRG5,B00INNP5VU,2.0,1405987200,2014-07-22 00:00:00+00:00,2014-03-27 00:00:00+00:00,2014-07-23 00:00:00+00:00,119,1097,1.0,5.0,5,4.013674,118.0,0,0
7795931,A2A54M8Y7OIF0Y,B00INNP5VU,4.0,1406073600,2014-07-23 00:00:00+00:00,2014-03-27 00:00:00+00:00,2014-07-23 00:00:00+00:00,119,1097,1.0,5.0,5,4.013674,118.0,0,0
7795932,A2ORAYGSS7LAEL,B00INNP5VU,5.0,1406073600,2014-07-23 00:00:00+00:00,2014-03-27 00:00:00+00:00,2014-07-23 00:00:00+00:00,119,1097,1.0,5.0,5,4.013674,118.0,1,0
7795933,A1LXUR2W20KHU2,B00INNP5VU,1.0,1406073600,2014-07-23 00:00:00+00:00,2014-03-27 00:00:00+00:00,2014-07-23 00:00:00+00:00,119,1097,1.0,5.0,5,4.013674,118.0,0,1


In [90]:

data = Dataset.load_from_df(prod_recom_data[['user_id', 'prod_id', 'rating']], Reader())

In [91]:
data

<surprise.dataset.DatasetAutoFolds at 0x25db1865610>

In [92]:
cross_validate(SVD(), data, measures=['RMSE','MAE'], cv=3, verbose=True,n_jobs=-1)
#RMSE and MAE are the default measures in the surprise library

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    1.2036  1.2035  1.2014  1.2028  0.0010  
MAE (testset)     0.9174  0.9165  0.9163  0.9167  0.0005  
Fit time          31.42   26.29   23.28   27.00   3.36    
Test time         8.06    6.66    8.16    7.63    0.68    


{'test_rmse': array([1.2035834 , 1.20347646, 1.2014393 ]),
 'test_mae': array([0.91737476, 0.91649621, 0.91625349]),
 'fit_time': (31.420886039733887, 26.285184860229492, 23.28089737892151),
 'test_time': (8.062875986099243, 6.660243511199951, 8.15762448310852)}

# NOTE - System limitations
The RMSE values are large, and generally should not be accepted.
However, due to a lack of higer RAM capacity, and absence of GPU on my laptop, I will have to proceed with the above numbers.

Training ith a higher cross validation number kept resultig in memory errors and system crashes. For the same reason, the products being considered in the model have been restricted to only those products which have at least 1000 ratings in total

In [99]:
train = data.build_full_trainset()
svd = SVD()
svd.fit(train)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25e2d5b3c40>

In [95]:
prod_recom_data.head()

Unnamed: 0,user_id,prod_id,rating,tstamp,tstamp_new,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,date_diff,rating_5,rating_1
183,A3AKVALGT4Y02G,972683275,4.0,1203379200,2008-02-19 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.47098,2345.0,0,0
184,A22TFR5ELFDG50,972683275,5.0,1207526400,2008-04-07 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.47098,2345.0,1,0
185,AQCEXDZDNVJUG,972683275,4.0,1212969600,2008-06-09 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.47098,2345.0,0,0
186,AE8R1JSMJYIU,972683275,4.0,1228694400,2008-12-08 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.47098,2345.0,0,0
187,A2DEU0B3AUINV9,972683275,5.0,1263168000,2010-01-11 00:00:00+00:00,2008-02-19 00:00:00+00:00,2014-07-22 00:00:00+00:00,674,1051,1.0,5.0,5,4.47098,2345.0,1,0


In [96]:
prod_recom_data['user_id'].value_counts()

ADLVFFE4VBT8      61
A6FIAB28IS79      54
A2XRMQA6PJ5ZJ8    46
A1CMD08Z49PGKQ    42
A680RUE1FDO8B     40
                  ..
A1PONY6EPRHN0G     1
A2OZUZSCG3078U     1
A3FFPNVDUC5SYX     1
A336VRM9VDDYTO     1
A1LXUR2W20KHU2     1
Name: user_id, Length: 1084994, dtype: int64

In [103]:
prod_recom_data[prod_recom_data['user_id']=='A680RUE1FDO8B']

Unnamed: 0,user_id,prod_id,rating,tstamp,tstamp_new,date_min,date_max,uniq_cnt,tot_rating,rtg_min,rtg_max,rtg_uniq,rtg_avg,date_diff,rating_5,rating_1
169816,A680RUE1FDO8B,B00006B7DA,5.0,1262044800,2009-12-29 00:00:00+00:00,2003-02-03 00:00:00+00:00,2014-07-11 00:00:00+00:00,883,1106,1.0,5.0,5,3.914105,4176.0,1,0
222003,A680RUE1FDO8B,B00007EDM8,5.0,1295136000,2011-01-16 00:00:00+00:00,2003-06-06 00:00:00+00:00,2014-07-20 00:00:00+00:00,1129,1568,1.0,5.0,5,3.75,4062.0,1,0
802347,A680RUE1FDO8B,B000CKVOOY,5.0,1302307200,2011-04-09 00:00:00+00:00,2010-11-25 00:00:00+00:00,2014-07-16 00:00:00+00:00,926,1873,1.0,5.0,5,4.707955,1329.0,1,0
888085,A680RUE1FDO8B,B000EVSLRO,5.0,1168214400,2007-01-08 00:00:00+00:00,2006-07-03 00:00:00+00:00,2014-07-15 00:00:00+00:00,934,1283,1.0,5.0,5,4.177708,2934.0,1,0
1341476,A680RUE1FDO8B,B000RZQZM0,5.0,1229731200,2008-12-20 00:00:00+00:00,2007-07-12 00:00:00+00:00,2014-07-11 00:00:00+00:00,1091,1866,1.0,5.0,5,4.503751,2556.0,1,0
1383916,A680RUE1FDO8B,B000U5TUWE,5.0,1232755200,2009-01-24 00:00:00+00:00,2007-09-27 00:00:00+00:00,2014-07-13 00:00:00+00:00,793,1235,1.0,5.0,5,4.365992,2481.0,1,0
1419055,A680RUE1FDO8B,B000V1MLBE,5.0,1191110400,2007-09-30 00:00:00+00:00,2007-09-16 00:00:00+00:00,2014-07-10 00:00:00+00:00,622,1570,1.0,5.0,5,4.004459,2489.0,1,0
1568859,A680RUE1FDO8B,B00119T6NQ,3.0,1203292800,2008-02-18 00:00:00+00:00,2008-01-05 00:00:00+00:00,2014-07-13 00:00:00+00:00,785,1266,1.0,5.0,5,4.281201,2381.0,0,0
1701371,A680RUE1FDO8B,B0015AARJI,1.0,1229644800,2008-12-19 00:00:00+00:00,2008-04-04 00:00:00+00:00,2014-07-22 00:00:00+00:00,1478,4468,1.0,5.0,5,4.226723,2300.0,0,1
1800492,A680RUE1FDO8B,B0018P7WZ2,5.0,1298937600,2011-03-01 00:00:00+00:00,2008-05-26 00:00:00+00:00,2014-07-22 00:00:00+00:00,901,1259,1.0,5.0,5,4.200159,2248.0,1,0


In [110]:
svd.predict('A680RUE1FDO8B', 'B00119T6NQ', 3.0)

Prediction(uid='A680RUE1FDO8B', iid='B00119T6NQ', r_ui=3.0, est=3.7888403110791575, details={'was_impossible': False})

In [109]:
svd.predict('A680RUE1FDO8B', 'B008CS5T76', 5.0)

Prediction(uid='A680RUE1FDO8B', iid='B008CS5T76', r_ui=5.0, est=4.926208662445611, details={'was_impossible': False})

# Personalised model training and testing complete
The above model helps us estimate user ratings for products.

The model was built based on collaborative filtering. Groups of similar users areidentified, and ratings for products are calculated based on peer ratings.

I have used the SVD algorithm for creating the model, trained using the library 'Surprise'

# BOT Identification

In [None]:
user_grp_days = data_prep_1.groupby(['user_id','tstamp_new']).agg(rating_5=('rating_5','sum'),
                                         rating_1=('rating_1','sum'),
                                         tot_rtg=('rating','count'),
                                         sum_rtg=('rating','sum'),
                                         uniq_prod_cnt=('prod_id',pd.Series.nunique),
                                         rtg_avg=('rating','mean')
                                        )

user_grp_days['susp_bot1']=np.where(((user_grp_days['rating_1']>=5)&(user_grp_days['rating_1']==user_grp_days['tot_rtg'])),1,0)
user_grp_days['susp_bot2']=np.where(((user_grp_days['rating_5']>=5)&(user_grp_days['rating_5']==user_grp_days['tot_rtg'])),1,0)
user_grp_days['susp_bot']=user_grp_days['susp_bot1']+user_grp_days['susp_bot2']


In [None]:
len(user_grp_days[user_grp_days['susp_bot']>=1])

## There are 10550 user_id date combinations where the users are suspected bots

A user_id is suspected to be a bot if it has a string of low ratings within the same day. This could be done to artificaially lower the rating of competing products
The user_id can also manipulate ratings of prodcts belonging to the same seller/company by providing a series of high ratings for its products

I have applied the same rules above to arrive at a set of user_ids which could be bot accounts. While we are not certain, taking additional measures such as captcha verification for these accounts after rating could help identify bot/spam activity