In [16]:
from langchain import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline, AutoModel, AutoModelForSequenceClassification, BertForSequenceClassification, AdamW, BertConfig, BertTokenizer, BertTokenizerFast
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset
from datasets import load_dataset
from trl import SFTTrainer
import spacy
import logging
import pandas as pd
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
from tqdm.notebook import tqdm
from tqdm import tqdm
%matplotlib inline

In [3]:
unlabeled_df = pd.read_excel("unlabeled_all.xlsx", sheet_name="Sheet1")

In [5]:
unlabeled_df.groupby('product').count()

Unnamed: 0_level_0,index,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,...,feature,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details
product,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
desk,3603,3603,927,3603,3603,3603,3603,2681,3603,3603,...,3603,3603,3603,3603,2596,3188,1785,3603,3603,3603
fridge,5201,5201,1472,5201,5201,5201,5201,3249,5201,5201,...,5201,5201,5201,5201,4286,4934,2568,5201,5201,5201
ladder,2931,2931,1214,2931,2931,2931,2931,1823,2931,2931,...,2931,2931,2931,2930,1952,2797,1942,2931,2931,2931
mower,7569,7569,2827,7569,7569,7569,7569,4375,7569,7569,...,7569,7569,7569,7569,6333,2,5345,7569,7569,7569


In [20]:
nb_labels = 3
n_class = nb_labels
model_id = 'bert-base-uncased'

In [10]:
tokenizer = BertTokenizer.from_pretrained(model_id, 
                                          do_lower_case=True)
model = BertForSequenceClassification.from_pretrained(model_id,
                                                      num_labels=nb_labels,
                                                      output_attentions=False,
                                                      output_hidden_states=False)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
#device = torch.device('cpu') #use cpu to diagnose error
model.to(device)

model.load_state_dict(torch.load('data_volume2/finetuned_BERT_base_uncased_DQD_epoch_1.model', map_location=torch.device('cpu')))

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

<All keys matched successfully>

In [11]:
label_dict = {
    'Delivery/Packaging': 0,
    'Quality': 1,
    'Design/Functionality': 2
}

In [12]:
def bert_predict(text, model, tokenizer, n_class):
    
    inputs = tokenizer.encode_plus(
        text, 
        add_special_tokens=True, 
        return_attention_mask=True, 
        pad_to_max_length=True, 
        max_length=256, 
        return_tensors='pt'
    )
    model.eval()
    with torch.no_grad():    
        output = model(**inputs)
    
    return np.argmax(output[0].detach().numpy(), axis= 1).item()

In [14]:
def predict(row):
    review_text = row['reviewText']
    predictions = bert_predict(review_text, model, tokenizer, n_class)
    return predictions

In [13]:
labeled_df = unlabeled_df.copy()

In [18]:
tqdm.pandas()

In [21]:
labeled_df['predictions'] = labeled_df.progress_apply(predict, axis=1)

  0%|                                               | 0/19304 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|███████████████████████████████████| 19304/19304 [15:22<00:00, 20.93it/s]


In [22]:
labeled_df.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,predictions
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},0
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},0
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,"['>#324,997 in Tools & Home Improvement (See t...",[],Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},2
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},1
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},2


In [23]:
label_mapping = {0: 'Delivery/ Packaging', 1: 'Quality', 2: 'Design/Functionality'}

In [24]:
labeled_df['predictions'] = labeled_df['predictions'].map(label_mapping)

In [25]:
labeled_df.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,predictions
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,"['>#324,997 in Tools & Home Improvement (See t...",[],Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality


In [26]:
labeled_df.rename(columns={'predictions': 'reviewType'}, inplace=True)

In [27]:
labeled_df.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,rank,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,reviewType
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,"['>#667,119 in Tools & Home Improvement (See t...","['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,"['>#324,997 in Tools & Home Improvement (See t...",[],Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,"['>#2,528,323 in Tools & Home Improvement (See...","['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality


In [28]:
labeled_df.columns

Index(['index', 'product', 'overall', 'vote', 'verified', 'reviewTime',
       'reviewerID', 'asin', 'style', 'reviewerName', 'reviewText', 'summary',
       'unixReviewTime', 'image', 'category', 'tech1', 'description', 'fit',
       'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes', 'details', 'reviewType'],
      dtype='object')

In [None]:
labeled_df.

In [97]:
ranked_df_1 = labeled_df.copy()

In [98]:
product_asin_counts = ranked_df_1.groupby(['product', 'asin']).size()
#ranked_product_asin = product_asin_counts.groupby('product').transform('rank', ascending=False)
ranked_product_asin = product_asin_counts.groupby('product').rank(method='dense', ascending=False)
#ranked_product_asin = ranked_product_asin.reset_index(drop=True)
#ranked_df_1['mostReviewed_rank'] = ranked_product_asin

review_rank = ranked_product_asin.reset_index()
review_rank.rename(columns={0 : 'mostReviewed_rank'}, inplace=True)

In [110]:
ranked_df_2 = pd.merge(ranked_df_1, review_rank[['asin', 'mostReviewed_rank']], on='asin')

In [111]:
ranked_df_2.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,reviewType,mostReviewed_rank
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,"['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,"['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,[],Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality,48.0
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,"['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality,47.0
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,"['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality,47.0


In [40]:
ranked_df_1.columns

Index(['index', 'product', 'overall', 'vote', 'verified', 'reviewTime',
       'reviewerID', 'asin', 'style', 'reviewerName', 'reviewText', 'summary',
       'unixReviewTime', 'image', 'category', 'tech1', 'description', 'fit',
       'title', 'also_buy', 'tech2', 'brand', 'feature', 'rank', 'also_view',
       'main_cat', 'similar_item', 'date', 'price', 'imageURL',
       'imageURLHighRes', 'details', 'reviewType', 'mostReviewed_rank'],
      dtype='object')

In [112]:
ranked_df_3 = ranked_df_2.copy()

In [113]:
avg_star_df = ranked_df_3.groupby('asin')['overall'].mean().reset_index()

In [114]:
product_star_avg = pd.merge(ranked_df_3[['product','asin']], avg_star_df, on='asin')

In [115]:
ranked_df_2[['product','asin']]

Unnamed: 0,product,asin
0,ladder,B0000224LY
1,ladder,B0000224LY
2,ladder,B0000224M4
3,ladder,B0000224LQ
4,ladder,B0000224LQ
...,...,...
19299,desk,B01H20EAXS
19300,desk,B01H433G9W
19301,desk,B01HCAZY7E
19302,desk,B01HCAZY7E


In [116]:
product_star_avg

Unnamed: 0,product,asin,overall
0,ladder,B0000224LY,1.0
1,ladder,B0000224LY,1.0
2,ladder,B0000224M4,3.0
3,ladder,B0000224LQ,2.0
4,ladder,B0000224LQ,2.0
...,...,...,...
19299,desk,B01H20EAXS,2.0
19300,desk,B01H433G9W,3.0
19301,desk,B01HCAZY7E,1.5
19302,desk,B01HCAZY7E,1.5


In [117]:
product_star_avg.rename(columns={'overall': 'avg_star_rating'}, inplace=True)
product_star_avg = product_star_avg.drop_duplicates('asin')

In [136]:
ranked_df_final = pd.merge(ranked_df_3, product_star_avg[['asin','avg_star_rating']], on='asin')

In [138]:
ranked_df_final.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,reviewType,mostReviewed_rank,avg_star_rating
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0,1.0
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0,1.0
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality,48.0,3.0
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality,47.0,2.0
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality,47.0,2.0


In [141]:
#total_rows = len(ranked_df_final)
#progress_bar = tqdm(total=total_rows, desc="Writing CSV")

#ranked_df_final.to_csv("labeled_final.csv", index=False, chunksize=1000)
#ranked_df_final.to_csv("labeled_final.csv", index=False)
ranked_df_final.to_excel("labeled_final.xlsx", index=False)
#progress_bar.close()

In [137]:
len(ranked_df_final)

19304

In [125]:
len(ranked_df_1)

19304

In [126]:
len(ranked_df_2)

19304

In [127]:
len(ranked_df_3)

19304

In [128]:
len(product_star_avg)

19304

In [129]:
len(product_star_avg[['asin','avg_star_rating']])

19304

In [130]:
ranked_df_3.head()

Unnamed: 0,index,product,overall,vote,verified,reviewTime,reviewerID,asin,style,reviewerName,...,also_view,main_cat,similar_item,date,price,imageURL,imageURLHighRes,details,reviewType,mostReviewed_rank
0,0,ladder,1,,True,"08 12, 2015",A1L1U5H7ZVOBBE,B0000224LY,{'Size Name:': ' 28 Feet'},Andreas Ringstad,...,"['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0
1,1,ladder,1,,True,"10 19, 2016",A2T4TNHPL68SYK,B0000224LY,{'Size Name:': ' 32 Feet'},Amazon Customer,...,"['B0017ZRIFM', 'B0017ZRIG6', 'B074GLDCMG', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",$291.23,[],[],{},Delivery/ Packaging,47.0
2,2,ladder,3,,True,"10 30, 2004",A6SHOGP56RZLA,B0000224M4,{'Size Name:': ' 2 Feet'},Jeffrey S. Alek,...,[],Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,[],[],{},Design/Functionality,48.0
3,3,ladder,1,23.0,True,"12 1, 2005",A2971BCXW8MCKY,B0000224LQ,,M. Lewis,...,"['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Quality,47.0
4,4,ladder,3,2.0,True,"03 4, 2011",A3HI1K6M2SPB1H,B0000224LQ,,Rena K. Rouse,...,"['B000JIL0WU', 'B00BGE2NXQ', 'B003KGBIKC', 'B0...",Tools & Home Improvement,"class=""a-bordered a-horizontal-stripes a-spa...","November 8, 1999",,['https://images-na.ssl-images-amazon.com/imag...,['https://images-na.ssl-images-amazon.com/imag...,{},Design/Functionality,47.0


In [133]:
duplicates = product_star_avg.duplicated('asin')
print(product_star_avg[duplicates])

      product        asin  avg_star_rating
1      ladder  B0000224LY              1.0
4      ladder  B0000224LQ              2.0
6      ladder  B00002N6J5              2.6
7      ladder  B00002N6J5              2.6
8      ladder  B00002N6J5              2.6
...       ...         ...              ...
19294    desk  B01GV3P62G              2.0
19295    desk  B01GV3P62G              2.0
19296    desk  B01GV3P62G              2.0
19298    desk  B01GVL2LZI              1.5
19302    desk  B01HCAZY7E              1.5

[17640 rows x 3 columns]


In [135]:
len(product_star_avg)

1664

In [134]:
product_star_avg = product_star_avg.drop_duplicates('asin')