In [22]:
from datasets import load_dataset
import pandas as pd
import os



In [23]:
# Amazon Reviews 2023 category processed by this notebook; it is interpolated
# into every dataset config name and parquet file name below.
CATEGORY = "Cell_Phones_and_Accessories"

# Lower bound for review timestamps, in epoch milliseconds
# (2020-01-01 00:00:00 UTC).
MIN_TIMESTAMP = 1_577_836_800_000

# Raw data

### Download the datasets from the web and save them in Parquet format

In [24]:

# Download the raw review and raw metadata configs once and cache each as a
# parquet file under data/. A config whose parquet file already exists is
# skipped, so re-running this cell does not download it again.

# Create the cache directory up front instead of relying on the parquet
# writers to create it.
os.makedirs("data", exist_ok=True)

for subset in ("raw_review", "raw_meta"):
    parquet_path = f"data/{subset}_{CATEGORY}.parquet"
    if os.path.exists(parquet_path):
        continue
    # NOTE(review): trust_remote_code=True allows the dataset repository's
    # loading script to execute locally - confirm it is still required.
    dataset = load_dataset(
        "McAuley-Lab/Amazon-Reviews-2023",
        f"{subset}_{CATEGORY}",
        trust_remote_code=True,
    )
    dataset["full"].to_parquet(parquet_path)


# Transformed data 

## Reviews

In [25]:
# Read the cached raw reviews for the selected category and preview the frame.
raw_review_path = f"data/raw_review_{CATEGORY}.parquet"
review_df = pd.read_parquet(raw_review_path)
review_df

Unnamed: 0,rating,title,text,images,asin,parent_asin,user_id,timestamp,helpful_vote,verified_purchase
0,4.0,No white background! It’s clear!,I bought this bc I thought it had the nice whi...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B08L6L3X1S,B08L6L3X1S,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1612044451196,0,True
1,5.0,Awesome! Great price! Works well!,Perfect. How pissed am I that I recently paid ...,[],B079BPGF6C,B079BPGF6C,AFKZENTNBQ7A7V7UXW5JJI6UGRYQ,1534443517349,2,True
2,5.0,Worked but took an hour to install,Overall very happy with the end result. If you...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B088DR7Z5B,B0BBGGC8F2,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1629235304798,3,True
3,4.0,Decent,Lasted about 9 months then the lock button bro...,"[{'attachment_type': 'IMAGE', 'large_image_url...",B07XRDHDNQ,B07XRDHDNQ,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1590470082910,0,True
4,5.0,LOVE IT!,LOVE THIS CASE! Works better than my expensive...,[],B00A8ZDL9Y,B00A8ZDL9Y,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,1408994588000,0,True
...,...,...,...,...,...,...,...,...,...,...
20812940,5.0,Great product.,Works like a charm. Hugs the air vents and doe...,[],B07GWZ5TDN,B08PH51TH7,AEMUARCPFEENKWQD6DWHTTEUT6SQ,1580678991616,0,True
20812941,5.0,Great Nite Light,This is great product. the LED nite light can ...,[],B07N2FVX7H,B07N2FVX7H,AEMUARCPFEENKWQD6DWHTTEUT6SQ,1573839159982,0,True
20812942,2.0,Wizgear,Not too happy with this product. It keeps on f...,[],B0176S0GCU,B0176S0GCU,AEMUARCPFEENKWQD6DWHTTEUT6SQ,1463911508000,0,True
20812943,5.0,Five Stars,Works good,[],B00QTE09SY,B0BM9LK5TG,AFZCCH2LRAP6ICSAMXW32FKJJ5PA,1453694244000,0,True


In [26]:
# Summarize the column dtypes and memory footprint of the raw review frame.
review_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20812945 entries, 0 to 20812944
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   title              object 
 2   text               object 
 3   images             object 
 4   asin               object 
 5   parent_asin        object 
 6   user_id            object 
 7   timestamp          int64  
 8   helpful_vote       int64  
 9   verified_purchase  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 1.4+ GB


### Reduce size of dataframe

In [27]:
# Shrink the review frame in three steps:
#   1. keep reviews whose asin equals their parent_asin,
#   2. keep reviews written after MIN_TIMESTAMP,
#   3. keep products (parent_asin) that still have at least 10 reviews.
min_reviews_per_product = 10

review_df = review_df[review_df["asin"] == review_df["parent_asin"]]
review_df = review_df[review_df["timestamp"] > MIN_TIMESTAMP]

# A vectorized group-size lookup selects the same rows as
# groupby("parent_asin").filter(lambda x: len(x) > 9) without calling a
# Python function once per product.
reviews_per_product = review_df.groupby("parent_asin")["parent_asin"].transform("size")
review_df = review_df[reviews_per_product >= min_reviews_per_product]
review_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2898462 entries, 3 to 20812939
Data columns (total 10 columns):
 #   Column             Dtype  
---  ------             -----  
 0   rating             float64
 1   title              object 
 2   text               object 
 3   images             object 
 4   asin               object 
 5   parent_asin        object 
 6   user_id            object 
 7   timestamp          int64  
 8   helpful_vote       int64  
 9   verified_purchase  bool   
dtypes: bool(1), float64(1), int64(2), object(6)
memory usage: 223.9+ MB


In [28]:
# Count the missing values in each column of the filtered review frame.
review_df.isna().sum()

rating               0
title                0
text                 0
images               0
asin                 0
parent_asin          0
user_id              0
timestamp            0
helpful_vote         0
verified_purchase    0
dtype: int64

### Drop images, user IDs, and duplicates

In [29]:
# Remove the image and reviewer-id columns, then drop rows that are exact
# duplicates across all remaining columns.
review_df = review_df.drop(columns=["images", "user_id"])
review_df = review_df.drop_duplicates()
review_df

Unnamed: 0,rating,title,text,asin,parent_asin,timestamp,helpful_vote,verified_purchase
3,4.0,Decent,Lasted about 9 months then the lock button bro...,B07XRDHDNQ,B07XRDHDNQ,1590470082910,0,True
16,2.0,Don't try to tighten it up!!,Putting it on a night stand drawer or top of h...,B07KJH11VV,B07KJH11VV,1606781266760,0,True
23,5.0,Fast,Fast charging,B085HFJCKW,B085HFJCKW,1606084321645,0,True
35,1.0,Returning as soon as I can.,I absolutely do not like this phone. Screen is...,B07WLLLTNZ,B07WLLLTNZ,1649706650086,0,True
36,5.0,Very nice,Nice phone with decent sized screen. Charges f...,B08J4JYD47,B08J4JYD47,1647645304652,2,True
...,...,...,...,...,...,...,...,...
20812878,5.0,Case,Its very good material nice and sturdiness,B07QNG5FQ4,B07QNG5FQ4,1612909409505,0,True
20812899,5.0,Good quality,Holding up daily uses pretty well. Good value ...,B08FXD4R21,B08FXD4R21,1607063977681,0,True
20812919,1.0,Bad,I disliked this product. I have had many scree...,B07YFYKVMZ,B07YFYKVMZ,1585850985346,0,True
20812935,5.0,Great ig,Love it,B07SJZZN78,B07SJZZN78,1580874756275,0,True


### Remove reviews where the purchase is not verified

In [30]:
# Show how many reviews are verified versus unverified purchases.
review_df["verified_purchase"].value_counts()

verified_purchase
True     2742496
False     120618
Name: count, dtype: int64

In [31]:
# Keep only reviews from verified purchases. verified_purchase is a bool
# column, so it can be used directly as the row mask.
# reset_index(drop=True) renumbers the surviving rows 0..n-1 so the saved frame
# has a contiguous index; a bare .reindex() would leave the sparse labels as-is.
review_df = review_df[review_df["verified_purchase"]].reset_index(drop=True)

### Save

In [32]:
# Write the filtered review frame to a category-specific parquet file.
review_df.to_parquet(f'data/review_{CATEGORY}.parquet')

## Metadata

In [33]:
# Read the cached raw product metadata for the selected category and preview it.
raw_meta_path = f"data/raw_meta_{CATEGORY}.parquet"
metadata_df = pd.read_parquet(raw_meta_path)
metadata_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,images,videos,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,ARAREE Slim Diary Cell Phone Case for Samsung ...,3.8,5,"[Genuine Cow leather with 6 different colors, ...","[JUST LOOK, You can tell the difference. Make ...",,"{'hi_res': [None, None, None, None, None, None...","{'title': [], 'url': [], 'user_id': []}",araree,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""3.35 x 0.59 x 6.18 inc...",B013SK1JTY,,,
1,Cell Phones & Accessories,Bastmei for OnePlus 7T Case Extremely Light Ul...,4.4,177,[Ultra-thin & Ultra-light: The ultra slim fit ...,[],11.98,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Bastmei,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""7.6 x 4.29 x 0.75 inch...",B07ZPSG8P5,,,
2,Cell Phones & Accessories,Wireless Fones Branded New Iphone 5C/LITE Hot ...,4.0,2,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",WIRELESS FONES,"[Cell Phones & Accessories, iPhone Accessories]","{""Item model number"": ""Apple Iphone 5C"", ""Othe...",B00GKR3L12,,,
3,Cell Phones & Accessories,"iPhone 6 Plus + Case, DandyCase Perfect PATTER...",4.0,15,"[Slim-Fit design for the iPhone 6 Plus (5.5"" s...",[Case does not need to be removed for charging...,,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",DandyCase,"[Cell Phones & Accessories, iPhone Accessories]","{""Product Dimensions"": ""5.43 x 0.28 x 2.64 inc...",B00PB8U8BW,,,
4,Cell Phones & Accessories,"Case for Galaxy S6/S6 Edge, Thin Translucent V...",4.0,1,[],[],,"{'hi_res': [None, None], 'large': ['https://m....","{'title': [], 'url': [], 'user_id': []}",7Pite,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""8.31 x 3.74 x 0.55 inc...",B07D3RHSRV,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288485,,Original White Touch Screen for Star N8000 Cel...,3.1,3,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",STAR,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Other display features"": ""Wireless"", ""Color""...",B00BHXX6N2,,,
1288486,Cell Phones & Accessories,AERO ARMOR Protective Case for Samsung Galaxy ...,4.5,9,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",AERO ARMOR,[],"{""Package Dimensions"": ""8.7 x 4.6 x 0.7 inches...",B00P9Y70XC,,,
1288487,Sports & Outdoors,Bandiction 3 Pack Sport Band Compatible with A...,4.2,18,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Bandiction,"[Cell Phones & Accessories, Accessories, Smart...","{""Item Package Dimensions L x W x H"": ""8.5 x 2...",B08JCMWH7H,,,
1288488,Cell Phones & Accessories,"Weycolor Moto E7 (2021) Case, Liquid Silicone ...",5.0,1,[],[],,{'hi_res': ['https://m.media-amazon.com/images...,"{'title': [], 'url': [], 'user_id': []}",Weycolor,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6 x 3 x 0.25 inches"", ...",B08TQQRS69,,,


In [34]:
# Remove the image and video columns from the product metadata.
media_columns = ["images", "videos"]
metadata_df = metadata_df.drop(columns=media_columns)
metadata_df

Unnamed: 0,main_category,title,average_rating,rating_number,features,description,price,store,categories,details,parent_asin,bought_together,subtitle,author
0,Cell Phones & Accessories,ARAREE Slim Diary Cell Phone Case for Samsung ...,3.8,5,"[Genuine Cow leather with 6 different colors, ...","[JUST LOOK, You can tell the difference. Make ...",,araree,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""3.35 x 0.59 x 6.18 inc...",B013SK1JTY,,,
1,Cell Phones & Accessories,Bastmei for OnePlus 7T Case Extremely Light Ul...,4.4,177,[Ultra-thin & Ultra-light: The ultra slim fit ...,[],11.98,Bastmei,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""7.6 x 4.29 x 0.75 inch...",B07ZPSG8P5,,,
2,Cell Phones & Accessories,Wireless Fones Branded New Iphone 5C/LITE Hot ...,4.0,2,[],[],,WIRELESS FONES,"[Cell Phones & Accessories, iPhone Accessories]","{""Item model number"": ""Apple Iphone 5C"", ""Othe...",B00GKR3L12,,,
3,Cell Phones & Accessories,"iPhone 6 Plus + Case, DandyCase Perfect PATTER...",4.0,15,"[Slim-Fit design for the iPhone 6 Plus (5.5"" s...",[Case does not need to be removed for charging...,,DandyCase,"[Cell Phones & Accessories, iPhone Accessories]","{""Product Dimensions"": ""5.43 x 0.28 x 2.64 inc...",B00PB8U8BW,,,
4,Cell Phones & Accessories,"Case for Galaxy S6/S6 Edge, Thin Translucent V...",4.0,1,[],[],,7Pite,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Package Dimensions"": ""8.31 x 3.74 x 0.55 inc...",B07D3RHSRV,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1288485,,Original White Touch Screen for Star N8000 Cel...,3.1,3,[],[],,STAR,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Other display features"": ""Wireless"", ""Color""...",B00BHXX6N2,,,
1288486,Cell Phones & Accessories,AERO ARMOR Protective Case for Samsung Galaxy ...,4.5,9,[],[],,AERO ARMOR,[],"{""Package Dimensions"": ""8.7 x 4.6 x 0.7 inches...",B00P9Y70XC,,,
1288487,Sports & Outdoors,Bandiction 3 Pack Sport Band Compatible with A...,4.2,18,[],[],,Bandiction,"[Cell Phones & Accessories, Accessories, Smart...","{""Item Package Dimensions L x W x H"": ""8.5 x 2...",B08JCMWH7H,,,
1288488,Cell Phones & Accessories,"Weycolor Moto E7 (2021) Case, Liquid Silicone ...",5.0,1,[],[],,Weycolor,"[Cell Phones & Accessories, Cases, Holsters & ...","{""Product Dimensions"": ""6 x 3 x 0.25 inches"", ...",B08TQQRS69,,,


### Save

In [35]:
# Write the trimmed metadata frame to a category-specific parquet file.
metadata_df.to_parquet(f'data/metadata_{CATEGORY}.parquet')