In [5]:
import gzip
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm

def load_snap_reviews(path, max_rows=None):
    """Load a gzip-compressed JSON-lines file into a DataFrame.

    Each non-blank line of the file is parsed as one JSON document and
    becomes one row of the result.

    Parameters
    ----------
    path : str or path-like
        Location of the gzip file; it is read as UTF-8 text.
    max_rows : int, optional
        Maximum number of records to load. ``None`` (the default) reads the
        whole file; ``0`` or a negative value yields an empty DataFrame.

    Returns
    -------
    pandas.DataFrame
        One row per parsed record, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Compare against None explicitly: a truthiness test would treat
    # max_rows=0 as "no limit" and load the entire file.
    if max_rows is not None and max_rows <= 0:
        return pd.DataFrame(records)
    with gzip.open(path, 'rt', encoding='utf-8') as f:
        for line in tqdm(f):
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            records.append(json.loads(line))
            # Count parsed records rather than raw lines so skipped blank
            # lines do not eat into the requested sample size.
            if max_rows is not None and len(records) >= max_rows:
                break
    return pd.DataFrame(records)

# Read only the first 100,000 records instead of the whole archive.
df = load_snap_reviews("../data/raw/Electronics.json.gz", max_rows=100_000)

99999it [00:01, 64725.78it/s]


In [6]:
# Preview the first five rows of the loaded sample.
df.head(n=5)

Unnamed: 0,overall,verified,reviewTime,reviewerID,asin,style,reviewerName,reviewText,summary,unixReviewTime,vote,image
0,5.0,True,"07 17, 2002",A1N070NS9CJQ2I,60009810,{'Format:': ' Hardcover'},Teri Adams,This was the first time I read Garcia-Aguilera...,Hit The Spot!,1026864000,,
1,5.0,False,"07 6, 2002",A3P0KRKOBQK1KN,60009810,{'Format:': ' Hardcover'},Willa C.,"As with all of Ms. Garcia-Aguilera's books, I ...",one hot summer is HOT HOT HOT!,1025913600,,
2,5.0,False,"07 3, 2002",A192HO2ICJ75VU,60009810,{'Format:': ' Hardcover'},Kit,I've not read any of Ms Aguilera's works befor...,One Hot Summer,1025654400,2.0,
3,4.0,False,"06 30, 2002",A2T278FKFL3BLT,60009810,{'Format:': ' Hardcover'},Andres,This romance novel is right up there with the ...,I love this book!,1025395200,3.0,
4,5.0,False,"06 28, 2002",A2ZUXVTW8RXBXW,60009810,{'Format:': ' Hardcover'},John,Carolina Garcia Aguilera has done it again. S...,One Hot Book,1025222400,,


In [7]:
# List the column labels present on the loaded frame.
df.columns

Index(['overall', 'verified', 'reviewTime', 'reviewerID', 'asin', 'style',
       'reviewerName', 'reviewText', 'summary', 'unixReviewTime', 'vote',
       'image'],
      dtype='str')

In [8]:
# Source column -> output column. The same mapping drives both the column
# selection and the rename, so the two cannot drift apart.
column_map = {
    "reviewerID": "user_id",
    "asin": "item_id",
    "overall": "rating",
    "unixReviewTime": "timestamp",
}
data = df[list(column_map)].rename(columns=column_map)

In [9]:
# Preview the first five rows of the renamed interaction table.
data.head(n=5)

Unnamed: 0,user_id,item_id,rating,timestamp
0,A1N070NS9CJQ2I,60009810,5.0,1026864000
1,A3P0KRKOBQK1KN,60009810,5.0,1025913600
2,A192HO2ICJ75VU,60009810,5.0,1025654400
3,A2T278FKFL3BLT,60009810,4.0,1025395200
4,A2ZUXVTW8RXBXW,60009810,5.0,1025222400


In [10]:
# Distinct users, distinct items, and total row count of the table.
print(f"Users: {data['user_id'].nunique()}")
print(f"Items: {data['item_id'].nunique()}")
print(f"Interactions: {data.shape[0]}")

# Number of rows per rating value, ordered by rating.
data["rating"].value_counts().sort_index()

Users: 95191
Items: 1290
Interactions: 100000


rating
1.0     9407
2.0     4535
3.0     7043
4.0    17410
5.0    61605
Name: count, dtype: int64

In [12]:
# Write the interaction table to disk without the row index.
output_path = Path("../data/processed/interactions.csv")
# to_csv does not create missing directories, so make sure the target
# folder exists; otherwise a fresh checkout fails on this cell.
output_path.parent.mkdir(parents=True, exist_ok=True)
data.to_csv(
    output_path,
    index=False
)