In [1]:
import glob
import tqdm
import snap
import json
import calendar
import pandas as pd

In [2]:
# Collect every vertex / edge file and keep both lists in lexicographic
# order so the files are always processed in the same sequence.
all_vertex_filepaths = sorted(
    glob.glob('/home/wangmingrui/Datasets/amazon/vertices/*')
)
all_edge_filepaths = sorted(
    glob.glob('/home/wangmingrui/Datasets/amazon/edges/*')
)

In [3]:
# Build the product side of the graph: one node per distinct product_id.
G = snap.TNEANet()
# Maps raw product_id -> contiguous node id, assigned in first-seen order
# starting at 0.
product_id_to_cont = {}

# Iterating over the tqdm object advances the bar only after a file has been
# fully processed, and the context manager closes the bar even if a file
# fails to parse.
with tqdm.tqdm(all_vertex_filepaths) as pbar:
    for filepath in pbar:
        pbar.set_description(filepath)

        # Parse only the columns that are used below. product_id is forced to
        # str: with inferred dtypes a numeric-looking id could be parsed as an
        # int, and since this mapping is later written with json.dump (which
        # turns every key into a string) such ids would no longer match.
        df = pd.read_csv(
            filepath,
            sep='\t',
            usecols=['product_id', 'product_category'],
            dtype={'product_id': str},
        )

        for product_id, product_category in zip(df['product_id'], df['product_category']):
            # Each product gets exactly one node; later occurrences are skipped.
            if product_id in product_id_to_cont:
                continue

            product_cont_id = len(product_id_to_cont)
            product_id_to_cont[product_id] = product_cont_id

            G.AddNode(product_cont_id)
            G.AddIntAttrDatN(product_cont_id, 1, 'is_product')
            G.AddStrAttrDatN(product_cont_id, product_category, 'product_category')
        # Release the frame before the next file is read.
        del df

/home/wangmingrui/Datasets/amazon/vertices/Wireless_v1_00.tsv: 100%|██████████| 46/46 [03:29<00:00,  2.31s/it]


In [5]:
# Record how many nodes the graph holds at this point and report it.
num_product_nodes = G.GetNodes()
print('Found {} vertices'.format(num_product_nodes))

# Write the graph through a SNAP output stream, then flush the stream.
FOut = snap.TFOut('amazon_products_only.graph')
G.Save(FOut)
FOut.Flush()

# Store the product_id -> node id mapping next to the graph as JSON.
with open('amazon_product_id_to_cont.json', 'w') as mapping_file:
    json.dump(product_id_to_cont, mapping_file)

Found 21390118 vertices


In [3]:
# Restore the saved graph and the product_id -> node id mapping from disk.
FIn = snap.TFIn('amazon_products_only.graph')
G = snap.TNEANet.Load(FIn)
num_product_nodes = G.GetNodes()

# Note: JSON object keys are strings, so every key of the reloaded mapping
# is a str.
with open('amazon_product_id_to_cont.json', 'r') as mapping_file:
    product_id_to_cont = json.load(mapping_file)

In [None]:
def date_str_to_timestamp(date_str):
    """Convert a dash-separated date string into a Unix timestamp.

    The string is split on '-' and each piece is parsed as an integer
    (e.g. '2015-08-31' -> 2015, 8, 31). Hour, minute and second are set
    to zero and the result is handed to ``calendar.timegm``, which
    interprets the fields as UTC.

    Args:
        date_str: Date text whose '-'-separated fields are integers.

    Returns:
        Integer seconds since the epoch for midnight UTC of that date.
    """
    date_fields = tuple(int(field) for field in date_str.split('-'))
    midnight = (0, 0, 0)
    return calendar.timegm(date_fields + midnight)

# Maps raw customer_id -> node id. Customer node ids are allocated after the
# product node ids, i.e. they start at num_product_nodes.
# NOTE(review): this dict is reset every time the cell runs while G keeps its
# nodes, so re-running the cell on the same G would try to add customer ids
# that already exist -- reload G first if a re-run is needed.
customer_id_to_cont = {}

# Columns consumed per review, in the order the row is unpacked below.
EDGE_COLUMNS = [
    'customer_id', 'product_id',
    'star_rating', 'helpful_votes', 'total_votes',
    'vine', 'verified_purchase', 'review_date',
]

for filepath in all_edge_filepaths:
    # Parse only the needed columns. product_id is forced to str because
    # product_id_to_cont is loaded from JSON, whose keys are always strings;
    # an id that pandas inferred as an int would raise KeyError on lookup.
    df = pd.read_csv(
        filepath,
        sep='\t',
        usecols=EDGE_COLUMNS,
        dtype={'product_id': str},
    )
    # usecols keeps the file's column order, so reorder explicitly to match
    # the tuple unpacking below.
    df = df[EDGE_COLUMNS]

    for row in tqdm.tqdm(df.values, desc=filepath):
        (
            customer_id,
            product_id,
            star_rating,
            helpful_votes,
            total_votes,
            vine,
            verified_purchase,
            review_date
        ) = row

        # Resolve the product first: if the id is unknown this raises
        # KeyError before the graph has been modified for this row.
        product_cont_id = product_id_to_cont[product_id]

        if customer_id not in customer_id_to_cont:
            # First review by this customer: allocate the next free node id.
            customer_cont_id = len(customer_id_to_cont) + num_product_nodes
            customer_id_to_cont[customer_id] = customer_cont_id
            G.AddNode(customer_cont_id)
            G.AddIntAttrDatN(customer_cont_id, 0, 'is_product')
        else:
            customer_cont_id = customer_id_to_cont[customer_id]

        # One edge per review, directed customer -> product, carrying the
        # review's fields as integer attributes.
        edge = G.AddEdge(customer_cont_id, product_cont_id)
        G.AddIntAttrDatE(edge, star_rating, 'star_rating')
        G.AddIntAttrDatE(edge, helpful_votes, 'helpful_votes')
        G.AddIntAttrDatE(edge, total_votes, 'total_votes')
        # 'Y' flags are stored as 1, anything else as 0.
        G.AddIntAttrDatE(edge, int(vine == 'Y'), 'vine')
        G.AddIntAttrDatE(edge, int(verified_purchase == 'Y'), 'verified_purchase')
        G.AddIntAttrDatE(edge, date_str_to_timestamp(review_date), 'review_date')

 59%|█████▉    | 3477313/5906028 [01:36<01:04, 37502.45it/s]