In [1]:
import pathlib, time, pandas as pd, networkx as nx

# Project layout. ROOT is the parent of the current working directory, so the
# notebook is expected to be launched from a folder one level below the project root.
ROOT = pathlib.Path().resolve().parent
DATA, INTER = ROOT / "data", ROOT / "intermediate"

# Make sure the output folder for intermediate artefacts exists (no error if it already does).
INTER.mkdir(exist_ok=True)

# Inputs (read from DATA) and output (written under INTER).
CLICK_FILE = DATA / "clicks.parquet"
TAXO_FILE = DATA / "taxonomy_edges.csv"
EDGE_OUT = INTER / "edges.tsv"

# Quick sanity check that both input files are where we expect them.
print(
    "Files exist?\n  clicks :",
    CLICK_FILE.exists(),
    "\n  taxonomy:",
    TAXO_FILE.exists(),
)

Files exist?
  clicks : True 
  taxonomy: True


In [2]:
import pyarrow.parquet as pq
import pandas as pd

# Load the raw click log into a DataFrame.
clicks = pq.read_table(CLICK_FILE).to_pandas()
print("Click rows:", len(clicks))

# Only user_id and item_id take part in the pairing. Projecting to them before
# the self-join keeps every other column of `clicks` from being duplicated
# (as *_x / *_y) across the much larger joined frame; the resulting pairs and
# counts are unchanged.
user_items = clicks[["user_id", "item_id"]]

# Co-click pairs: join each user's clicks against that same user's clicks, then
# count how many joined rows fall on each (item_id_x, item_id_y) combination.
# NOTE(review): if a user can have several rows for the same item, each repeat
# multiplies into cnt — confirm whether cnt is meant to count distinct users.
pairs = (
    user_items.merge(user_items, on="user_id")
              .query("item_id_x < item_id_y")      # drop self-pairs and keep one ordering per pair
              .groupby(["item_id_x", "item_id_y"])
              .size()
              .reset_index(name="cnt")
)

print("Co-click pairs:", len(pairs))
pairs.head()


Click rows: 6739590
Co-click pairs: 41858115


Unnamed: 0,item_id_x,item_id_y,cnt
0,101635370,B00004T1XE,1
1,101635370,B00004Z5M1,1
2,101635370,B00005T3UD,1
3,101635370,B00006B8BP,1
4,101635370,B00006B8DX,1


In [3]:
# Build one undirected graph holding both co-click and taxonomy edges.
G = nx.Graph()

# Co-click edges: each row of `pairs` is (item_id_x, item_id_y, cnt),
# and cnt is used as the edge weight.
G.add_weighted_edges_from(
    pairs.itertuples(index=False, name=None)    # (src, dst, cnt)
)

# Taxonomy edges, each given a fixed weight of 1.0.
# NOTE(review): this assumes the CSV has exactly two columns (src, dst) so that
# every tuple is (src, dst, 1.0) — confirm against the file.
# NOTE(review): a taxonomy edge between two nodes that already share a co-click
# edge would presumably replace that edge's weight with 1.0 — confirm intended.
taxo = pd.read_csv(TAXO_FILE)
G.add_weighted_edges_from(
    taxo.assign(cnt=1.0).itertuples(index=False, name=None)
)

print(f"Graph: {G.number_of_nodes():,} nodes  |  {G.number_of_edges():,} edges")

# EDGE_OUT is a .tsv file, so write tab-separated fields explicitly; the
# default delimiter is a single space, which does not match the extension.
nx.write_weighted_edgelist(G, EDGE_OUT, delimiter="\t")
print("Wrote", EDGE_OUT)

Graph: 163,601 nodes  |  41,862,368 edges
Wrote /Users/vedantajain/hrec/intermediate/edges.tsv


In [5]:
import networkx as nx
# Export the graph G to EDGE_OUT as a weighted edge list with tab-separated
# fields, matching the file's .tsv extension.
nx.write_weighted_edgelist(
    G,
    EDGE_OUT,
    delimiter="\t",
)