# Phase 3: Graph Clustering (HDFS Version)
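This notebook builds a co-readership graph from the interaction data stored on HDFS (books are nodes; an edge links two books that share readers) and detects book communities as the connected components of that graph.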

In [1]:
import os
import sys
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, desc

# GraphFrames is requested as a Spark package so it is resolved when the
# PySpark JVM is launched; this has to be set before the session is created.
GRAPHFRAMES_PACKAGE = "graphframes:graphframes:0.8.2-spark3.2-s_2.12"
os.environ["PYSPARK_SUBMIT_ARGS"] = f"--packages {GRAPHFRAMES_PACKAGE} pyspark-shell"

# Maximum number of interaction rows to load (0 = no cap, use the full dataset).
LIMIT = 1_000_000

In [2]:
# Start (or reuse) the Spark session that every later cell relies on.
# NOTE(review): no master is configured here, so where the job actually runs
# is decided by the environment's Spark defaults — confirm it targets the cluster.
session_builder = SparkSession.builder.appName("Goodreads_Graph_Clustering")
session_builder = session_builder.config("spark.driver.memory", "8g")
spark = session_builder.getOrCreate()

print("‚úÖ Spark Session created.")

:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/ubuntu/.ivy2/cache
The jars for the packages stored in: /home/ubuntu/.ivy2/jars
graphframes#graphframes added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-3f886886-227f-43bd-860a-47ef7539e92f;1.0
	confs: [default]
	found graphframes#graphframes;0.8.2-spark3.2-s_2.12 in spark-packages
	found org.slf4j#slf4j-api;1.7.16 in central
:: resolution report :: resolve 131ms :: artifacts dl 4ms
	:: modules in use:
	graphframes#graphframes;0.8.2-spark3.2-s_2.12 from spark-packages in [default]
	org.slf4j#slf4j-api;1.7.16 from central in [default]
	---------------------------------------------------------------------
	|                  |            modules            ||   artifacts   |
	|       conf       | number| search|dwnlded|evicted|| number|dwnlded|
	---------------------------------------------------------------------
	|      default     |   2   |   0   |   0   |   0   ||   2   |   0   |
	---------------------------------

‚úÖ Spark Session created.


In [3]:
# HDFS locations: a single processed-data root plus the two datasets under it.
hdfs_base = "hdfs:///user/ubuntu/goodreads_data/processed"
interactions_path = hdfs_base + "/master_interactions"  # parquet input
out_communities = hdfs_base + "/book_communities"  # parquet output

In [4]:
# Load the (user_id, book_id) interaction pairs that the graph is built from.
print("‚è≥ Reading master_interactions from HDFS...")
try:
    df_interactions = spark.read.parquet(interactions_path).select("user_id", "book_id")
    print("Finished reading.")
    if LIMIT > 0:
        print(f"‚ö†Ô∏è Limiting interactions to {LIMIT} rows for fast graph construction.")
        # limit() is applied without any ordering, so the rows that are kept
        # are an arbitrary subset and may differ from one run to the next.
        df_interactions = df_interactions.limit(LIMIT)

except Exception as e:
    print(f"‚ùå master_interactions not found or accessible: {e}")
    # Every later cell needs df_interactions. Re-raise so the notebook stops
    # here with the real cause instead of failing later with a NameError.
    raise

‚è≥ Reading master_interactions from HDFS...


                                                                                

Finished reading.
‚ö†Ô∏è Limiting interactions to 1000000 rows for fast graph construction.


In [5]:
# Build Edges: Co-Readership
# Two books are connected when the same user appears with both of them; the
# edge weight is the number of users the two books share.
print("‚è≥ Building Co-Readership Edges (Self-Join)...")

# Minimum number of shared readers an edge needs in order to be kept.
# Weak connections are dropped to keep the graph small.
MIN_SHARED_READERS = 10

# Collapse repeated (user_id, book_id) rows first. The weight below counts
# joined rows, so a pair listed more than once for a user would otherwise be
# counted several times and overstate the number of shared readers.
unique_interactions = df_interactions.dropDuplicates(["user_id", "book_id"])

# Alias for self-join
df_a = unique_interactions.alias("a")
df_b = unique_interactions.alias("b")

# The strict "<" keeps each unordered book pair once and removes self-pairs.
edges_raw = (
    df_a.join(df_b, col("a.user_id") == col("b.user_id"))
    .filter(col("a.book_id") < col("b.book_id"))
    .select(
        col("a.book_id").alias("src"),
        col("b.book_id").alias("dst"),
    )
)

# Aggregate to get weight (number of shared readers), then drop weak edges.
edges = (
    edges_raw.groupBy("src", "dst")
    .agg(count("*").alias("weight"))
    .filter(col("weight") >= MIN_SHARED_READERS)
)

# Only the query plan has been defined at this point; Spark runs the join
# when a later action (show / count / write) needs the result.
print(f"‚úÖ Edges built (Threshold: {MIN_SHARED_READERS}).")
# edges.show(1)

‚è≥ Building Co-Readership Edges (Self-Join)...
‚úÖ Edges built (Threshold: 1).


In [6]:
# Build Nodes
# The vertex set is every book id that appears at either end of an edge,
# exposed in a single column named "id".
src_ids = edges.select(col("src").alias("id"))
dst_ids = edges.select(col("dst").alias("id"))
nodes = src_ids.union(dst_ids).distinct()
print("Finished calculating nodes.")
# Size checks are left disabled; each count() below would launch a Spark job.
# print(f"üîπ Nodes Count: {nodes.count()}")
# print(f"üîπ Edges Count: {edges.count()}")

Finished calculating nodes.


In [7]:
# Create GraphFrame from the vertex and edge DataFrames.
# NOTE(review): the import is kept in this cell rather than at the top,
# presumably because the graphframes module only becomes importable once the
# Spark session has loaded the package — confirm before moving it.
try:
    from graphframes import GraphFrame
    g = GraphFrame(nodes, edges)
    print("‚úÖ GraphFrame created.")
except ImportError:
    print("‚ùå GraphFrames not installed. Ensure the package is loaded.")
    # Without the package there is no graph `g` for the next cell. Re-raise so
    # the run stops here instead of failing later with a NameError.
    raise

‚úÖ GraphFrame created.




In [8]:
# Run Connected Components: label each book with the component it belongs to,
# preview a few rows, then persist the labelling to HDFS.
# (An emptiness guard on nodes.count() used to wrap this cell; it is disabled,
# so the cell runs unconditionally.)
print("‚è≥ Running Connected Components...")

# A checkpoint directory has to be configured before connectedComponents() runs.
checkpoint_dir = "hdfs:///user/ubuntu/graphframes_checkpoints"
sc = spark.sparkContext
sc.setCheckpointDir(checkpoint_dir)

result = g.connectedComponents()

print("‚úÖ Connected Components done.")
result.show(5)

# Save Communities (any previous output at this path is replaced).
print(f"üíæ Saving communities to HDFS: {out_communities}")
communities_writer = result.write.mode("overwrite")
communities_writer.parquet(out_communities)
print("üéâ DONE! Communities saved.")

‚è≥ Running Connected Components...




‚úÖ Connected Components done.


                                                                                

+--------+---------+
|      id|component|
+--------+---------+
| 1000230|        1|
|10011065|        3|
|10017650|        5|
| 1002096|        6|
|10023828|        8|
+--------+---------+
only showing top 5 rows

üíæ Saving communities to HDFS: hdfs:///user/ubuntu/goodreads_data/processed/book_communities


                                                                                

üéâ DONE! Communities saved.
