In [1]:
import time
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import IntegerType, BooleanType, DateType, StructType, StructField
from pyspark.sql.functions import explode, col, split, array, array_min, concat, least, collect_set, size, sum, min


BUCKET_INPUT_PATH = "gs://iasd-input-data"
DATASET_PATHS = {
    "notre_dame": f"{BUCKET_INPUT_PATH}/web-NotreDame.txt",
    "berk_stan.txt": f"{BUCKET_INPUT_PATH}/web-BerkStan.txt",
    "stanford.txt": f"{BUCKET_INPUT_PATH}/web-Stanford.txt",
    "google.txt": f"{BUCKET_INPUT_PATH}/web-Google.txt"
}

dataset_path = f"{BUCKET_INPUT_PATH}/test.txt"



In [2]:
spark_session = SparkSession \
    .builder \
    .appName("Python Spark SQL basic example") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()

spark_context = spark_session.sparkContext

spark_session

In [3]:
def load_df(path):
    return spark_session.read.format("csv").option("header","false")\
                .load(path)


def preprocess_df(df):
    col_name = df.columns[0]
    return df.filter(f"{col_name} NOT LIKE '#%'")\
                .withColumn('k', split(df[col_name], '\t').getItem(0)) \
                .withColumn('v', split(df[col_name], '\t').getItem(1)) \
                .drop(col_name)\
                .withColumn("k",col("k").cast(IntegerType())) \
                .withColumn("v",col("v").cast(IntegerType()))

In [None]:
df_raw = load_df(dataset_path)
df_raw.show(6)

In [None]:
df = preprocess_df(df_raw)
df.show(5)

In [4]:
def iterate_map_df(df):
    return df.union(df.select(col("v").alias("k"), col("k").alias("v")))

In [5]:
def iterate_reduce_df(df):
    global nb_new_pair

    df = df.groupBy(col("k")).agg(collect_set("v").alias("v"))\
                                            .withColumn("min", least(col("k"), array_min("v")))\
                                            .filter((col("k")!=col('min')))

    nb_new_pair += df.withColumn("count", size("v")-1).select(sum("count")).collect()[0][0]

    return df.select(col("min").alias("a_min"), concat(array(col("k")), col("v")).alias("valueList"))\
                                                    .withColumn("valueList", explode("valueList"))\
                                                    .filter((col('a_min')!=col('valueList')))\
                                                    .select(col('a_min').alias("k"), col('valueList').alias("v"))

In [6]:
from pyspark.sql.functions import explode, col, split, array, array_min, concat, least, collect_set, size, sum

nb_new_pair = sc.accumulator(0)  


def compute_cc_df(df):
    nb_iteration = 0
    while True:
        nb_iteration += 1
        nb_pairs_start = nb_new_pair.value

        df = iterate_map_df(df)
        df = iterate_reduce_df(df)
        df = df.distinct()
        
        print(f"Number of new pairs for iteration #{nb_iteration}:\t{nb_new_pair.value}")
        if nb_pairs_start == nb_new_pair.value:
            print("\nNo new pair, end of computation")
            break

    return df

In [7]:
df_raw = load_df(dataset_path)
df = preprocess_df(df_raw)

start_time = time.time()
df = compute_cc_df(df)
print(f"Nb of connected components in the graph: {df.select('k').distinct().count()}")
print(f"Duration in seconds: {time.time() - start_time}")

                                                                                

Number of new pairs for iteration #1:	4


                                                                                

Number of new pairs for iteration #2:	13
Number of new pairs for iteration #3:	17
Number of new pairs for iteration #4:	17

No new pair, end of computation


[Stage 130:>                                                        (0 + 4) / 4]

Nb of connected components in the graph: 2
Duration in seconds: 25.979886770248413


                                                                                