In [1]:
# Run findspark's initialisation before the pyspark imports in the next cell.
import findspark
findspark.init()

In [2]:
import sys
import time
import json
import os

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
# Arguments exported for PySpark's submit step: request the GraphFrames
# package (0.6.0 build for Spark 2.3 / Scala 2.11).
# Exported here, before the SparkContext is constructed in the next cell.
SUBMIT_ARGS = "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11  pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

In [4]:
# Spark configuration: app name "task1", local master with 3 worker threads,
# 4g of driver memory.
conf = (
    SparkConf()
    .setAppName("task1")
    .setMaster("local[3]")
    .set("spark.driver.memory", "4g")
)

# Create the context from that configuration and only log errors.
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [5]:
# Make the files listed in spark.submit.pyFiles importable from this kernel.
# The setting may be absent; in that case nothing is added (wrapping the
# missing value in str() would put the literal 'None' on sys.path).
_py_files_setting = sc.getConf().get('spark.submit.pyFiles')
pyfiles = [p for p in _py_files_setting.split(',') if p] if _py_files_setting else []
# Skip entries that are already present so re-running the cell does not
# keep growing sys.path.
sys.path.extend(p for p in pyfiles if p not in sys.path)

In [6]:
from graphframes import *

In [7]:
# Value passed as maxIter to g.labelPropagation(...) further down.
LPA_MAXITER = 5

In [8]:
# Two uids are linked when their bid sets share at least this many entries.
filter_threshold = 7
# NOTE(review): absolute path into one user's home directory; the notebook
# needs this edited before it can run on another machine.
input_file_path = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/homework/hw4/data/ub_sample_data.csv"
# Destination used by outputAsFile(); relative to the kernel's working directory.
community_output_file_path = "./output/task1_jupyter.txt"

In [9]:
def outputAsFile(res, output_path=None):
    """Write communities to a text file, one community per line.

    Each member is wrapped in single quotes and members are separated by
    ", ", e.g. ``'a', 'b'``. A one-member community is written as ``'a'``.

    Args:
        res: iterable of communities; each community is a sequence of ids.
        output_path: destination file. When omitted, the notebook-level
            ``community_output_file_path`` is used.
    """
    if output_path is None:
        output_path = community_output_file_path

    # open() does not create missing directories, so make sure the parent
    # folder exists before writing.
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(output_path, 'w', encoding='utf-8') as fp:
        for t in res:
            # join() over a single member yields just that member, so the
            # one-element case needs no separate branch.
            fp.write("'" + "', '".join(str(member) for member in t) + "'\n")

In [10]:
# Load the CSV as raw text lines; the first line is the header.
data_with_header = sc.textFile(input_file_path)
header = data_with_header.first()

# De-duplicated rows, split on commas, with the header line removed.
distinct_rows = (
    data_with_header
    .filter(lambda line: line != header)
    .map(lambda line: tuple(line.split(',')))
    .distinct()
)

# baskets - [(uid, [bid, ...]), ...]
baskets = (
    distinct_rows
    .map(lambda row: (row[0], [row[1]]))
    .reduceByKey(lambda left, right: left + right)
    .collect()
)

In [12]:
# Link every pair of uids whose bid sets share at least `filter_threshold`
# entries, and remember which uids ended up with at least one link.
uids = [x[0] for x in baskets]
baskets_length = len(baskets)
# Build each bid set once instead of rebuilding it for every compared pair.
bid_sets = [set(basket[1]) for basket in baskets]
edges_list = []
vertices_have_link = []
# Companion set for O(1) membership tests; the list keeps first-seen order.
linked_uids = set()
for i in range(baskets_length):
    uid_i = uids[i]
    bids_i = bid_sets[i]
    for j in range(i + 1, baskets_length):
        if len(bids_i & bid_sets[j]) < filter_threshold:
            continue
        uid_j = uids[j]
        # Record the edge in both directions.
        edges_list.append((uid_i, uid_j, 1))
        edges_list.append((uid_j, uid_i, 1))
        for uid in (uid_i, uid_j):
            if uid not in linked_uids:
                linked_uids.add(uid)
                vertices_have_link.append(uid)

In [13]:
# Obtain the SparkSession that provides the DataFrame API.
session_builder = SparkSession.builder.config(conf=SparkConf())
spark = session_builder.getOrCreate()

In [14]:
# Wrap each linked uid in a 1-tuple so it becomes a single-column row.
uids_transform = list(map(lambda vertex_id: (vertex_id,), vertices_have_link))
vertex_columns = ["id"]
vertices = spark.createDataFrame(uids_transform, vertex_columns)

In [15]:
# One row per (source uid, destination uid, 1) tuple collected in edges_list.
edge_columns = ["src", "dst", "relationship"]
edges = spark.createDataFrame(edges_list, edge_columns)

In [16]:
# Preview the edge table to sanity-check the src/dst/relationship columns.
edges.show()

+--------------------+--------------------+------------+
|                 src|                 dst|relationship|
+--------------------+--------------------+------------+
|39FT2Ui8KUXwmUt6h...|0FVcoJko1kfZCrJRf...|           1|
|0FVcoJko1kfZCrJRf...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|JM0GL6Dx4EuZ1mprL...|           1|
|JM0GL6Dx4EuZ1mprL...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|bSUS0YcvS7UelmHvC...|           1|
|bSUS0YcvS7UelmHvC...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|DKolrsBSwMTpTJL22...|           1|
|DKolrsBSwMTpTJL22...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|sdLns7062kz3Ur_b8...|           1|
|sdLns7062kz3Ur_b8...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|_VTEyUzzH92X3w-Ip...|           1|
|_VTEyUzzH92X3w-Ip...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|qtOCfMTrozmUSHWIc...|           1|
|qtOCfMTrozmUSHWIc...|39FT2Ui8KUXwmUt6h...|           1|
|39FT2Ui8KUXwmUt6h...|zBi_JWB5u

In [17]:
# Assemble the graph from the vertex ("id") and edge ("src", "dst") DataFrames.
g = GraphFrame(vertices, edges)

In [18]:
# Run label propagation with maxIter=LPA_MAXITER; the resulting DataFrame
# carries an "id" and a "label" column per vertex (see the preview below).
communities = g.labelPropagation(maxIter=LPA_MAXITER)

In [19]:
# Preview the first rows of the id -> label assignment.
communities.show()

+--------------------+-------------+
|                  id|        label|
+--------------------+-------------+
|gH0dJQhyKUOVCKQA6...| 146028888064|
|oegRUjhGbP62M18Wy...|1005022347264|
|2quguRdKBzul3GpRi...|1425929142273|
|DPtOaWemjBPvFiZJB...|  17179869185|
|Ih85YhFRDzOnB09yS...|1005022347264|
|ZW-XoteNlRuuK-19q...| 532575944705|
|tekHDsd0fskYG3tqu...|  17179869185|
|OoyQYSeYNyRVOmdO3...|1425929142273|
|dTeSvET2SR5LDF_J0...|1425929142273|
|wXdrUQg4-VkSZH1FG...| 644245094400|
|mu4XvWvJOb3XpG1C_...|  17179869185|
|FyQrUamokaPLDrBxG...| 584115552258|
|ZXyGw3Z1DyhK1sfNt...| 584115552258|
|XrRLaAeV20MRwdSIG...| 103079215104|
|tAcY4S3vIuNlAoRlC...| 532575944704|
|eqWEgMH-DCP74i82B...|1649267441664|
|TjsBbWAfwxWEXPxaL...|1425929142273|
|lJFBgSAccsMGwIjfD...| 274877906944|
|Nf_Jw_W_CwOz5WJ7A...| 584115552258|
|cIbbfJEGLB3B-c8Po...|  17179869185|
+--------------------+-------------+
only showing top 20 rows



In [20]:
# Group vertex ids by community label and sort the members inside each
# community. Communities are then ordered by size and, for equal sizes, by
# their first member.
# A single composite sort key is used on purpose: chaining two sortBy calls
# relies on the second sort preserving the order produced by the first,
# which is not guaranteed once the data is shuffled again.
result = (
    communities.groupby('label')
    .agg(F.collect_list('id').alias('collect'))
    .select('collect')
    .rdd
    .map(tuple)
    .map(lambda x: tuple(sorted(x[0])))
    .sortBy(keyfunc=lambda x: (len(x), x[0]))
    .collect()
)

In [21]:
# Print every community (a tuple of sorted ids), one per line.
for x in result:
    print(x)

('23y0Nv9FFWn_3UWudpnFMA',)
('3Vd_ATdvvuVVgn_YCpz8fw',)
('453V8MlGr8y61PpsDAFjKQ',)
('46HhzhpBfTdTSB5ceTx_Og',)
('9W73B44Iw8WslrTNB2CdCg',)
('Cf0chERnfd06ltnN45xLNQ',)
('F47atsRPw-KHmRVk5exBFw',)
('JeOHA8tW7gr-FDYOcPJoeA',)
('KBoIRjxSW7OWczv8OS9Bew',)
('QYKexxaOJQlseGWmc6soRg',)
('Si3aMsOVGSVlsc54iuiPwA',)
('YVQFzWm0H72mLUh-8gzd5w',)
('_m1ot2zZetDgjerAD2Sidg',)
('d5WLqmTMvmL7-RmUDVKqqQ',)
('eqWEgMH-DCP74i82BEAZzw',)
('gH0dJQhyKUOVCKQA6sqAnw',)
('gUu0uaiU7UEUVIgCdnqPVQ',)
('jJDUCuPwVqwjbth3s92whA',)
('jSbXY_rno4hYHQCFftsWXg',)
('tX0r-C9BaHYEolRUfufTsQ',)
('vENR70IrUsDNTDebbuxyQA',)
('0QREkWHGO8-Z_70qx1BIWw', 'KHjroLTG6Ah8LyItTyB2yw')
('2GUjO7NU88cPXpoffYCU8w', 'BDmxm7aeWFOLT35gSvkmig')
('6YmRpoIuiq8I19Q8dHKTHw', 'frQs7y5qa-X1pvAM0sJe1w')
('6xi9tBoZ6r_v41u_XFsSnA', 'XrRLaAeV20MRwdSIGjj2SQ')
('98rLDXbloLXekGjieuQSlA', 'MJ0Wphhko2-LbJ0uZ5XyQA')
('Ams0iLRd0AhZZectGKA8fw', 'EiwxlbR8fb68lMgEXhcWKA')
('Gr-MqCunME2K_KmsAwjpTA', '_6Zg4ukwS0kst9UtkfVw3w')
('QRsuZ_LqrRU65dTs5CL4Lw', 'lJFBgSAccsMGw

In [22]:
# Number of communities collected in `result`.
len(result)

44