In [1]:
import findspark
findspark.init()

In [2]:
import sys
import time
import json
import os

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [3]:
SUBMIT_ARGS = "--packages graphframes:graphframes:0.6.0-spark2.3-s_2.11  pyspark-shell"
os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS

In [4]:
conf = SparkConf() \
    .setAppName("task1") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [5]:
pyfiles = str(sc.getConf().get(u'spark.submit.pyFiles')).split(',')
sys.path.extend(pyfiles)

In [6]:
from graphframes import *

In [7]:
filter_threshold = 7
input_file_path = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/homework/hw4/data/ub_sample_data.csv"

In [8]:
data_with_header = sc.textFile(input_file_path)
header = data_with_header.first()
baskets = data_with_header.filter(lambda l: l != header) \
    .map(lambda l: tuple(l.split(','))) \
    .distinct() \
    .map(lambda x: (x[0], [x[1]])) \
    .reduceByKey(lambda x, y: x + y) \
    .collectAsMap()

In [9]:
baskets['H6QSYPYJFAW2wGOyn12SYg']

['j7zJxmr8BfYJhC3KW9BshA',
 'MHxce6ztMkbYHR8z3Z1q3g',
 'w1Bpt8xEginRIkQiOAN9Gg',
 '0044Q4cVBcHBgXPmhGgtcQ',
 'oNn16x2Ubv60RAibmJ2uSA',
 'P4h8kgy_48oE9vCHJc4BCQ',
 'TWqDzegXkyYv7itPSNIhqg']

In [10]:
uids = sorted(list(baskets.keys()))

In [11]:
uids_length = len(uids)

In [27]:
uids_length

3374

In [12]:
edges_list = []
vertices_have_link = set([])
for i in range(uids_length):
    uid1 = uids[i]
    set1 = set(baskets[uid1])
    for j in range(i+1, uids_length):
        uid2 = uids[j]
        set2 = set(baskets[uid2])
        if len(set1.intersection(set2)) >= filter_threshold:
            edges_list.append((uid1, uid2, 1))
            edges_list.append((uid2, uid1, 1))
            vertices_have_link.add(uid1)
            vertices_have_link.add(uid2)

In [13]:
edges_list[:5]

[('0FMte0z-repSVWSJ_BaQTg', '0FVcoJko1kfZCrJRfssfIA', 1),
 ('0FVcoJko1kfZCrJRfssfIA', '0FMte0z-repSVWSJ_BaQTg', 1),
 ('0FVcoJko1kfZCrJRfssfIA', '1KQi8Ymatd4ySAd4fhSfaw', 1),
 ('1KQi8Ymatd4ySAd4fhSfaw', '0FVcoJko1kfZCrJRfssfIA', 1),
 ('0FVcoJko1kfZCrJRfssfIA', '2XYdguaaZ7dgi6fAlddujg', 1)]

In [15]:
len(vertices_have_link)

222

In [16]:
len(edges_list)

996

In [34]:
# because uids is sorted, result_1 is sorted
result_1 = [(u,) for u in uids if u not in vertices_have_link]

In [40]:
result_1

[('---1lKK3aKOuomHnwAkAow',),
 ('--2vR0DIsmQ6WfcSzKWigw',),
 ('--RlSfc-QmcHFGHyX6aVjA',),
 ('--YhjyV-ce1nFLYxP49C5A',),
 ('-0HhZbPBlB1YZx3BhAfaEA',),
 ('-267Yx8RmdP6io2-qI4UcQ',),
 ('-2kCxY7_aw5hOz7fJnGMbQ',),
 ('-3s52C4zL_DHRK0ULG6qtg',),
 ('-50XWnmQGqBgEI-9ANvLlg',),
 ('-6559fkJ6rCWIZDbqVUomA',),
 ('-7bM_DeL2Kj2CuYuVDsLNg',),
 ('-8syaSLDbMXW2F2FApVgig',),
 ('-9b4s874f_CnznTu4JorRg',),
 ('-9da1xk7zgnnfO1uTVYGkA',),
 ('-ARdx8hOcEWlMDjzwLYZ_g',),
 ('-Ak9LVDH8GYyWtIMnNo5Ug',),
 ('-Anyb0vB5LrW273whytNRw',),
 ('-C-l8EHSLXtZZVfUAUhsPA',),
 ('-EJorVxe7h2GSxdiRyMmDA',),
 ('-Fy91nyOFqPv9M_MaZ4W2g',),
 ('-KpEgEen1tj-jdjIS7uVOw',),
 ('-OY3fkHVYy0Dx160rKCiWQ',),
 ('-P3SyBLmBhyhDcYatlBgBQ',),
 ('-PHC1ulwHkY4LEmMqmFwPg',),
 ('-Q4HGzcXSCeiqvmY2mg-aQ',),
 ('-QmmHtp57b3zT79APvRONA',),
 ('-R3Liu8xDWu-VpWSW8YlKA',),
 ('-RA9NLalwmRTOX_8UMHnVQ',),
 ('-RApyq06DMBii7ovXtQnfw',),
 ('-ShdX4pDKrldKfic9rHhSQ',),
 ('-TMDrC66dvClx5Z7Hdzrfw',),
 ('-VNv58eLhbQpz787rcD8VA',),
 ('-WMhXcdzVV-o3PN06-yzRA',),
 ('-Y6tXYP

In [17]:
spark = SparkSession.builder.config(conf=SparkConf()).getOrCreate()

In [18]:
uids_transform = [(uid,) for uid in vertices_have_link]
vertices = spark.createDataFrame(uids_transform, ["id"])

In [19]:
edges = spark.createDataFrame(edges_list, ["src", "dst", "relationship"])

In [20]:
g = GraphFrame(vertices, edges)

In [21]:
g.inDegrees.show()

+--------------------+--------+
|                  id|inDegree|
+--------------------+--------+
|m1IVpXClMox1VGw5h...|       4|
|FyQrUamokaPLDrBxG...|      14|
|OoyQYSeYNyRVOmdO3...|       3|
|waN6iwcphiVEoCews...|       1|
|0FMte0z-repSVWSJ_...|       1|
|JM0GL6Dx4EuZ1mprL...|      28|
|l-1cva9rA8_ugLrtS...|       7|
|kKTcYPz47sCDH1_yl...|       1|
|7G8w2SnaC-qDVQ7_G...|       3|
|1KQi8Ymatd4ySAd4f...|       3|
|H4EQn0rjFuGRgIm6c...|       1|
|MwpK7PqQX7fgTFM2P...|       1|
|ELfzWgdf64VBLi5z1...|       1|
|TZ974xcbw2kqjYxAh...|       1|
|fcWM-oqjgS94yi1IN...|       1|
|4pc_EyanaC3ARh0MZ...|       7|
|R4l3ONHzGBakKKNo4...|       2|
|j8Dts8irvVBwEhEEa...|       1|
|2GUjO7NU88cPXpoff...|       3|
|Z9a1tDT8fVI75qXYw...|       3|
+--------------------+--------+
only showing top 20 rows



In [22]:
communities = g.labelPropagation(maxIter=5)

In [23]:
communities.show()

+--------------------+-------------+
|                  id|        label|
+--------------------+-------------+
|gH0dJQhyKUOVCKQA6...| 146028888064|
|oegRUjhGbP62M18Wy...|1005022347264|
|2quguRdKBzul3GpRi...|1425929142273|
|DPtOaWemjBPvFiZJB...|  17179869185|
|Ih85YhFRDzOnB09yS...|1005022347264|
|ZW-XoteNlRuuK-19q...| 532575944705|
|tekHDsd0fskYG3tqu...|  17179869185|
|OoyQYSeYNyRVOmdO3...|1425929142273|
|dTeSvET2SR5LDF_J0...|1425929142273|
|wXdrUQg4-VkSZH1FG...| 644245094400|
|mu4XvWvJOb3XpG1C_...|  17179869185|
|FyQrUamokaPLDrBxG...| 584115552258|
|ZXyGw3Z1DyhK1sfNt...| 584115552258|
|XrRLaAeV20MRwdSIG...| 103079215104|
|tAcY4S3vIuNlAoRlC...| 532575944704|
|eqWEgMH-DCP74i82B...|1649267441664|
|TjsBbWAfwxWEXPxaL...|1425929142273|
|lJFBgSAccsMGwIjfD...| 274877906944|
|Nf_Jw_W_CwOz5WJ7A...| 584115552258|
|cIbbfJEGLB3B-c8Po...|  17179869185|
+--------------------+-------------+
only showing top 20 rows



In [51]:
result_2 = communities.groupby('label') \
    .agg(F.collect_list('id').alias('collect')) \
    .select('collect') \
    .rdd \
    .map(tuple) \
    .map(lambda x: tuple(sorted(x[0]))) \
    .collect()

In [36]:
result = result_1 + result_2

In [46]:
result = sorted(sorted(result, key=lambda x: x[0]), key=lambda x: len(x))

In [47]:
for r in result:
    print(r)

('---1lKK3aKOuomHnwAkAow',)
('--2vR0DIsmQ6WfcSzKWigw',)
('--RlSfc-QmcHFGHyX6aVjA',)
('--YhjyV-ce1nFLYxP49C5A',)
('-0HhZbPBlB1YZx3BhAfaEA',)
('-267Yx8RmdP6io2-qI4UcQ',)
('-2kCxY7_aw5hOz7fJnGMbQ',)
('-3s52C4zL_DHRK0ULG6qtg',)
('-50XWnmQGqBgEI-9ANvLlg',)
('-6559fkJ6rCWIZDbqVUomA',)
('-7bM_DeL2Kj2CuYuVDsLNg',)
('-8syaSLDbMXW2F2FApVgig',)
('-9b4s874f_CnznTu4JorRg',)
('-9da1xk7zgnnfO1uTVYGkA',)
('-ARdx8hOcEWlMDjzwLYZ_g',)
('-Ak9LVDH8GYyWtIMnNo5Ug',)
('-Anyb0vB5LrW273whytNRw',)
('-C-l8EHSLXtZZVfUAUhsPA',)
('-EJorVxe7h2GSxdiRyMmDA',)
('-Fy91nyOFqPv9M_MaZ4W2g',)
('-KpEgEen1tj-jdjIS7uVOw',)
('-OY3fkHVYy0Dx160rKCiWQ',)
('-P3SyBLmBhyhDcYatlBgBQ',)
('-PHC1ulwHkY4LEmMqmFwPg',)
('-Q4HGzcXSCeiqvmY2mg-aQ',)
('-QmmHtp57b3zT79APvRONA',)
('-R3Liu8xDWu-VpWSW8YlKA',)
('-RA9NLalwmRTOX_8UMHnVQ',)
('-RApyq06DMBii7ovXtQnfw',)
('-ShdX4pDKrldKfic9rHhSQ',)
('-TMDrC66dvClx5Z7Hdzrfw',)
('-VNv58eLhbQpz787rcD8VA',)
('-WMhXcdzVV-o3PN06-yzRA',)
('-Y6tXYPYqeVy37-L5p0rMw',)
('-YV1yESQXqR3vpIgBjKDsw',)
('-Z0uXJn_uP3U0h-e31

('Zb2T6JhcBGzgc6RWmldbDg',)
('ZhfC1izXG4YY7gkqyAzGOg',)
('ZknZOyIXOvBhMDQQYVh8aA',)
('ZmKGF8ZZUo16g2ISrz4ahw',)
('ZmWLeLU_bGrNiqBVAGo-eg',)
('Zmp1Q6Ul9VH3zL02Z5ls_A',)
('ZnIaEvJwrxlBUjimIpx2Qw',)
('ZngF2uBYrqyFjXd6LvuluQ',)
('Zoec9wehLFa8CV1JnCCVug',)
('Zpk_NpxZ767LP3Zlw87y9w',)
('Zqk-ye_OHeH_sS1JsdsLgg',)
('Zr4N9sG31fybEa-l3a_46Q',)
('ZsjuEgOdvx9l5GKZdtwYIw',)
('Zt8kRfpqdL7cULnAS2f5AQ',)
('ZtWi1WgfiiBlKmBA8VS4NQ',)
('ZudG3YIMP-6xda4yT0OeEQ',)
('_-gnSzjvS0lAwuvpUdJoyA',)
('_-yFnMPrRqMqc07SeIBDDg',)
('_1SKSmLqpHBqWh09DxQMfw',)
('_1pnRlhWYQv9FJ4zAgtFAA',)
('_50EXPLguCJR8o454dtD3w',)
('_5HFgadpCIaSVINVvse2Kw',)
('_5keMAmic7zzrnM5mVefBw',)
('_7PfR6Tvh2xTbiVi1GELoQ',)
('_7zgzdB1Qog-HUWQdbH0pw',)
('_9WN_qmjbbRu6eFCMVXokw',)
('_A8CO1V40LPIkXqE-LTYTg',)
('_ABftW6CZdx4iMdHaCkiZA',)
('_FaY6Tr0EE5KKylpPjw1xw',)
('_GCtk6kjbArSxiPPLlAfMA',)
('_GX0dMS_5sJoaKmDfY8SwA',)
('_IfD6K58QjUR-W-fXdXYnQ',)
('_JpQ84FfRPG0TTw_T0zR_A',)
('_MjgqPR1pvDnZZ6wCwabBQ',)
('_O34OQgnyeJ-9w3VnjDR1Q',)
('_O5ajREINPG9zeyhW5

In [48]:
def outputAsFile(res):
    with open(community_output_file_path, 'w', encoding='utf-8') as fp:
        for t in res:
            if len(t) == 1:
                fp.write("'%s'\n" % (t[0]))
            else:
                fp.write("'" + ("', '").join(t) + "'\n")

In [49]:
community_output_file_path = "./jupyter_task1_output.txt"

In [50]:
outputAsFile(result)