# Finding mutual friends using Spark

In [1]:
from pyspark import SparkContext

# Reuse the active SparkContext if one exists, otherwise create a new one,
# so this cell can be re-run safely.
sc = SparkContext.getOrCreate()

### Get an RDD of the friend-list lines

In [2]:
# Load the friend list as an RDD of text lines; each line has the form
# "<user> <friend>,<friend>,...".
lines = sc.textFile("data/friend_list")
# Preview up to the first 10 lines.
lines.take(10)

['0 1,2,3',
 '1 0,2,4,5,6',
 '2 0,1,4,6,7,8',
 '3 0,4,7',
 '4 1,2,3,5,8',
 '5 1,4',
 '6 1,2',
 '7 2,3',
 '8 2,4']

### Map each line to (friend pair, candidate mutual friends) records

In [3]:
def gen_pair_of_friends(row):
    """
    Emit one (pair_key, candidate_mutual_friends) record per friendship.

    ``row`` is one line of the friend list: a user id, whitespace, then
    a comma-separated list of that user's friends. For each friend the
    key is the two ids sorted and concatenated, and the value is the set
    of the user's *other* friends, i.e. every possible mutual friend of
    that pair as seen from this user's side.
    E.g. given 1 2,3,4 return
    12:3,4
    13:2,4
    14:2,3
    similarly for 2 1,3
    12:3
    23:1

    Blank lines and users without any friends produce an empty list
    instead of raising, and empty tokens (trailing comma, stray
    whitespace) are ignored rather than treated as a friend id.

    NOTE: the key is a plain concatenation, so it is only unambiguous
    when all ids have the same width (e.g. ids "1","23" and "12","3"
    both give "123"). The format is kept for backward compatibility.

    Returns a list of ``(str, set)`` tuples, in no particular order.
    """
    # Split on the first run of whitespace only, so tabs, repeated spaces
    # and spaces after commas do not break the two-field unpacking.
    fields = row.split(None, 1)
    if len(fields) < 2:
        # Blank line, or a user id with no friend list at all.
        return []
    first, raw_friends = fields

    seconds = {token.strip() for token in raw_friends.split(",")}
    # An empty token is not a friend id; it comes from ",," or a trailing ",".
    seconds.discard("")

    # For each friend, the candidates are all the other friends of `first`
    # (the friend itself is removed from its own candidate set).
    return [
        ("".join(sorted([first, sec])), seconds - {sec})
        for sec in seconds
    ]

In [4]:
# flatMap: every input line yields several (pair_key, candidates) records,
# which are flattened into a single RDD of records.
pairs_of_friends = lines.flatMap(gen_pair_of_friends)
# Preview up to the first 10 records.
pairs_of_friends.take(10)

[('01', {'2', '3'}),
 ('03', {'1', '2'}),
 ('02', {'1', '3'}),
 ('15', {'0', '2', '4', '6'}),
 ('12', {'0', '4', '5', '6'}),
 ('01', {'2', '4', '5', '6'}),
 ('14', {'0', '2', '5', '6'}),
 ('16', {'0', '2', '4', '5'}),
 ('27', {'0', '1', '4', '6', '8'}),
 ('02', {'1', '4', '6', '7', '8'})]

### shuffle same keys together

In [5]:
# Bring every candidate set emitted for the same friend pair together.
mutual_friend = pairs_of_friends.groupByKey()

# The grouped values are an iterable rather than a list, so materialise
# each one as a list to make the preview readable.
mutual_friend.mapValues(list).take(10)

[('03', [{'1', '2'}, {'4', '7'}]),
 ('12', [{'0', '4', '5', '6'}, {'0', '4', '6', '7', '8'}]),
 ('14', [{'0', '2', '5', '6'}, {'2', '3', '5', '8'}]),
 ('16', [{'0', '2', '4', '5'}, {'2'}]),
 ('24', [{'0', '1', '6', '7', '8'}, {'1', '3', '5', '8'}]),
 ('26', [{'0', '1', '4', '7', '8'}, {'1'}]),
 ('34', [{'0', '7'}, {'1', '2', '5', '8'}]),
 ('45', [{'1', '2', '3', '8'}, {'1'}]),
 ('48', [{'1', '2', '3', '5'}, {'2'}]),
 ('01', [{'2', '3'}, {'2', '4', '5', '6'}])]

### Reduce each group to the actual mutual friends

In [6]:
def reducer(result):
    """
    Turn the grouped candidate sets of a friend pair into the pair's
    actual mutual friends.

    ``result`` is a ``(pair_key, candidate_sets)`` tuple, where
    ``candidate_sets`` is an iterable holding one set per user of the
    pair (each user's other friends). The mutual friends are the
    intersection of those sets, e.g.
    ('12', [{'0', '4', '5', '6'}, {'0', '4', '6', '7', '8'}])
    gives
    ('12', {'0', '4', '6'})

    If fewer than two sets were received, only one side of the pair
    listed the other (or the group is empty), so no mutual friend can
    be confirmed and an empty set is returned.

    Returns a ``(pair_key, set)`` tuple.
    """
    pair, groups = result
    # The grouped values may be a one-shot iterable; materialise them once.
    candidate_sets = [set(group) for group in groups]
    if len(candidate_sets) < 2:
        # Intersecting a single set would report one user's whole friend
        # list as "mutual"; intersecting nothing would raise TypeError.
        return (pair, set())
    return (pair, set.intersection(*candidate_sets))

In [7]:
# Apply the reducer to every grouped pair and preview up to the first 10
# (pair_key, mutual_friends) results.
mutual_friend.map(reducer).take(10)

[('03', set()),
 ('12', {'0', '4', '6'}),
 ('14', {'2', '5'}),
 ('16', {'2'}),
 ('24', {'1', '8'}),
 ('26', {'1'}),
 ('34', set()),
 ('45', {'1'}),
 ('48', {'2'}),
 ('01', {'2'})]