In [None]:
# Environment setup: install PySpark and PyDrive from PyPI and an OpenJDK 8
# package via apt (quiet flags keep the install output short).
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
import os
# Export JAVA_HOME for this process; the path is presumably where the
# openjdk-8 package above installs -- verify if the base image changes.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

In [None]:
# Let's import the libraries we will need
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf

In [None]:
import re, sys, operator
from pyspark import SparkConf, SparkContext
# getOrCreate() returns the already-running SparkContext when this cell is
# re-executed; calling the SparkContext constructor a second time in the same
# kernel raises an error because only one context may be active at once.
sc = SparkContext.getOrCreate(conf=SparkConf())

In [None]:
# Each line of the input file becomes one basket: the line is split on
# whitespace into a list of item identifiers.
lines = sc.textFile('/content/browsing.txt')
baskets = lines.map(lambda l: l.split())
# Total number of baskets (one per input line); this triggers a Spark job.
N = baskets.count()

In [None]:
# Preview the first five parsed baskets.
baskets.take(5)

[['FRO11987', 'ELE17451', 'ELE89019', 'SNA90258', 'GRO99222'],
 ['GRO99222',
  'GRO12298',
  'FRO12685',
  'ELE91550',
  'SNA11465',
  'ELE26917',
  'ELE52966',
  'FRO90334',
  'SNA30755',
  'ELE17451',
  'FRO84225',
  'SNA80192'],
 ['ELE17451', 'GRO73461', 'DAI22896', 'SNA99873', 'FRO86643'],
 ['ELE17451', 'ELE37798', 'FRO86643', 'GRO56989', 'ELE23393', 'SNA11465'],
 ['ELE17451',
  'SNA69641',
  'FRO86643',
  'FRO78087',
  'SNA11465',
  'GRO39357',
  'ELE28573',
  'ELE11375',
  'DAI54444']]

In [None]:
# Display the basket count computed when the file was loaded.
N

31101

In [None]:
# Canonicalize every basket: drop repeated items and sort the rest, so each
# pair/triple generated later has exactly one ordered representation.
# cache() keeps the result in memory because this RDD is scanned by several
# later jobs (item, pair and triple counting); without it every job re-reads
# and re-parses the input file.
baskets = baskets.map(lambda b: sorted(set(b))).cache()

PythonRDD[5] at RDD at PythonRDD.scala:53

In [None]:
# Preview the first five baskets after de-duplication and sorting.
baskets.take(5)

[['ELE17451', 'ELE89019', 'FRO11987', 'GRO99222', 'SNA90258'],
 ['ELE17451',
  'ELE26917',
  'ELE52966',
  'ELE91550',
  'FRO12685',
  'FRO84225',
  'FRO90334',
  'GRO12298',
  'GRO99222',
  'SNA11465',
  'SNA30755',
  'SNA80192'],
 ['DAI22896', 'ELE17451', 'FRO86643', 'GRO73461', 'SNA99873'],
 ['ELE17451', 'ELE23393', 'ELE37798', 'FRO86643', 'GRO56989', 'SNA11465'],
 ['DAI54444',
  'ELE11375',
  'ELE17451',
  'ELE28573',
  'FRO78087',
  'FRO86643',
  'GRO39357',
  'SNA11465',
  'SNA69641']]

In [None]:
def singles_helper(basket):
    """Turn one basket into ``(item, 1)`` records for support counting.

    Args:
        basket: iterable of item identifiers.

    Returns:
        A list holding one ``(item, 1)`` tuple per element of ``basket``,
        in the same order as the basket.
    """
    return [(item, 1) for item in basket]

In [None]:
# Emit one (item, 1) record for every item of every basket.
singles_support = baskets.flatMap(singles_helper)
singles_support.take(5)

[('ELE17451', 1),
 ('ELE89019', 1),
 ('FRO11987', 1),
 ('GRO99222', 1),
 ('SNA90258', 1)]

In [None]:
# Sum the 1s per item, giving each item's support count.
singles_support = singles_support.reduceByKey(operator.add)
singles_support.take(5)

[('FRO11987', 104),
 ('SNA90258', 550),
 ('ELE52966', 380),
 ('ELE91550', 23),
 ('FRO84225', 74)]

In [None]:
# Keep only frequent items: those with a support count of at least 100.
singles_support = singles_support.filter(lambda x: x[1] >= 100)
singles_support.take(5)

[('FRO11987', 104),
 ('SNA90258', 550),
 ('ELE52966', 380),
 ('SNA80192', 258),
 ('DAI22896', 1219)]

In [None]:
# Bring the frequent items back to the driver as a plain {item: support} dict.
# collectAsMap() builds the dictionary directly from the (key, value) pairs,
# replacing the manual collect-and-assign loop and its leftover loop variables.
singles_support_b = singles_support.collectAsMap()

In [None]:
# Share the frequent-item table with the executors as a broadcast variable;
# the helper functions read it through `.value`.
# The name is rebound from dict to Broadcast, so guard on the type: re-running
# this cell must not try to broadcast the Broadcast handle from a previous run.
if isinstance(singles_support_b, dict):
    singles_support_b = sc.broadcast(singles_support_b)

In [None]:
def doubles_helper(basket):
    """Generate candidate pairs of frequent items from one basket.

    Only items found in the broadcast ``singles_support_b`` mapping take part.
    Within each pair the item that appears earlier in ``basket`` comes first,
    so a sorted basket yields lexicographically ordered pairs.

    Args:
        basket: list of item identifiers.

    Returns:
        A list of ``((earlier_item, later_item), 1)`` records.
    """
    frequent_items = singles_support_b.value
    # Dropping infrequent items up front keeps the relative order of the rest,
    # so the pairs come out in the same sequence as a nested index scan.
    kept = [item for item in basket if item in frequent_items]
    pairs = []
    for pos, later in enumerate(kept):
        for earlier in kept[:pos]:
            pairs.append(((earlier, later), 1))
    return pairs

In [None]:
# Emit ((item_a, item_b), 1) for every pair of frequent items sharing a basket.
doubles_support = baskets.flatMap(doubles_helper)
doubles_support.take(5)

[(('ELE17451', 'FRO11987'), 1),
 (('ELE17451', 'GRO99222'), 1),
 (('FRO11987', 'GRO99222'), 1),
 (('ELE17451', 'SNA90258'), 1),
 (('FRO11987', 'SNA90258'), 1)]

In [None]:
# Sum the 1s per pair, giving each pair's support count.
doubles_support = doubles_support.reduceByKey(operator.add)
doubles_support.take(5)

[(('ELE17451', 'GRO99222'), 148),
 (('FRO11987', 'SNA90258'), 2),
 (('ELE17451', 'ELE26917'), 314),
 (('ELE17451', 'GRO12298'), 36),
 (('ELE26917', 'GRO12298'), 17)]

In [None]:
# Keep only frequent pairs: those with a support count of at least 100.
doubles_support = doubles_support.filter(lambda x: x[1] >= 100)
doubles_support.take(5)

[(('ELE17451', 'GRO99222'), 148),
 (('ELE17451', 'ELE26917'), 314),
 (('ELE26917', 'GRO99222'), 192),
 (('ELE17451', 'SNA30755'), 111),
 (('DAI22896', 'GRO73461'), 304)]

In [None]:
def confidence_doubles_helper(double_support):
    """Compute the confidence of both rules derivable from one frequent pair.

    For a pair ``(u, v)`` with support ``s`` the confidences are
    ``conf(u -> v) = s / support(u)`` and ``conf(v -> u) = s / support(v)``,
    where single-item supports come from the broadcast ``singles_support_b``.

    Args:
        double_support: ``((u, v), support)`` record.

    Returns:
        A 2-tuple ``(('u -> v', conf_uv), ('v -> u', conf_vu))``.
    """
    (u, v), pair_count = double_support
    item_counts = singles_support_b.value
    # Force float division regardless of the numeric type of the counts.
    pair_count = float(pair_count)
    return tuple(
        ('%s -> %s' % (lhs, rhs), pair_count / item_counts[lhs])
        for lhs, rhs in ((u, v), (v, u))
    )

In [None]:
# Two rules per frequent pair, (u -> v) and (v -> u), each with its confidence.
doubles_conf = doubles_support.flatMap(confidence_doubles_helper)
doubles_conf.take(5)

[('ELE17451 -> GRO99222', 0.03819354838709677),
 ('GRO99222 -> ELE17451', 0.16335540838852097),
 ('ELE17451 -> ELE26917', 0.08103225806451612),
 ('ELE26917 -> ELE17451', 0.13699825479930192),
 ('ELE26917 -> GRO99222', 0.08376963350785341)]

In [None]:
# Rank rules by confidence, highest first; ties are broken by rule text ascending.
doubles_conf = doubles_conf.sortBy(lambda x: (-x[1], x[0]))
doubles_conf.take(5)

[('DAI93865 -> FRO40251', 1.0),
 ('GRO85051 -> FRO40251', 0.999176276771005),
 ('GRO38636 -> FRO40251', 0.9906542056074766),
 ('ELE12951 -> FRO40251', 0.9905660377358491),
 ('DAI88079 -> FRO40251', 0.9867256637168141)]

In [None]:
# Bring the frequent pairs back to the driver as a {(item_a, item_b): support}
# dict. collectAsMap() builds the dictionary directly from the (key, value)
# pairs, replacing the manual collect-and-assign loop and its leftover loop
# variables.
doubles_support_b = doubles_support.collectAsMap()

In [None]:
# Share the frequent-pair table with the executors as a broadcast variable;
# the helper functions read it through `.value`.
# The name is rebound from dict to Broadcast, so guard on the type: re-running
# this cell must not try to broadcast the Broadcast handle from a previous run.
if isinstance(doubles_support_b, dict):
    doubles_support_b = sc.broadcast(doubles_support_b)

In [None]:
def triples_helper(basket):
    """Generate candidate triples of items from one basket.

    A triple ``(x, y, z)`` (in basket order) is emitted only when all three
    items are in the broadcast ``singles_support_b`` mapping and all three of
    its pairs ``(x, y)``, ``(x, z)`` and ``(y, z)`` are keys of the broadcast
    ``doubles_support_b`` mapping.

    Args:
        basket: list of item identifiers.

    Returns:
        A list of ``((x, y, z), 1)`` records.
    """
    frequent_pairs = doubles_support_b.value
    frequent_items = singles_support_b.value
    # Filtering first keeps the survivors in basket order, so triples are
    # produced in the same sequence as a three-level index scan.
    kept = [item for item in basket if item in frequent_items]
    triples = []
    for last_pos, last in enumerate(kept):
        for mid_pos in range(last_pos):
            middle = kept[mid_pos]
            if (middle, last) not in frequent_pairs:
                continue
            for first in kept[:mid_pos]:
                if ((first, middle) in frequent_pairs
                        and (first, last) in frequent_pairs):
                    triples.append(((first, middle, last), 1))
    return triples

In [None]:
# Candidate triples -> per-triple support counts -> keep counts of at least 100.
triples_support = (
    baskets
    .flatMap(triples_helper)
    .reduceByKey(operator.add)
    .filter(lambda triple_count: triple_count[1] >= 100)
)
triples_support.take(5)

[(('ELE17451', 'SNA59903', 'SNA72163'), 127),
 (('DAI62779', 'ELE17451', 'FRO78087'), 121),
 (('DAI62779', 'ELE17451', 'ELE26917'), 160),
 (('DAI62779', 'ELE17451', 'SNA55762'), 157),
 (('DAI62779', 'ELE17451', 'SNA99873'), 126)]

In [None]:
def confidence_triples_helper(triple_support):
    """Compute the confidence of the three pair -> item rules of one triple.

    For a triple ``(u, v, w)`` with support ``s`` the confidence of
    ``(x, y) -> z`` is ``s / support((x, y))``, where pair supports come from
    the broadcast ``doubles_support_b`` mapping.

    Args:
        triple_support: ``((u, v, w), support)`` record.

    Returns:
        A 3-tuple of ``(rule_text, confidence)`` entries for
        ``(u, v) -> w``, ``(u, w) -> v`` and ``(v, w) -> u``, in that order.
    """
    (u, v, w), triple_count = triple_support
    pair_counts = doubles_support_b.value
    # Force float division regardless of the numeric type of the counts.
    triple_count = float(triple_count)
    rules = (((u, v), w), ((u, w), v), ((v, w), u))
    return tuple(
        ('(%s, %s) -> %s' % (lhs[0], lhs[1], rhs), triple_count / pair_counts[lhs])
        for lhs, rhs in rules
    )

In [None]:
# Three rules per frequent triple, ranked by confidence (highest first) with
# ties broken by rule text ascending.
triples_conf = (
    triples_support
    .flatMap(confidence_triples_helper)
    .sortBy(lambda rule: (-rule[1], rule[0]))
)
triples_conf.take(5)

[('(DAI23334, ELE92920) -> DAI62779', 1.0),
 ('(DAI31081, GRO85051) -> FRO40251', 1.0),
 ('(DAI55911, GRO85051) -> FRO40251', 1.0),
 ('(DAI62779, DAI88079) -> FRO40251', 1.0),
 ('(DAI75645, GRO85051) -> FRO40251', 1.0)]

In [None]:
# Save the five highest-ranked pair rules and triple rules, one list per line.
with open('./out.txt', 'w') as f:
    # print() writes str(obj) followed by `end`: newline after the pair rules,
    # nothing after the triple rules.
    print(doubles_conf.take(5), file=f)
    print(triples_conf.take(5), end='', file=f)