In [None]:
# get dataset
! kaggle datasets download -f actors.csv gsimonx37/letterboxd
! unzip actors.csv.zip
! rm actors.csv.zip

# get whole repo if running in google colab
! git clone https://github.com/mattia01017/movie-actor-mb-analysis
! pip install -r movie-actor-mb-analysis/requirements.txt

# setup Spark
import os
import findspark
! apt-get install openjdk-8-jdk-headless -qq > /dev/null
! wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
! tar xf spark-3.1.1-bin-hadoop3.2.tgz
! pip install -q findspark
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
findspark.init("spark-3.1.1-bin-hadoop3.2")

In [1]:
import csv
import json
import numpy as np
from pympler.asizeof import asizeof
from typing import Iterable
from collections import defaultdict, Counter
from itertools import combinations
from dotenv import load_dotenv
from functools import reduce
from pyspark.sql import SparkSession
import gc

# Market-basket analysis of Letterboxd dataset

## Preprocessing

Starting from a csv table that associate film identifiers to actors, we want to have on disk a list of baskets

In [None]:
data = defaultdict(list)
reader = csv.reader(open("actors.csv"))
next(reader)

for row in reader:
    data[row[0]] = [row[1]]

with open("baskets.txt", "w") as f:
    f.write("\n".join([json.dumps(basket) for basket in data.values()]))


We define an iterator that implement a lazy loading of file data. In this way, we can load in memory a basket at a time instead of the whole dataset.

In [35]:
class Baskets(Iterable):
    def __init__(self, filename, stop: int | None = None) -> None:
        self.filename = filename
        self.stop = stop
        
    def __iter__(self):
        self.file = open(self.filename)
        self.read = 0
        return self
    
    def __next__(self):
        if self.file.closed: raise StopIteration
        line = self.file.readline()
        if not line or self.read == self.stop: 
            self.file.close()
            raise StopIteration
        self.read += 1
        return tuple(json.loads(line))
    

## Algorithms

For the analysis, the Savasere, Omiecinski and Navathe (SON) algorithm will be implemented using the Park, Cheng and Yu (PCY) algorithm for the chunks.

### PCY

First we define a very simple class representing a bitmap useful for the PCY algorithm implementation.

In [2]:
class Bitmap:
    def __init__(self, bits_arr: list) -> None:
        self.bytes = np.packbits(bits_arr, bitorder="little")

    def get(self, index: int) -> bool:
        return bool(self.bytes[index // 8] & pow(2, index % 8))

    def set(self, index: int):
        self.bytes[index // 8] |= pow(2, index % 8)

    def __str__(self) -> str:
        return " ".join(["{0:08b}".format(b) for b in self.bytes])

After that, the PCY algorithm is implemented. The garbage collector is manually triggered for deleting from memory the counters after the bitmap creation.

In [3]:

def hash_t(itemset: tuple) -> int:
    """hash tuple ignoring order"""
    return reduce(lambda p, c: p ^ hash(c), itemset, 0)

def pcy(
    baskets: Iterable[tuple[str]],
    threshold: int,
    buckets: int,
) -> list[tuple]:
    item_counts = Counter()
    itemset_counts = np.zeros(buckets, dtype=np.uint32)

    for basket in baskets:
        for item in basket:
            item_counts[item] += 1
        for itemset in combinations(basket, 2):
            itemset_counts[hash_t(itemset) % buckets] += 1
            
            
    bitmap = Bitmap([count > threshold for count in itemset_counts])
    del itemset_counts
    gc.collect()

    freq_items = [item for item, count in item_counts.items() if count > threshold]
    del item_counts
    gc.collect()

    return [
        itemset
        for itemset in combinations(freq_items, 2)
        if bitmap.get(hash_t(itemset) % buckets)
    ]

PCY alone can be used to retrieve the frequent itemsets using a single node for computation.

In [None]:
pcy(Baskets("baskets.txt"), 100, int(1e9))

The result contains also false positive, that is infrequent itemset put in a frequent bucket. To remove false positives, another pass would be needed. The topic is best delved in SON implementation.

### SON

Execution times can be improved by using SON, parallelizing the execution of PCY on a number of chunks and combining the results. The Apache Spark framework is used for the implementation of the SON algorithm.

In [None]:
load_dotenv()

CHUNKS = 5
spark = SparkSession.builder\
    .appName("movie-actor-mb-analysis")\
    .config("spark.default.parallelism", str(CHUNKS))\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/11/03 17:04:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


The first step is to import the dataset from the CSV file. To get a convenient representation of data, rows will be grouped by film ID to obtain a Spark dataframe with a basket for each row.

In [5]:
df = spark.read.csv("actors.csv", header=True, sep=",", mode="DROPMALFORMED")
baskets = df.rdd\
    .map(lambda x: (x["id"], x["name"]))\
    .groupByKey()\
    .map(lambda x: x[1])


In [33]:
baskets.count()

                                                                                

603162

### Data structures size

The objective is to have the maximum memory usage without swapping and thus thrashing. The main elements to store in memory are:
- The hash table of item counters
- The array of bucket counters
- The bitmap of frequent buckets

The memory usage of the bitmap and the array of counters is easy to predict given the size, more tricky is the hash table of counters. For this purpose, we use a tool for observing memory behaviour of Python objects, namely Pympler. The `asizeof` method return an approximation of the memory usage of an object.

We measure the size of the `Counter` object after counting all items.

In [None]:
counter = Counter(baskets.flatMap(lambda x: x).collect())
print("{0:.3f} MB".format(asizeof(counter) / 1e6))

Thus, we can assume that a single node won't use more than 200 MB for storing the item counters. The remaining space can be used to store the bucket counters. Assuming we want to use up to 2 GB of memory for each computing node, we can use a number of buckets with 32-bit unsigned integer counters equal to:
$$
\frac{2 \cdot 10^9 \text{ B} - 2 \cdot 10^8 \text{ B}}{4 \text{ B}} = 4.5 \cdot 10^8 \text{ buckets}
$$

while the bitmap will occupy $1/32$ of the space.

The last parameters to tune are the thresholds for labelling a bucket in a chunk and an itemset in the whole dataset as frequent. Those will be chosen experimentally in a way to obtain an output of reasonable size, say around 50 itemsets. 

### Map-reduce implementation

In [32]:
BUCKETS = int(1e7)
THRESHOLD = 100

def count_occurrences(baskets, candidates):
    out = {tuple(sorted(k)):0 for k in candidates}
    for basket in baskets:
        for itemset in combinations(basket, 2):
            itemset = tuple(sorted(itemset))
            if itemset in out:
                out[itemset] += 1
    return out.items()


candidates = baskets\
    .mapPartitions(lambda chunk: pcy(chunk, THRESHOLD // CHUNKS, BUCKETS))\
    .distinct()\
    .collect()

frequent_itemsets = baskets\
    .mapPartitions(lambda chunk: count_occurrences(chunk, candidates))\
    .reduceByKey(lambda a, b: a + b)\
    .filter(lambda x:  x[1] > THRESHOLD)\
    .collect()

frequent_itemsets.sort(key=lambda x: -x[1])

for x in frequent_itemsets[:10]: print(x)
with open("frequent_itemsets.json", "w") as f:
    json.dump(frequent_itemsets, f, indent=2)

print("Number of candidates:", len(candidates))
print("Number of actual frequent itemsets:", len(frequent_itemsets))




(('Larry Fine', 'Moe Howard'), 235)
(('Jack Mercer', 'Mae Questel'), 167)
(('Ali Basha', 'Brahmanandam'), 159)
(('Oliver Hardy', 'Stan Laurel'), 150)
(('Bebe Daniels', 'Harold Lloyd'), 150)
(('Harold Lloyd', "Harry 'Snub' Pollard"), 148)
(('Bebe Daniels', "Harry 'Snub' Pollard"), 148)
(('James Hetfield', 'Lars Ulrich'), 148)
(('Kirk Hammett', 'Lars Ulrich'), 146)
(('James Hetfield', 'Kirk Hammett'), 145)
Number of candidates: 313
Number of actual frequent itemsets: 48


                                                                                