# Exercise 2

## Imports

In [35]:
import os.path
import random
import pyspark.sql.functions as F
from pyspark.sql import SparkSession, Row
from pyspark.sql.types import StringType, ArrayType, IntegerType
from itertools import combinations, chain
from functools import partial
from typing import Iterable, Any, List, Callable

In [3]:
# Shingle size
k = 9

# Min-hash: number of hash functions
num_functions = 100

# Seed for the random number generator
seed = 123

In [4]:
random.seed(seed)

## Spark Initialization

In [5]:
spark = SparkSession.builder \
    .appName('LSH') \
    .config('spark.master', 'local[*]') \
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/16 11:26:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Prepare the Data

In [6]:
# TODO: Configure partitions for speedup?
df = spark.read \
    .option('header', True) \
    .json('./data/covid_news_small.json.bz2')

                                                                                

## Pipeline

### Generate shingles

In [7]:
@F.udf(returnType=ArrayType(IntegerType(), False))
def generate_shingles(text: str):
    shingles = (text[idx:idx+k] for idx in range(len(text) - k + 1))
    # Get last 32 bits in order to have 4-byte integers (Python allows arbitrarily large integers)
    to_integer = lambda s: hash(s) & ((1 << 32) - 1)
    return sorted(set(to_integer(shingle_str) for shingle_str in shingles))

In [8]:
df_shingles = df \
    .drop('url') \
    .withColumn('text', generate_shingles('text'))

### Min-hash

In [9]:
# Assumes the values to hash are 4-byte integers
def generate_universal_hash_family(K: int) -> List[Callable[[int], int]]:
    N = 1 << 32
    p = 2305843009213693951

    parameters = set()
    while (len(parameters) < K):
        parameters |= {(random.randint(1, N), random.randint(0, N)) for _ in range(K - len(parameters))}
    
    return [partial(lambda x, a, b, p, N: ((a * x + b) % p) % N, a=a, b=b, p=p, N=N) for a, b in parameters]

In [10]:
@F.udf(returnType=ArrayType(IntegerType(), False))
def min_hash(shingles: List[int]):
    return [min(h(shingle) for shingle in shingles) for h in generate_universal_hash_family(num_functions)]

In [15]:
df_minhash = df_shingles.withColumn('min_hash', min_hash('text'))

### LSH

In [14]:
similarity_threshold = 0.8

b = 10
r = num_functions // b

assert num_functions % b == 0, 'The number of rows of the signature matrix has to be divisible by the number of bands!'

In [16]:
lst = [1,2,3,4,5,6,7,8,9,10]

In [20]:
[lst[x:x+2] for x in range(0, len(lst), 2)]

[[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]]

In [22]:
@F.udf(returnType=ArrayType(ArrayType(IntegerType(), False), False))
def generate_even_slices(minhashes: List[int]):
    return [minhashes[i:i+b] for i in range(0, num_functions, b)]

In [33]:
df_lsh = df_minhash \
    .withColumn('min_hash', generate_even_slices('min_hash')) \
    .select('tweet_id', *(F.hash(F.col('min_hash')[i]).alias(f'band_{i}') for i in range(b)))

In [38]:
df_lsh.foreach(accumulate_buckets)

[Stage 11:>                                                         (0 + 2) / 2]

23/03/16 12:05:55 ERROR PythonRunner: Python worker exited unexpectedly (crashed)
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/worker.py", line 666, in main
    eval_type = read_int(infile)
  File "/home/pedro/Desktop/MEI-4ano/2semestre/MDLE/spark-3.3.2-bin-hadoop3/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:552)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:758)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:740)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:505)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(It

[Stage 11:>                                                         (0 + 1) / 2]

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "/tmp/ipykernel_3175/1514246737.py", line 3, in min_hash
  File "/tmp/ipykernel_3175/1514246737.py", line 3, in <listcomp>
ValueError: min() arg is an empty sequence


23/03/16 12:05:59 WARN PythonUDFRunner: Incomplete task 0.0 in stage 11 (TID 14) interrupted: Attempting to kill Python Worker
23/03/16 12:05:59 WARN TaskSetManager: Lost task 0.0 in stage 11.0 (TID 14) (localhost executor driver): TaskKilled (Stage cancelled)
