# Exercise 2

## Imports

In [37]:
import os.path
import random
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.types import StringType, ArrayType, IntegerType
from itertools import combinations, chain
from functools import partial
from typing import Iterable, Any, List, Callable

In [38]:
# Shingle size
k = 9

# Min-hash: number of hash functions
num_functions = 100

# Seed for the random number generator
seed = 123

In [39]:
random.seed(seed)

## Spark Initialization

In [40]:
spark = SparkSession.builder \
    .appName('SandboxAssign2') \
    .config('spark.master', 'local[*]') \
    .getOrCreate()

## Prepare the Data

In [41]:
df = spark.read \
    .option('header', True) \
    .json('./data/covid_news_small.json.bz2')

                                                                                

## Pipeline

### Generate shingles

In [42]:
@F.udf(returnType=ArrayType(IntegerType(), False))
def generate_shingles(text: str):
    shingles = (text[idx:idx+k] for idx in range(len(text) - k + 1))
    # Get last 32 bits in order to have 4-byte integers (Python allows arbitrarily large integers)
    to_integer = lambda s: hash(s) & ((1 << 32) - 1)
    return sorted(set(to_integer(shingle_str) for shingle_str in shingles))

In [43]:
df_shingles = df \
    .drop('url') \
    .withColumn('text', generate_shingles('text'))

### Min-hash

In [44]:
def generate_universal_hash_family(K: int) -> List[Callable[[int], int]]:
    
    # TODO: mod N (what is N? the number of possible shingles or the number of effective shingles present in the data?)
    N = 1 << 32

    p = 2305843009213693951

    parameters = set()
    while (len(parameters) < K):
        parameters |= {(random.randint(1, N), random.randint(0, N)) for _ in range(K - len(parameters))}
    
    return [partial(lambda x, a, b, p, N: ((a * x + b) % p) % N, a=a, b=b, p=p, N=N) for a, b in parameters]

In [45]:
@F.udf(returnType=ArrayType(IntegerType(), False))
def min_hash(shingles: List[int]):
    return [min(h(shingle) for shingle in shingles) for h in generate_universal_hash_family(num_functions)]


In [46]:
df_shingles.withColumn('min_hash', min_hash('text')).show()

[Stage 2:>                                                          (0 + 1) / 1]

+--------------------+-------------------+--------------------+
|                text|           tweet_id|            min_hash|
+--------------------+-------------------+--------------------+
|[274676170, 98138...|1349048668570189824|[836953030, 21302...|
|[179130, 9300061,...|1349046528405606404|[3527590, 1253384...|
|[4082978, 7968454...|1349045380101648384|[8485900, 994263,...|
|[363914, 2392224,...|1349683509351370752|[1315065, 1046285...|
|[2385118, 7039982...|1349678975753342984|[1880380, 5745177...|
|[919142, 3143704,...|1349675124472115201|[1406691, 1113491...|
|[363914, 2392224,...|1349669108229533696|[3318244, 1275084...|
|[410480, 1547329,...|1349664821717983232|[38089, 2758718, ...|
|[3239677, 1329399...|1349656187357323265|[3091937, 1148837...|
|[363914, 2392224,...|1349654208337862656|[734012, 325553, ...|
|[2385118, 2593209...|1349654072882814977|[282442, 935, 142...|
|[648273, 668292, ...|1349652813731737603|[400199, 1338472,...|
|[6020, 179130, 51...|134965214173297868

                                                                                

### LSH