In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Exercise 2.1

In [2]:
# Spark Constants
# Application name and master URL handed to SparkConf / SparkSession below.
APP_NAME = 'assignment1ex2'
MASTER = 'local[*]'

# Similarity Constants
# Both tuples are laid out as (probability bound, similarity), matching how
# they are indexed when the (rows, bands) grid is filtered:
#   - pairs with similarity MIN_HIGH_SIMILARITY[1] must become candidates
#     with probability >= MIN_HIGH_SIMILARITY[0];
#   - pairs with similarity MAX_LOW_SIMILARITY[1] must become candidates
#     with probability < MAX_LOW_SIMILARITY[0].
MIN_HIGH_SIMILARITY = (.9, .85)
MAX_LOW_SIMILARITY = (.05, .6)

# Exercise Input Constants
# Passed to range() when enumerating candidate row and band counts, so it acts
# as an exclusive upper bound.
BANDS_ROWS_MAX_VALUE = 50

In [3]:
# Driver memory must be part of the configuration used to *start* the context:
# once SparkContext.getOrCreate() has launched the driver JVM, setting
# "spark.driver.memory" on the SparkSession builder no longer has any effect.
# NOTE: if a context already exists in this kernel, getOrCreate() returns it
# unchanged -- restart the kernel for a new memory setting to apply.
conf = (
    SparkConf()
    .setAppName(APP_NAME)
    .setMaster(MASTER)
    .set("spark.driver.memory", "15g")
)
sc = SparkContext.getOrCreate(conf=conf)

# The session wraps the context created above and shares its configuration.
spark = SparkSession.builder.config(conf=conf).getOrCreate()

23/03/23 16:14:59 WARN Utils: Your hostname, pedro-duarte resolves to a loopback address: 127.0.1.1; using 192.168.49.131 instead (on interface wlp2s0)
23/03/23 16:14:59 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 16:15:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Probability helpers for the banding analysis
def similar_probability(s, r):
    """Return s raised to the power r.

    With s a pair similarity and r the rows per band, this is the chance that
    the pair agrees on every row of one band.
    """
    return pow(s, r)


def not_similar_in_bands_probability(s, r, b):
    """Return the chance that the pair disagrees in each of the b bands."""
    single_band_miss = 1 - similar_probability(s, r)
    return single_band_miss**b


def similar_in_bands_probability(s, r, b):
    """Return the chance that the pair agrees in at least one of the b bands."""
    all_bands_miss = not_similar_in_bands_probability(s, r, b)
    return 1 - all_bands_miss

In [5]:
# Search the (rows, bands) grid for configurations that meet both requirements:
#   1. highly similar pairs are caught with at least the required probability;
#   2. dissimilar pairs are caught with less than the allowed probability.
# The grid starts at 1: zero rows or zero bands is not a usable configuration
# and can never pass both filters (0 bands always gives probability 0, 0 rows
# always gives probability 1), so the result is unchanged by skipping them.
# BANDS_ROWS_MAX_VALUE stays an exclusive bound, as before.
bands_rows_valid_pairs = (
    sc.parallelize([
        (rows, bands)
        for rows in range(1, BANDS_ROWS_MAX_VALUE)
        for bands in range(1, BANDS_ROWS_MAX_VALUE)
    ])
    .filter(lambda v: similar_in_bands_probability(MIN_HIGH_SIMILARITY[1], v[0], v[1]) >= MIN_HIGH_SIMILARITY[0])
    .filter(lambda v: similar_in_bands_probability(MAX_LOW_SIMILARITY[1], v[0], v[1]) < MAX_LOW_SIMILARITY[0])
    # Keep only the smallest band count that works for each row count.
    .reduceByKey(min)
    .sortBy(lambda v: v[0])
    .cache()
)

bands_rows_valid_pairs.collect()

                                                                                

[(11, 13),
 (12, 16),
 (13, 18),
 (14, 22),
 (15, 26),
 (16, 30),
 (17, 36),
 (18, 42)]

In [6]:
# The RDD is sorted by row count, so first() yields the valid configuration
# with the fewest rows per band. `r` and `b` are kept as module-level names
# because calc_band_hash reads them as globals.
r, b = bands_rows_valid_pairs.first()
# One hash function per signature row: r rows in each of the b bands.
N_FUNCTIONS = r*b

N_FUNCTIONS

143

# Exercise 2.2

In [7]:
# Data Constants
TWEET_ID_COLUMN = 'tweet_id'
URL_COLUMN = 'url'
TEXT_COLUMN = 'text'

# Algorithm Constants
SHINGLE_SIZE = 4
MINHASH_LENGTH = 5

# Input Constants
INPUT_FILE = 'covid_news_small.json.bz2'

In [8]:
# Load the dataset into a DataFrame and display the schema of the records.
ds = spark.read.json(INPUT_FILE)
ds.schema

[Stage 10:>                                                         (0 + 2) / 2]

StructType([StructField('text', StringType(), True), StructField('tweet_id', StringType(), True), StructField('url', StringType(), True)])


                                                                                

In [9]:
# Build, per tweet, the set of character shingles of its case-folded text.
#   - `text` is nullable in the schema, so a missing value is treated as ''.
#   - Texts shorter than SHINGLE_SIZE cannot produce a single shingle; they are
#     dropped here because an empty shingle set has no min-hash signature.
#     (This may lower the count compared with filtering on non-empty text only.)
shingles = (
    ds.rdd
    .map(lambda v: (v[TWEET_ID_COLUMN], (v[TEXT_COLUMN] or '').casefold()))
    .filter(lambda v: len(v[1]) >= SHINGLE_SIZE)
    .mapValues(lambda v: {v[i:i+SHINGLE_SIZE] for i in range(len(v) - SHINGLE_SIZE + 1)})
    .cache()
)

shingles.count()

                                                                                

18898

In [10]:
import random

def build_hashes(shingles, functions, modulus=(1 << 61) - 1):
    """Compute the min-hash signature of a document.

    Each hash function is the linear map ``x -> (a*x + b) % modulus``; the
    signature entry for that function is the minimum over all shingles.

    The values are deliberately NOT reduced modulo ``len(shingles)``: doing so
    squeezes every hash into ``[0, n)``, which makes the minimum almost always
    0 and ties the signature to the document size rather than its content.

    Args:
        shingles: iterable (re-iterable) of integer shingle hashes.
        functions: iterable of ``(a, b)`` integer coefficient pairs.
        modulus: modulus of the linear hash; defaults to the Mersenne prime
            2**61 - 1.

    Returns:
        A list with one integer per entry of ``functions``.

    Raises:
        ValueError: if ``shingles`` is empty and ``functions`` is not.
    """
    return [
        min((a * shingle + b) % modulus for shingle in shingles)
        for a, b in functions
    ]

In [11]:
# Coefficients (a, b) of the linear hash functions used for min-hashing.
#   - A dedicated, seeded generator makes the signatures reproducible across
#     kernel restarts.
#   - `a` starts at 1: a == 0 would map every shingle to the same value.
#   - A wide coefficient range makes duplicate hash functions unlikely.
HASH_SEED = 42
HASH_COEFFICIENT_MAX = 2**31 - 1
_hash_rng = random.Random(HASH_SEED)
functions = [
    (_hash_rng.randint(1, HASH_COEFFICIENT_MAX), _hash_rng.randint(0, HASH_COEFFICIENT_MAX))
    for _ in range(N_FUNCTIONS)
]

In [12]:
import itertools

def calc_band_hash(row, rows=None, bands=None):
    """Split a signature into bands and emit one bucket key per band.

    Args:
        row: pair ``(doc_id, signature)`` where ``signature`` is a tuple (it
            must be sliceable and its slices hashable).
        rows: number of signature entries per band; defaults to the global
            ``r``.
        bands: number of bands; defaults to the global ``b``.

    Returns:
        A list of ``((band_hash, band_index), doc_id)`` entries, one per band.
        The band index is part of the key so that equal slices in different
        bands do not land in the same bucket.
    """
    # Fall back to the notebook-level configuration when not given explicitly.
    rows = r if rows is None else rows
    bands = b if bands is None else bands

    doc_id, signature = row[0], row[1]
    return [
        ((hash(signature[i * rows:(i + 1) * rows]), i), doc_id)
        for i in range(bands)
    ]

# Candidate pairs: documents whose signatures agree on at least one whole band.
# NOTE(review): hash() of a str is randomised per Python process unless
# PYTHONHASHSEED is fixed; presumably PySpark pins it for its workers -- verify
# before running on a real cluster.
similar_pairs = (
    shingles
    .mapValues(lambda v: [hash(s) for s in v])
    .mapValues(lambda v: build_hashes(v, functions))
    .flatMap(lambda v: calc_band_hash((v[0], tuple(v[1]))))
    .groupByKey()
    # Sort the ids of each bucket so every pair is emitted in one canonical
    # order; otherwise (a, b) and (b, a) coming from different bands would both
    # survive distinct(). set() also drops self-pairs from repeated ids.
    .flatMap(lambda v: itertools.combinations(sorted(set(v[1])), 2))
    .distinct()
    # Reused by the following cell, so keep it materialised.
    .cache()
)

# Report only the number of candidate pairs; collecting and printing all of
# them can overwhelm the driver and the notebook output.
similar_pairs.count()

[Stage 12:>                                                         (0 + 2) / 2]

In [None]:
# Driver-side lookup tables.
# candidate_pairs: id -> set of ids it was paired with; each pair is emitted in
# both directions so the lookup works from either side.
candidate_pairs = (
    similar_pairs
    .flatMap(lambda pair: [pair, pair[::-1]])
    .groupByKey()
    .mapValues(set)
    .collectAsMap()
)
# doc_shingles: id -> shingle set of that document.
doc_shingles = shingles.collectAsMap()

ConnectionRefusedError: [Errno 111] Connection refused

In [None]:
def find_similar(article_id: str, candidates=None, shingle_sets=None, min_similarity=None):
    """Return the ids of the articles that are similar to ``article_id``.

    Only the LSH candidates of the article are examined; each one is verified
    with the exact Jaccard similarity of the two shingle sets.

    Args:
        article_id: id of the article to look up.
        candidates: mapping id -> set of candidate ids; defaults to the
            notebook-level ``candidate_pairs``.
        shingle_sets: mapping id -> set of shingles; defaults to the
            notebook-level ``doc_shingles``.
        min_similarity: minimum Jaccard similarity to report; defaults to
            ``MIN_HIGH_SIMILARITY[1]``.

    Returns:
        A sorted list of ids whose similarity is at least ``min_similarity``.
        Empty when the article is unknown or has no candidates.
    """
    # Fall back to the notebook-level lookup tables and threshold.
    if candidates is None:
        candidates = candidate_pairs
    if shingle_sets is None:
        shingle_sets = doc_shingles
    if min_similarity is None:
        min_similarity = MIN_HIGH_SIMILARITY[1]

    target = shingle_sets.get(article_id)
    if not target:
        return []

    similar = []
    for other_id in candidates.get(article_id, ()):
        other = shingle_sets.get(other_id)
        if not other:
            continue
        # Both sets are non-empty here, so the union is never empty.
        jaccard = len(target & other) / len(target | other)
        if jaccard >= min_similarity:
            similar.append(other_id)
    return sorted(similar)

# Exercise 2.3