In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Exercise 2.1

In [3]:
# Spark Constants
APP_NAME = 'assignment1ex2'
MASTER = 'local[*]'

# Similarity Constants
MIN_HIGH_SIMILARITY = (.9, .85)
MAX_LOW_SIMILARITY = (.05, .6)

# Exercise Input Constants
BANDS_ROWS_MAX_VALUE = 50

In [4]:
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.appName(APP_NAME).master(MASTER).config("spark.driver.memory", "15g").getOrCreate()

23/03/23 21:26:54 WARN Utils: Your hostname, pedro-duarte resolves to a loopback address: 127.0.1.1; using 192.168.1.201 instead (on interface wlp2s0)
23/03/23 21:26:54 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/23 21:26:55 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
# Lambda functions to abstract probability calcs
similar_probability = lambda s, r: s**r
not_similar_in_bands_probability = lambda s, r, b: (1 - similar_probability(s, r))**b
similar_in_bands_probability = lambda s, r, b: 1 - not_similar_in_bands_probability(s, r, b)

In [6]:
bands_rows_valid_pairs = sc.parallelize([(r, b) for r in range(BANDS_ROWS_MAX_VALUE) for b in range(BANDS_ROWS_MAX_VALUE)]) \
    .filter(lambda v: similar_in_bands_probability(MIN_HIGH_SIMILARITY[1], v[0], v[1]) >= MIN_HIGH_SIMILARITY[0]) \
    .filter(lambda v: similar_in_bands_probability(MAX_LOW_SIMILARITY[1], v[0], v[1]) < MAX_LOW_SIMILARITY[0]) \
    .reduceByKey(min) \
    .sortBy(lambda v: v[0]) \
    .cache()

bands_rows_valid_pairs.collect()

                                                                                

[(11, 13),
 (12, 16),
 (13, 18),
 (14, 22),
 (15, 26),
 (16, 30),
 (17, 36),
 (18, 42)]

In [7]:
r, b = bands_rows_valid_pairs.first()
N_FUNCTIONS = r*b

N_FUNCTIONS

143

# Exercise 2.2

In [45]:
# Data Constants
TWEET_ID_COLUMN = 'tweet_id'
URL_COLUMN = 'url'
TEXT_COLUMN = 'text'

# Algorithm Constants
MAX_SHINGLE_SIZE = 5

# Input Constants
INPUT_FILE = 'covid_news_small.json.bz2'

In [46]:
ds = spark.read.json(INPUT_FILE)
ds.schema

                                                                                

StructType([StructField('text', StringType(), True), StructField('tweet_id', StringType(), True), StructField('url', StringType(), True)])

In [47]:
import re

shingles = ds.rdd \
  .map(lambda v: (v[TWEET_ID_COLUMN], v[TEXT_COLUMN].casefold())) \
  .filter(lambda v: len(v[1])) \
  .mapValues(lambda v: [shingle for word in v.split() for shingle in ({word[i:i+MAX_SHINGLE_SIZE] for i in range(len(word) - MAX_SHINGLE_SIZE + 1)} if len(word) > MAX_SHINGLE_SIZE else [word])]) \
  .mapValues(sorted) \
  .mapValues(set) \
  .cache()

shingles.count()

                                                                                

18898

In [48]:
import random

def build_hashes(shingles, functions):
    return [min([((a*shingle + b)%24862048)%len(shingles) for shingle in shingles]) for a, b in functions]

In [49]:
functions = [(random.randint(0, 100), random.randint(0, 100)) for _ in range(N_FUNCTIONS)]

In [50]:
import itertools

def calc_band_hash(row):
    return [((hash(row[1][i*r:(i+1)*r]), i), row[0]) for i in range(b)]

similar_pairs = shingles \
  .mapValues(lambda v: [hash(s) for s in v]) \
  .mapValues(lambda v: build_hashes(v, functions)) \
  .flatMap(lambda v: calc_band_hash((v[0], tuple(v[1])))) \
  .groupByKey() \
  .flatMap(lambda v: itertools.combinations(v[1], 2)) \
  .distinct()
  
similar_pairs.collect()

                                                                                

[('1349048668570189824', '1349364498935775239'),
 ('1349048668570189824', '1349730663352594435'),
 ('1349048668570189824', '1349761964982079491'),
 ('1349048668570189824', '1349784610834563072'),
 ('1349048668570189824', '1346807651351592961'),
 ('1349048668570189824', '1348649790830227456'),
 ('1349048668570189824', '1348821326379810816'),
 ('1349048668570189824', '1348998338453110784'),
 ('1349048668570189824', '1345515387044114436'),
 ('1349048668570189824', '1345346774697824257'),
 ('1349048668570189824', '1348963549352230914'),
 ('1349048668570189824', '1347181981755437056'),
 ('1349048668570189824', '1354372498402533376'),
 ('1349048668570189824', '1354371238051913728'),
 ('1349048668570189824', '1354347333111189507'),
 ('1349048668570189824', '1353704344793788416'),
 ('1349048668570189824', '1352561817105674240'),
 ('1349048668570189824', '1352191881002811393'),
 ('1349048668570189824', '1351847107313938432'),
 ('1349048668570189824', '1354770118052601860'),
 ('13490486685701898

In [51]:
candidate_pairs = similar_pairs.flatMap(lambda v: [v, v[::-1]]).groupByKey().mapValues(set).collectAsMap()
shingles = shingles.collectAsMap()

                                                                                

In [52]:
def find_similar(article_id: str, sim_treshold: float):
    return [pair for pair in candidate_pairs[article_id] if len([None for s in shingles[article_id] if s in shingles[pair]])/len(shingles[article_id]) > sim_treshold]
        
def calc_similar(article_id: str):
    return [len([None for s in shingles[article_id] if s in shingles[pair]])/len(shingles[article_id]) for pair in candidate_pairs[article_id]]
        

In [53]:
find_similar('1349014860248584200', .85)

['1349046528405606404']

In [54]:
calc_similar('1349014860248584200')

[0.05714285714285714,
 0.2938775510204082,
 0.23673469387755103,
 0.12653061224489795,
 0.10612244897959183,
 0.10204081632653061,
 0.3224489795918367,
 1.0,
 0.2653061224489796,
 0.5183673469387755,
 0.17551020408163265,
 0.23673469387755103,
 0.20408163265306123,
 0.23673469387755103,
 0.2,
 0.14285714285714285,
 0.2857142857142857,
 0.2163265306122449,
 0.24081632653061225,
 0.40816326530612246,
 0.3510204081632653,
 0.3020408163265306,
 0.2653061224489796,
 0.053061224489795916,
 0.2897959183673469]

In [55]:
len(candidate_pairs['1349014860248584200'])

25

# Exercise 2.3