In [1]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Exercise 2.1

In [2]:
# Spark Constants
APP_NAME = 'assignment1ex2'
MASTER = 'local[*]'

# Similarity Constants
MIN_HIGH_SIMILARITY = (.9, .85)
MAX_LOW_SIMILARITY = (.05, .6)

# Exercise Input Constants
BANDS_ROWS_MAX_VALUE = 50

In [3]:
conf = SparkConf().setAppName(APP_NAME).setMaster(MASTER)
sc = SparkContext.getOrCreate(conf=conf)

spark = SparkSession.builder.appName(APP_NAME).master(MASTER).config("spark.driver.memory", "15g").getOrCreate()

23/03/24 01:49:42 WARN Utils: Your hostname, pedro-duarte resolves to a loopback address: 127.0.1.1; using 192.168.1.201 instead (on interface wlp2s0)
23/03/24 01:49:42 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/24 01:49:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Lambda functions to abstract probability calcs
similar_probability = lambda s, r: s**r
not_similar_in_bands_probability = lambda s, r, b: (1 - similar_probability(s, r))**b
similar_in_bands_probability = lambda s, r, b: 1 - not_similar_in_bands_probability(s, r, b)

In [5]:
bands_rows_valid_pairs = sc.parallelize([(r, b) for r in range(BANDS_ROWS_MAX_VALUE) for b in range(BANDS_ROWS_MAX_VALUE)]) \
    .filter(lambda v: similar_in_bands_probability(MIN_HIGH_SIMILARITY[1], v[0], v[1]) >= MIN_HIGH_SIMILARITY[0]) \
    .filter(lambda v: similar_in_bands_probability(MAX_LOW_SIMILARITY[1], v[0], v[1]) < MAX_LOW_SIMILARITY[0]) \
    .reduceByKey(min) \
    .sortBy(lambda v: v[0]) \
    .cache()

bands_rows_valid_pairs.collect()

                                                                                

[(11, 13),
 (12, 16),
 (13, 18),
 (14, 22),
 (15, 26),
 (16, 30),
 (17, 36),
 (18, 42)]

In [6]:
r, b = (13, 18)
N_FUNCTIONS = r*b

N_FUNCTIONS

234

# Exercise 2.2

In [7]:
# Data Constants
TWEET_ID_COLUMN = 'tweet_id'
URL_COLUMN = 'url'
TEXT_COLUMN = 'text'

# Algorithm Constants
MAX_SHINGLE_SIZE = 5

# Input Constants
INPUT_FILE = 'covid_news_small.json.bz2'

In [8]:
ds = spark.read.json(INPUT_FILE)
ds.schema

                                                                                

StructType([StructField('text', StringType(), True), StructField('tweet_id', StringType(), True), StructField('url', StringType(), True)])

In [9]:
import re

shingles = ds.rdd \
  .map(lambda v: (v[TWEET_ID_COLUMN], v[TEXT_COLUMN].casefold())) \
  .filter(lambda v: len(v[1])) \
  .mapValues(lambda v: [shingle for word in v.split() for shingle in ({word[i:i+MAX_SHINGLE_SIZE] for i in range(len(word) - MAX_SHINGLE_SIZE + 1)} if len(word) > MAX_SHINGLE_SIZE else [word])]) \
  .mapValues(sorted) \
  .mapValues(set) \
  .cache()

shingles.count()

                                                                                

18898

In [10]:
import random

def build_hashes(shingles, functions):
    return [min([((a*shingle + b)%24862048)%len(shingles) for shingle in shingles]) for a, b in functions]

In [11]:
functions = [(random.randint(0, 100), random.randint(0, 100)) for _ in range(N_FUNCTIONS)]

In [12]:
import itertools

def calc_band_hash(row):
    return [((hash(row[1][i*r:(i+1)*r]), i), row[0]) for i in range(b)]

similar_pairs = shingles \
  .mapValues(lambda v: [hash(s) for s in v]) \
  .mapValues(lambda v: build_hashes(v, functions)) \
  .flatMap(lambda v: calc_band_hash((v[0], tuple(v[1])))) \
  .groupByKey() \
  .flatMap(lambda v: itertools.combinations(v[1], 2)) \
  .distinct()
  
similar_pairs.collect()

                                                                                

[('1349048668570189824', '1349665231132389377'),
 ('1349048668570189824', '1349341852617601025'),
 ('1349048668570189824', '1349364498935775239'),
 ('1349048668570189824', '1349398073085190151'),
 ('1349048668570189824', '1349730663352594435'),
 ('1349048668570189824', '1349728145788731395'),
 ('1349048668570189824', '1349690395236704262'),
 ('1349048668570189824', '1349695433459822592'),
 ('1349048668570189824', '1349331788833943552'),
 ('1349048668570189824', '1349263835472023552'),
 ('1349048668570189824', '1348031202389848069'),
 ('1349048668570189824', '1348035481141977089'),
 ('1349048668570189824', '1346806396118052864'),
 ('1349048668570189824', '1346807651351592961'),
 ('1349048668570189824', '1346546291145404417'),
 ('1349048668570189824', '1346244700546740225'),
 ('1349048668570189824', '1346718311812374528'),
 ('1349048668570189824', '1348328925215260674'),
 ('1349048668570189824', '1348323890859307013'),
 ('1349048668570189824', '1346192324548849664'),
 ('13490486685701898

In [13]:
candidate_pairs = similar_pairs.flatMap(lambda v: [v, v[::-1]]).groupByKey().mapValues(set).collectAsMap()
shingles = shingles.collectAsMap()

                                                                                

In [14]:
def find_similar(article_id: str, sim_treshold: float):
    return [pair for pair in candidate_pairs[article_id] if len([None for s in shingles[article_id] if s in shingles[pair]])/len(shingles[article_id]) > sim_treshold]
        
def calc_similar(article_id: str):
    return [len([None for s in shingles[article_id] if s in shingles[pair]])/len(shingles[article_id]) for pair in candidate_pairs[article_id]]
        

In [15]:
find_similar('1346893198283694085', .85)

[]

In [16]:
calc_similar('1346893198283694085')

[0.15861344537815125,
 0.08928571428571429,
 0.19852941176470587,
 0.08823529411764706,
 0.09558823529411764,
 0.08718487394957983,
 0.2668067226890756,
 0.2668067226890756,
 0.0976890756302521,
 0.14915966386554622,
 0.27205882352941174,
 0.2773109243697479,
 0.18382352941176472,
 0.11134453781512606,
 0.0661764705882353,
 0.34243697478991597,
 0.20903361344537816,
 0.16071428571428573,
 0.15231092436974789,
 0.13550420168067226,
 0.11134453781512606,
 0.14180672268907563,
 0.05567226890756303,
 0.27941176470588236,
 0.16071428571428573,
 0.11554621848739496,
 0.007352941176470588,
 0.07563025210084033,
 0.26785714285714285,
 0.14705882352941177,
 0.12184873949579832,
 0.15966386554621848,
 0.014705882352941176,
 0.025210084033613446,
 0.14600840336134455,
 0.10714285714285714,
 0.3277310924369748,
 0.08928571428571429,
 0.09663865546218488,
 0.12394957983193278,
 0.046218487394957986,
 0.3014705882352941,
 0.0934873949579832,
 0.14705882352941177,
 0.22268907563025211,
 0.10189075630

In [17]:
len(candidate_pairs['1348954786692005888'])

38

# Exercise 2.3

In [22]:
subset = sc.parallelize(list(shingles.keys())[:100])

In [23]:
false_positive_rate = subset \
  .filter(lambda v: v in candidate_pairs.keys()) \
  .map(lambda k: len([None for v in candidate_pairs[k] if v not in find_similar(k, .85)]) / len(candidate_pairs[k])) \
  .reduce(lambda v1, v2: v1 + v2) / len(shingles.keys()) * 100

false_positive_rate



In [None]:
similarity = lambda v1, v2: len([v for v in v1 if v in v2])/len(v1)

In [None]:
false_negative_percentages = subset.cartesian(subset) \
  .filter(lambda v: v[0] > v[1]) \
  .filter(lambda v: similarity(shingles[v[0]], shingles[v[1]]) > .85) \
  .flatMap(lambda v: [v, v[::-1]]) \
  .groupByKey() \
  .map(lambda v: (v[0], len([None for p in v[1] if p not in candidate_pairs[v[0]]]) / len(v[1]) if v[0] in candidate_pairs else 1)) \
  .groupByKey() \
  .mapValues(lambda v: sum(v)/len(v)) \
  .map(lambda v: v[1]) \
  .collect()

false_negative_rate = sum(false_negative_percentages)/len(false_negative_percentages)
false_negative_rate

                                                                                

0.8