In [37]:
import os
!pip install pyspark
!pip install -U -q PyDrive
!apt install openjdk-8-jdk-headless -qq
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
openjdk-8-jdk-headless is already the newest version (8u352-ga-1~20.04).
0 upgraded, 0 newly installed, 0 to remove and 23 not upgraded.


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [38]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType
from pyspark.sql.functions import col, current_timestamp, to_date, hour, dayofweek\

import numpy as np

from itertools import combinations

In [39]:
def create_new_spark_context(appName):
    return SparkSession.builder.appName(appName)\
        .master("local[*]").getOrCreate()


spark_session = create_new_spark_context("LSH")
sc = spark_session.sparkContext


In [40]:
schema = StructType([
    StructField("DEVICE_CODE", IntegerType(), True),
    StructField("SYSTEM_ID", IntegerType(), True),
    StructField("ORIGINE_CAR_KEY", StringType(), True),
    StructField("FINAL_CAR_KEY", StringType(), True),
    StructField("CHECK_STATUS_KEY", IntegerType(), True),
    StructField("COMPANY_ID", StringType(), True),
    StructField("PASS_DAY_TIME", TimestampType(), True)
])


In [41]:
df = spark_session.read.csv(
    '/content/drive/MyDrive/MDA/HW3/TrafficData.csv', header=True, schema=schema)
df.show(1)


+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|DEVICE_CODE|SYSTEM_ID|ORIGINE_CAR_KEY|FINAL_CAR_KEY|CHECK_STATUS_KEY|COMPANY_ID|      PASS_DAY_TIME|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
|   22010122|      284|       97955760|     64111706|               7|       161|2022-01-10 08:58:02|
+-----------+---------+---------------+-------------+----------------+----------+-------------------+
only showing top 1 row



In [42]:
traffic_rdd = df.rdd.map(lambda x: ((x["FINAL_CAR_KEY"], x["PASS_DAY_TIME"].date()), x["DEVICE_CODE"]))\
                    .groupByKey()\
                    .map(lambda x: (x[0], set(x[1])))


In [43]:
# make a numpy array with size of the number of distinct device codes
device_codes = traffic_rdd.flatMap(lambda x: tuple(x[1])).distinct().collect()
num_device = len(device_codes)
num_device

946

In [44]:
# hash function to map each device code to a number between 0 and num_device
device_index_map = {}
for i in range(num_device):
    device_index_map[device_codes[i]] = i

In [45]:
path_vec = np.zeros(num_device)
indices = np.random.choice(np.arange(num_device), replace=False,
                           size=int(num_device * 0.8))
path_vec[indices] = 1
path = []
for i in range(len(path_vec)):
    if path_vec[i] == 1:
        path.append(device_codes[i])


In [46]:
len(path)

756

In [47]:
def path_similarity(x):
    similarity = 0
    for device_code in x:
        similarity += path_vec[device_index_map[device_code]]
    return similarity / ((len(x) ** 0.5) * len(path) ** 0.5)


most_similar_path = traffic_rdd.map(lambda x: (x[0], path_similarity(x[1]))).sortBy(lambda x: x[-1], ascending=False)
most_similar_path.take(5)

[(('64111706', datetime.date(2022, 1, 10)), 0.8395095799193328),
 (('64111706', datetime.date(2022, 1, 8)), 0.8390337632522099),
 (('64111706', datetime.date(2022, 1, 11)), 0.8378173112911534),
 (('64111706', datetime.date(2022, 1, 12)), 0.8240384548552595),
 (('64111706', datetime.date(2022, 1, 9)), 0.8232996903942893)]

In [48]:
b = 10
r = 15
num_planes = b * r

random_planes = []
for i in range(num_planes):
    random_planes.append(np.random.choice([-1.0, 1.0], size=num_device))
random_planes = np.array(random_planes)
random_planes

array([[-1., -1., -1., ...,  1.,  1., -1.],
       [-1., -1., -1., ..., -1.,  1., -1.],
       [ 1.,  1.,  1., ...,  1., -1.,  1.],
       ...,
       [ 1.,  1., -1., ..., -1., -1., -1.],
       [-1.,  1.,  1., ..., -1.,  1.,  1.],
       [-1., -1., -1., ..., -1.,  1.,  1.]])

In [49]:
def calculate_hash(x):
    items = x
    lst = ""
    for plane in random_planes:
        i = 0
        for item in items:
            i += plane[device_index_map[item]]
        z = 1
        if i < 0:
            z = 0
        lst += str(z)
    return lst

hashed = traffic_rdd.map(lambda x : (x[0], calculate_hash(x[1])))

In [50]:
def hash_vector(x):
    hash_values = []
    for plane in random_planes:
        hash_value = 0
        for code in x:
            hash_value += plane[device_index_map[code]]
        hash_values.append(hash_value)
    sig = "".join(["1" if x > 0 else "0" for x in hash_values])
    return sig


In [51]:
hashed_path = hash_vector(path)

In [52]:
def match_hash(x):
    first = 0
    last = r-1
    for i in range(b):
        if hashed_path[first:last] == x[first:last]:
            return True
        first += r
        last += r
    return False

In [53]:
candidates = hashed.filter(lambda x: match_hash(x[1])).collect()
len(candidates)

5237

In [54]:
unique_candidates = set(map(lambda x: tuple(x[0]), candidates))

In [55]:
# get the most similar path from the candidates
threshold = 0.8

most_similar_path = traffic_rdd.map(lambda x: (x[0], (tuple(x[1]), path_similarity(x[1]))))\
    .filter(lambda x: x[0] in unique_candidates)\
    .filter(lambda x: x[1][1] > threshold)\
    .collect()


In [56]:
len(most_similar_path)

4

In [58]:
for item in most_similar_path:
    print(item[0], item[1][1])

('64111706', datetime.date(2022, 1, 8)) 0.8390337632522099
('64111706', datetime.date(2022, 1, 10)) 0.8395095799193328
('64111706', datetime.date(2022, 1, 13)) 0.8087361135779054
('64111706', datetime.date(2022, 1, 12)) 0.8240384548552595
