``` bash
spark-submit task1.py <first_json_path> <second_json_path> <output_file_path>
```

In [1]:
import findspark
findspark.init()

In [2]:
import sys
import time
import json
# import os
import random
# from math import sqrt
import binascii

from pyspark import SparkConf, SparkContext, StorageLevel

In [87]:
RANDOM_SEED = 1208
NUM_HASH = 2
BIT_ARRAY_LENGTH = 200

In [3]:
first_json_path = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/homework/hw6/asnlib/publicdata/business_first.json"
second_json_path = "file:///Users/markduan/duan/USC_course/USC_APDS/INF553/homework/hw6/asnlib/publicdata/business_second.json"
output_file_path = "./output/output_task1.csv"

In [4]:
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")

In [89]:
cities = sc.textFile(first_json_path) \
    .map(json.loads) \
    .map(lambda x: x['city']) \
    .filter(lambda x: x != '') \
    .distinct()

In [114]:
len(cities.collect())

860

In [5]:
cities2 = sc.textFile(second_json_path) \
    .map(json.loads) \
    .map(lambda x: x['city']) \
    .filter(lambda x: x != '') \
    .distinct()

In [6]:
len(cities2.collect())

805

In [90]:
def convertStrToInt(s):
    return int(binascii.hexlify(s.encode('utf8')), 16)

In [91]:
def generateHashs(m, num_hash, seed=RANDOM_SEED):
    """
    m - the length of the filter bit array
    num_hash - the number of hash functions
    """
    def hashGenerator(i):
        a = a_s[i]
        b = b_s[i]
        def hashfunc(x):
            return (a * x + b) % m
        return hashfunc
    
    ab = set([])
    random.seed(seed)
    while(len(ab) < 2 * num_hash):
        ab.add(random.randint(1, 10 * num_hash))
        
    a_s = []
    for i in range(num_hash):
        a_s.append(ab.pop())
    b_s = list(ab)

    hash_functions = []
    for i in range(num_hash):
        hash_functions.append(hashGenerator(i))
    return hash_functions

In [92]:
hashs = generateHashs(BIT_ARRAY_LENGTH, NUM_HASH)

In [93]:
ones_position = cities.map(lambda x: convertStrToInt(x)) \
    .flatMap(lambda x: [h(x) for h in hashs]) \
    .distinct() \
    .collect()

In [94]:
class Bitmap():
    def __init__(self, list_size):
        """
        list_size - size of a list. the list consists of either 1 or 0
        """
        self.size = list_size // 31 + 1
        self.array = [0] * self.size
    
    def initialize(self, l):
        for x in l:
            self.setValue(x, 1)
    
    def getValue(self, n):
        """
        n - position number, from 0
        """
        ai, bp = self.getPosition(n)
        return self.array[ai] >> bp & 1

    def setValue(self, n, v):
        """
        n - position number, from 0
        v - value, 1 or 0
        """
        ai, bp = self.getPosition(n)
        if v == 1:
            # set the bit to one
            self.array[ai] = self.array[ai] | (1 << bp)
        elif v == 0:
            # set the bit to zero
            self.array[ai] = self.array[ai] & (~(1 << bp))
        else:
            print("wrong v value.")

    def getPosition(self, n):
        """
        n - position number, from 0
        """
        array_index = n // 31
        bit_position = n % 31
        return(array_index, bit_position)

In [98]:
bitarray = Bitmap(BIT_ARRAY_LENGTH)

In [99]:
bitarray.initialize(ones_position)

In [109]:
def checkCity(city, bitarray, hashs):
    """
    bitarray -  bloom filter bit array
    """
    if city == '':
        return '0'
    int_city = convertStrToInt(city)
    positions = [h(int_city) for h in hashs]
    for p in positions:
        if bitarray.getValue(p) != 1:
            return '0'
    return '1'

In [110]:
res = sc.textFile(second_json_path) \
    .map(json.loads) \
    .map(lambda x: x['city']) \
    .map(lambda x: checkCity(x, bitarray, hashs)) \
    .collect()

In [112]:
output = ' '.join(res)

In [113]:
with open(output_file_path, 'w', encoding='utf-8') as fp:
    fp.write(output)

In [84]:
a.sort()

In [67]:
a

['Agincourt',
 'Ahwatukee',
 'Airdrie',
 'Ajax',
 'Alburg',
 'Allegheny',
 'Allison Park',
 'Ambridge',
 'Amherst',
 'Angus',
 'Anjou',
 'Anthem',
 'Argos',
 'Arnold',
 'Ashburn',
 'Aspinwall',
 'Auburn',
 'Auburn Township',
 'Aurora',
 'Avalon',
 'Avon',
 'Avon Lake',
 'Avondale',
 "Baie-D'urfe",
 'Bainbridge',
 'Bainbridge Township',
 'Baldwin',
 'Ballantyne',
 'Balzac',
 'Bay Village',
 'Bayview',
 'Beachwood',
 'Beaconsfield',
 'Beauharnois',
 'Bedford',
 'Bedford HTS',
 'Bedford Heights',
 'Bedford Hts.',
 'Bellagio',
 'Belleville',
 'Bellevue',
 'Bellvue',
 'Belmont',
 'Beloeil',
 'Beltline',
 'Ben Avon',
 'Bentleyville',
 'Berea',
 'Berry',
 'Bethel Park',
 'Black Earth',
 'Blainville',
 'Blakeney',
 'Blawnox',
 'Bloomfield',
 'Blue Diamond',
 'Boisbriand',
 'Bolton',
 'Bond Head',
 'Boston',
 'Boston Heights',
 'Boucherville',
 'Boulder City',
 'Braddock',
 'Bradford',
 'Bradford West Gwillimbury',
 'Bradfordwoods',
 'Brampton',
 'Brampton FKA Bramalea',
 'Bratenahl',
 'Brecksv

In [62]:
convertStrToInt('')

ValueError: invalid literal for int() with base 16: b''