``` bash
spark-submit task2.py <port #> <output_file_path>
```

In [1]:
import findspark
findspark.init()

In [19]:
import sys
import time
import json
import random
import binascii
import platform
from math import inf

from pyspark import SparkConf, SparkContext, StorageLevel
from pyspark.streaming import StreamingContext

In [38]:
LARGE_NUMBER = inf
HOST_NAME = 'localhost'
WINDOW_LENGTH = 30
SLIDING_INTERVAL = 10

RANDOM_SEED = 1208
EQUAL_THRESHOLD = 0.001
NUM_HASH = 10
BITS_LENGTH = 20
NOF_HASH_BUCKETS = 2 ** BITS_LENGTH
NOF_SMALL_GROUPS = 3

In [4]:
port_num = 9999
output_file_path = "./output/output_task2.csv"

In [5]:
class OneDimensionIntKMeans:
    def __init__(self, k_cluster, seed=RANDOM_SEED):
        self.k_cluster = k_cluster
        self.seed=seed
        
    def initialCentroids(self, d, seed=RANDOM_SEED):
        # d: dict - {idx: value, ...}
        random.seed(seed)
        centroids = {}
        distance = {}
        for i in range(self.k_cluster):
            if i == 0:
                first_centroid_tag = random.choice(list(d.keys()))
                centroids[first_centroid_tag] = d[first_centroid_tag]
                for idx in d:
                    distance[idx] = self.computeDistance(d[idx], d[first_centroid_tag])
            else:
                largest_one = [None, -1]
                for k in distance:
                    if distance[k] > largest_one[1]:
                        largest_one = [k, distance[k]]
                next_centroid_tag = largest_one[0]
                centroids[next_centroid_tag] = d[next_centroid_tag]
                for idx in d:
                    new_distance = self.computeDistance(d[idx], d[next_centroid_tag])
                    distance[idx] = min(distance[idx], new_distance)
        self.initial_centroids = centroids
        return centroids
                    
    def computeDistance(self, i, c):
        return abs(i - c)
    
    def fit(self, d):
        old_centroids = None
        new_centroids = self.initialCentroids(d, self.seed)
        go_on = True
        while(go_on):
            clusters = self.clusterPoints(d, new_centroids)
            old_centroids = new_centroids
            new_centroids = self.computeCentroids(d, clusters)
            go_on = self.checkCentroidsChanged(new_centroids, old_centroids, EQUAL_THRESHOLD)
        return(clusters)
            
    def clusterPoints(self, d, centroids):
        clusters = {}
        for idx in d:
            min_distance = [0, LARGE_NUMBER]
            for tag in centroids:
                dis = self.computeDistance(d[idx], centroids[tag])
                if dis < min_distance[1]:
                    min_distance = [tag, dis]
            new_tag = min_distance[0]
            if clusters.get(new_tag) == None:
                clusters[new_tag] = [idx]
            else:
                clusters[new_tag].append(idx)
        return clusters
    
    def computeCentroids(self, d, clusters):
        new_centroids = {}
        for tag in clusters:
            cluster = clusters[tag]
            new_centroid_value = sum([d[idx] for idx in cluster]) / len(cluster)
            new_centroids[tag] = new_centroid_value
        return new_centroids
    
    def checkCentroidsChanged(self, new_centroids, old_centroids, equal_threshold=EQUAL_THRESHOLD):
        if old_centroids == None:
            return True
        if len(old_centroids) != len(new_centroids):
            return True
        sum_all = 0
        for k in new_centroids:
            sum_all += abs(new_centroids[k] - old_centroids[k])
        dis_avg = sum_all / len(new_centroids)
        print(dis_avg)
        if dis_avg > equal_threshold:
            return True
        else:
            return False
        
        

In [6]:
conf = SparkConf() \
    .setAppName("task") \
    .setMaster("local[*]") \
    .set("spark.driver.memory","4g")
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
scc = StreamingContext(sc, 5)

In [8]:
batches = scc.socketTextStream(HOST_NAME, port_num) \
    .window(WINDOW_LENGTH, SLIDING_INTERVAL) \
    .map(json.loads) \
    .map(lambda x: x['city']) \
    .cache()

In [9]:
def convertStrToInt(s):
    return int(binascii.hexlify(s.encode('utf8')), 16)

In [10]:
def generateHashs(m, num_hash, seed=RANDOM_SEED):
    """
    m - the number of the hash buckets
    num_hash - the number of hash functions
    """
    def hashGenerator(i):
        a = a_s[i]
        b = b_s[i]
        def hashfunc(x):
            return (a * x + b) % m
        return hashfunc
    
    ab = set([])
    random.seed(seed)
    while(len(ab) < 2 * num_hash):
        ab.add(random.randint(1, 10 * num_hash))
        
    a_s = []
    for i in range(num_hash):
        a_s.append(ab.pop())
    b_s = list(ab)

    hash_functions = []
    for i in range(num_hash):
        hash_functions.append(hashGenerator(i))
    return hash_functions

In [28]:
def countTrailingZeros(n, bits_length):
    # n - int
    if n == 0:
        return bits_length
    count = 0
    while(n & 1 ^ 1):
        count += 1
        n = n >> 1
    return count

In [31]:
countTrailingZeros(0, 8)

8

In [12]:
hashs = generateHashs(NOF_HASH_BUCKETS, NUM_HASH)

In [14]:
ground_truth = batches.map(lambda x: (x, 1)) \
    .reduceByKey(lambda x, y: 1) \
    .count()

In [15]:
powers = batches.map(lambda x: convertStrToInt(x)) \
    .map(lambda x: [h(x) for h in hashs]) \
    .map(lambda x: [countTrailingZeros(i) for i in x]) \
    .flatMap(lambda x: [(i, [x[i]]) for i in range(len(x))]) \
    .reduceByKey(lambda x, y: x + y) \
    .map(lambda x: max(x[1])) \
    .collect()

AttributeError: 'TransformedDStream' object has no attribute 'collect'

In [None]:
powers_dict = dict([(i, powers[i]) for i in range(len(powers))])

In [None]:
km = OneDimensionIntKMeans(NOF_SMALL_GROUPS)
clusters = km.fit(powers_dict)

In [None]:
avg_groups = []
for k in clusters:
    group = clusters[k]
    group_avg = sum([2**powers_dict[x] for x in group]) / len(group)
    avg_groups.append(group_avg)

In [None]:
def getMedian(l):
    l = sorted(l)
    length_l = len(l)
    if length_l % 2 == 0:
        # even
        return (l[int(length_l/2)-1] + l[(int(length_l/2))]) / 2
    else:
        return (l[int((length_l-1)/2)])

In [None]:
estimation = getMedian(avg_groups)

In [None]:
print('ground_truth:', ground_truth)
print('estimation:', estimation)

In [None]:
scc.start()

In [None]:
scc.awaitTermination()

In [93]:
ones_position = cities.map(lambda x: convertStrToInt(x)) \
    .flatMap(lambda x: [h(x) for h in hashs]) \
    .distinct() \
    .collect()

In [112]:
output = ' '.join(res)

In [113]:
with open(output_file_path, 'w', encoding='utf-8') as fp:
    fp.write(output)

In [84]:
a.sort()

In [67]:
a

['Agincourt',
 'Ahwatukee',
 'Airdrie',
 'Ajax',
 'Alburg',
 'Allegheny',
 'Allison Park',
 'Ambridge',
 'Amherst',
 'Angus',
 'Anjou',
 'Anthem',
 'Argos',
 'Arnold',
 'Ashburn',
 'Aspinwall',
 'Auburn',
 'Auburn Township',
 'Aurora',
 'Avalon',
 'Avon',
 'Avon Lake',
 'Avondale',
 "Baie-D'urfe",
 'Bainbridge',
 'Bainbridge Township',
 'Baldwin',
 'Ballantyne',
 'Balzac',
 'Bay Village',
 'Bayview',
 'Beachwood',
 'Beaconsfield',
 'Beauharnois',
 'Bedford',
 'Bedford HTS',
 'Bedford Heights',
 'Bedford Hts.',
 'Bellagio',
 'Belleville',
 'Bellevue',
 'Bellvue',
 'Belmont',
 'Beloeil',
 'Beltline',
 'Ben Avon',
 'Bentleyville',
 'Berea',
 'Berry',
 'Bethel Park',
 'Black Earth',
 'Blainville',
 'Blakeney',
 'Blawnox',
 'Bloomfield',
 'Blue Diamond',
 'Boisbriand',
 'Bolton',
 'Bond Head',
 'Boston',
 'Boston Heights',
 'Boucherville',
 'Boulder City',
 'Braddock',
 'Bradford',
 'Bradford West Gwillimbury',
 'Bradfordwoods',
 'Brampton',
 'Brampton FKA Bramalea',
 'Bratenahl',
 'Brecksv

In [36]:
r = [0, 0, 6, 7, 1, 9, 0, 3, 0, 7]
r1 = dict([(i, r[i]) for i in range(len(r))])

In [37]:
r1

{0: 0, 1: 0, 2: 6, 3: 7, 4: 1, 5: 9, 6: 0, 7: 3, 8: 0, 9: 7}

In [38]:
km = OneDimensionIntKMeans(NOF_SMALL_GROUPS)
km.fit()

In [39]:
km.fit(r1)

0.4444444444444445
0.0


{0: [0, 1, 4, 6, 7, 8], 2: [2, 3, 9], 5: [5]}

In [61]:
def convertStrToInt(s):
    return int(binascii.hexlify(s.encode('utf8')), 16)

def generateHashs(m, num_hash, seed=RANDOM_SEED):
    """
    m - the number of the hash buckets
    num_hash - the number of hash functions
    """
#     print(m)
    def hashGenerator(i):
        a = a_s[i]
        b = b_s[i]
        def hashfunc(x):
            return (a * x + b) % m
        return hashfunc
    
    ab = set([])
    random.seed(seed)
    while(len(ab) < 2 * num_hash):
        ab.add(random.randint(1, 10*num_hash))
        
    a_s = []
    for i in range(num_hash):
        a_s.append(ab.pop())
    b_s = list(ab)
    print(a_s)
    print(b_s)

    hash_functions = []
    for i in range(num_hash):
        hash_functions.append(hashGenerator(i))
    return hash_functions

def countTrailingZeros(n, bits_length):
    # n - int
    if n == 0:
        return 0
    count = 0
    while(n & 1 ^ 1):
        count += 1
        n = n >> 1
    return count

In [62]:
hashs = generateHashs(NOF_HASH_BUCKETS, NUM_HASH)

[32, 66, 4, 36, 100, 5, 70, 9, 42, 47]
[81, 50, 82, 84, 93, 54, 23, 27, 61, 63]


In [63]:
city1 = "los angeles"

In [64]:
i = convertStrToInt(city1)
i

131090294906814430402340211

In [65]:
hs = hashs[1](i)
hs

141272

In [66]:
bin(hs)

'0b100010011111011000'

In [67]:
countTrailingZeros(hs, BITS_LENGTH)

3