- **Students' Names:** Elham Jangravi, Parisa Khanjani Malayeri, Ahmad Bahmani, Zavash Ghorbanpourbaboli
- **Course:** CS550 Big Data Management and Analytics


In [None]:
import pprint
!pip install pyspark
from pyspark.sql import SparkSession
import random
import numpy as np
import math
import hashlib
import csv

# Local Spark session that uses every available core on this machine.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)

# Raw CSV lines for both datasets, each requested with at least 32 partitions.
hospital_data_trial = spark.sparkContext.textFile(
    './trial_COVID-19_Hospital_Impact.csv', minPartitions=32
)
hospital_data_test = spark.sparkContext.textFile(
    './test_COVID-19_Hospital_Impact.csv', minPartitions=32
)

# Tasks
## A. Extract binary features
- Extract binary features (i.e., a sparse representation of the characteristic matrix) per hospital.



**Checkpoint A:**
We employ the combination of `csvreader` and `mapPartitions` for effectively transforming each data row into an array format. This process involves storing the header in a broadcast variable, which facilitates the mapping of each row to its corresponding header name. Additionally, we utilize a set union within a `reduceByKey` function. This method is critical for amalgamating various rows associated with a singular hospital pk, resulting in a consolidated RDD record.


In [None]:
# Task A (trial data): build one set of 'column:value' strings per hospital.

# Parse with csv.reader instead of str.split(',') so that quoted fields
# containing commas stay in one column and line up with the header.
hospital_data_trial = hospital_data_trial.mapPartitions(lambda lines: csv.reader(lines))

# bc.value is a one-element list holding the parsed header row.
bc = spark.sparkContext.broadcast(hospital_data_trial.take(1))

# Drop blank rows and the header itself. The comparison must be against
# bc.value[0] (the header row); comparing with bc.value (a list containing the
# header) is always unequal and leaves the header in the data.
# The broadcast is bound as a default argument so these lazily evaluated
# closures keep using this header even if the global `bc` is rebound later.
hospital_data_trial = hospital_data_trial.filter(
    lambda row, header_bc=bc: bool(row) and row != header_bc.value[0]
)

# Key every row by its first column and merge all rows of one hospital by
# taking the union of their feature sets.
hospital_data_trial = hospital_data_trial.mapPartitions(
    lambda rows, header_bc=bc: (
        (row[0], {f'{name}:{value}' for value, name in zip(row, header_bc.value[0])})
        for row in rows
    )
).reduceByKey(lambda x, y: x.union(y))
length = hospital_data_trial.count()

Checkpoint_A = ['150034', '050739', '330231', '241326', '070008']
pks = hospital_data_trial.filter(lambda x: x[0] in Checkpoint_A)

print('Start_____Task A_____Trail_____Data_____')
task_a_Print = pprint.PrettyPrinter(width=150, compact=True, depth = 3)
# Use the printer that was just created (the previous name had different casing).
task_a_Print.pprint(pks.collect())
print('End_____Task A_____Trail_____Data_____')

                                                                                

Start_____Task A_____Trail _____Data_____




[('070008',
  {'address:201 CHESTNUT HILL ROAD', 'all_adult_hospital_beds_7_day_avg:65.6', 'all_adult_hospital_beds_7_day_coverage:7',
   'all_adult_hospital_beds_7_day_sum:459', 'all_adult_hospital_inpatient_bed_occupied_7_day_avg:31.6',
   'all_adult_hospital_inpatient_bed_occupied_7_day_coverage:7', 'all_adult_hospital_inpatient_bed_occupied_7_day_sum:221',
   'all_adult_hospital_inpatient_beds_7_day_avg:60.3', 'all_adult_hospital_inpatient_beds_7_day_coverage:7',
   'all_adult_hospital_inpatient_beds_7_day_sum:422', 'ccn:070008', 'city:STAFFORD SPRINGS', 'collection_week:2020/07/31', 'fips_code:09013',
   'geocoded_hospital_address:', 'hhs_ids:[C070008-A]', 'hospital_name:JOHNSON MEMORIAL HOSPITAL', 'hospital_pk:070008',
   'hospital_subtype:Short Term', 'icu_beds_used_7_day_avg:-999999', 'icu_beds_used_7_day_coverage:7', 'icu_beds_used_7_day_sum:16',
   'icu_patients_confirmed_influenza_7_day_avg:', 'icu_patients_confirmed_influenza_7_day_coverage:0', 'icu_patients_confirmed_influ

                                                                                

In [None]:
# Task A (test data): build one set of 'column:value' strings per hospital.

# Parse with csv.reader instead of str.split(',') so that quoted fields
# containing commas stay in one column and line up with the header.
hospital_data_test = hospital_data_test.mapPartitions(lambda lines: csv.reader(lines))

# bc.value is a one-element list holding the parsed header row.
bc = spark.sparkContext.broadcast(hospital_data_test.take(1))

# Drop blank rows and the header itself. The comparison must be against
# bc.value[0] (the header row); comparing with bc.value (a list containing the
# header) is always unequal and leaves the header in the data.
# The broadcast is bound as a default argument so these lazily evaluated
# closures keep using this header even if the global `bc` is rebound later.
hospital_data_test = hospital_data_test.filter(
    lambda row, header_bc=bc: bool(row) and row != header_bc.value[0]
)

# Key every row by its first column and merge all rows of one hospital by
# taking the union of their feature sets.
hospital_data_test = hospital_data_test.mapPartitions(
    lambda rows, header_bc=bc: (
        (row[0], {f'{name}:{value}' for value, name in zip(row, header_bc.value[0])})
        for row in rows
    )
).reduceByKey(lambda x, y: x.union(y))
length = hospital_data_test.count()

Checkpoint_A = ['150034', '050739', '330231', '241326', '070008']
pks = hospital_data_test.filter(lambda x: x[0] in Checkpoint_A)

print('Start_____Task A_____Test_____Data_____')
task_a_Print = pprint.PrettyPrinter(width=150, compact=True, depth = 3)
# Use the printer that was just created (the previous name had different casing).
task_a_Print.pprint(pks.collect())
print('End_____Task A_____Test_____Data_____')

                                                                                

Start_____Task A_____Test _____Data_____




[('070008',
  {'address:201 CHESTNUT HILL ROAD', 'all_adult_hospital_beds_7_day_avg:49.3', 'all_adult_hospital_beds_7_day_avg:50.7',
   'all_adult_hospital_beds_7_day_avg:54.6', 'all_adult_hospital_beds_7_day_avg:55', 'all_adult_hospital_beds_7_day_avg:56.9',
   'all_adult_hospital_beds_7_day_avg:60.3', 'all_adult_hospital_beds_7_day_avg:65.6', 'all_adult_hospital_beds_7_day_avg:68',
   'all_adult_hospital_beds_7_day_coverage:6', 'all_adult_hospital_beds_7_day_coverage:7', 'all_adult_hospital_beds_7_day_sum:330',
   'all_adult_hospital_beds_7_day_sum:345', 'all_adult_hospital_beds_7_day_sum:355', 'all_adult_hospital_beds_7_day_sum:382',
   'all_adult_hospital_beds_7_day_sum:385', 'all_adult_hospital_beds_7_day_sum:398', 'all_adult_hospital_beds_7_day_sum:422',
   'all_adult_hospital_beds_7_day_sum:459', 'all_adult_hospital_beds_7_day_sum:476', 'all_adult_hospital_inpatient_bed_occupied_7_day_avg:23.4',
   'all_adult_hospital_inpatient_bed_occupied_7_day_avg:23.9', 'all_adult_hospital_i

                                                                                

## B. Minhash: Create a “signature” for each hospital
- Use the efficient Minhashing approach to convert the set representation of each hospital into 100 dimensions by using hashes on the set strings.




**Checkpoint B:**
We store the hash functions in a broadcast variable. This setup allows for the configuration of a process where a `reduceByKey`, utilizing sid as the key, is implemented. The purpose of this configuration is to efficiently identify the minimum hashed value corresponding to each 'feat' for every 'sid'.

In [None]:
import math
import random
import pprint

# MinHash parameters shared by every hospital. Signatures are only comparable
# when all of them are produced by the SAME hash functions, so the coefficients
# are drawn once from a seeded generator rather than on every call.
_MINHASH_NUM_HASHES = 100
_MINHASH_PRIME = 2_147_483_647  # 2**31 - 1; a prime modulus keeps x -> (a*x + b) % p well mixed
_minhash_rng = random.Random(550)
_MINHASH_COEFFS = [
    (_minhash_rng.randrange(1, _MINHASH_PRIME), _minhash_rng.randrange(0, _MINHASH_PRIME))
    for _ in range(_MINHASH_NUM_HASHES)
]


def _feature_to_int(feature):
    """Map a feature to a non-negative integer that is stable across processes."""
    # The builtin hash() of a str depends on PYTHONHASHSEED, so a digest is used.
    digest = hashlib.blake2b(str(feature).encode('utf-8'), digest_size=8).digest()
    return int.from_bytes(digest, 'big')


def hash_fun(rdd):
    """Return the MinHash signature of one (hospital_pk, feature_set) record.

    Args:
        rdd: a two-item record; ``rdd[0]`` is the hospital key (not used for
            hashing) and ``rdd[1]`` is an iterable of features.

    Returns:
        A list of ``_MINHASH_NUM_HASHES`` integers. Entry ``i`` is the minimum
        of hash function ``i`` over all features. For an empty feature
        collection every entry is ``_MINHASH_PRIME``, a value no real hash can
        take.
    """
    features = rdd[1]
    # Convert each feature once; the integers are reused by all hash functions.
    feature_ids = [_feature_to_int(feature) for feature in features]

    return [
        min(
            ((a * feature_id + b) % _MINHASH_PRIME for feature_id in feature_ids),
            default=_MINHASH_PRIME,
        )
        for a, b in _MINHASH_COEFFS
    ]

def process_data(data):
    """Attach a signature to every record, print the checkpoint rows, return the RDD.

    Args:
        data: an RDD of records whose first item is the hospital key; each
            record is passed whole to ``hash_fun``.

    Returns:
        The RDD of ``(hospital_key, hash_fun(record))`` pairs.
    """
    wanted_pks = ['150034', '050739', '330231', '241326', '070008']

    def _sign_partition(records):
        # Lazily turn each record of the partition into (key, signature).
        for record in records:
            yield (record[0], hash_fun(record))

    signatures = data.mapPartitions(_sign_partition)

    # Only the checkpoint hospitals are collected to the driver for display.
    checkpoint_rows = signatures.filter(lambda rec: rec[0] in wanted_pks).collect()
    pprint.PrettyPrinter(width=90, compact=True).pprint(checkpoint_rows)

    return signatures

# Task B driver: build the signature RDDs for both datasets.
# The later LSH cell reads `set_signatures_trail` / `set_signatures_test`, so
# those names are bound here; the `custom_*` names are kept as aliases.
print('Start_____Custom Task_____Trail Data:')
set_signatures_trail = process_data(hospital_data_trial)
custom_set_signatures_trail = set_signatures_trail
print('End_____Custom Task_____Trail Data')

print('Start_____Custom Task_____Test Data:')
set_signatures_test = process_data(hospital_data_test)
custom_set_signatures_test = set_signatures_test
print('End_____Custom Task_____Test Data')
# Last expression of the cell: the notebook displays the RDD's repr.
custom_set_signatures_test


Start_____Task B_____Trail Data:


                                                                                

[('150034',
  [184932, 218288, 109046, 60922, 150707, 685705, 45397, 92860, 60878, 19497, 132209,
   61882, 87964, 481933, 38445, 45807, 319960, 191585, 132233, 61869, 110256, 196337,
   90812, 29412, 28815, 218304, 68102, 3691, 38402, 79488, 147455, 65747, 191594, 191628,
   128181, 68131, 61948, 34078, 4051, 369832, 24728, 90849, 191643, 190007, 30479, 130147,
   56767, 184996, 3661, 169297, 60936, 147444, 61872, 481908, 19533, 241677, 34111,
   140692, 45773, 110195, 469321, 19231, 102132, 337927, 272940, 34115, 469365, 319956,
   102112, 98208, 45395, 108314, 70372, 347350, 21036, 61888, 19215, 130136, 4036, 130128,
   90808, 68161, 178412, 30437, 90828, 46409, 3631, 218353, 151260, 166164, 65677, 130144,
   196293, 95997, 56817, 34043, 29398, 135270, 300682, 150692]),
 ('050739',
  [67285, 56799, 300683, 4019, 135265, 178449, 102114, 77426, 79400, 61878, 57615, 50511,
   300655, 38463, 137821, 128199, 347335, 79417, 30997, 30969, 109106, 108306, 19538,
   150723, 266035, 76842, 39



[('150034',
  [151272, 157033, 319973, 151215, 64000, 1886, 140692, 60873, 157033, 19264, 265957,
   70376, 77372, 11596, 319915, 3611, 191602, 236650, 19475, 208574, 30967, 214622,
   266039, 481964, 189995, 171013, 88882, 208574, 92842, 189985, 56738, 70356, 4036,
   218292, 319907, 31017, 236656, 150759, 227573, 76862, 191605, 67289, 272993, 102155,
   214654, 185009, 407859, 88019, 65655, 149484, 151221, 108315, 147411, 4036, 76875,
   67223, 110213, 241704, 32896, 188961, 407856, 48028, 407857, 685749, 96010, 337971,
   191590, 469315, 481911, 19250, 100754, 157003, 347378, 30990, 79435, 337937, 48019,
   132165, 407848, 3994, 140629, 32922, 56766, 168959, 19280, 227618, 108316, 46456,
   407839, 29433, 241724, 32886, 60952, 188933, 151281, 92799, 143518, 19521, 96086,
   128257]),
 ('050739',
  [96003, 208647, 87953, 104325, 275624, 67239, 89253, 61912, 11433, 110243, 298793,
   61903, 34093, 695141, 140623, 38472, 90264, 92862, 77337, 300645, 446015, 10915, 30510,
   96011, 2293

                                                                                

PythonRDD[73] at RDD at PythonRDD.scala:53

## C. Find similar pairs using LSH
- Run LSH to find approximately 20 candidates.



**Checkpoint C:**
We employ Locality-Sensitive Hashing (LSH) to identify roughly 20 hospitals that are similar to each of our specified hospitals: 150034, 050739, 330231, 241326, and 070008. Within the LSH framework, each hospital is conceptualized as a column, with each row holding one value of the signature matrix. To arrive at approximately 20 candidate hospitals, it is necessary to tune the number of bands and the number of rows per band.

In [None]:
import random

# Cache of already computed similarities, keyed by the unordered pair of value sets.
similarity_dict = {}


def calculate_jaccard_similarity(sig_x, sig_y):
    """Return the Jaccard similarity of the distinct values in two sequences.

    Args:
        sig_x: iterable of hashable values.
        sig_y: iterable of hashable values.

    Returns:
        ``|X & Y| / |X | Y|`` as a float, where X and Y are the sets of values.
        When both inputs are empty the result is 0.0 instead of a
        ZeroDivisionError.

    Results are stored in the module-level ``similarity_dict``.
    """
    # NOTE(review): this compares value sets and ignores positions. If the
    # inputs are MinHash signatures, the usual estimator is the fraction of
    # positions on which the two signatures agree - confirm which is intended.
    set_x, set_y = frozenset(sig_x), frozenset(sig_y)

    # A frozenset of the two sets is order-independent and identical for equal
    # sets, unlike str(set), whose element order is not canonical.
    key = frozenset((set_x, set_y))

    cached = similarity_dict.get(key)
    if cached is not None:
        return cached

    union_size = len(set_x | set_y)
    res_jsim = len(set_x & set_y) / union_size if union_size else 0.0
    similarity_dict[key] = res_jsim
    return res_jsim

def LSH(sig, rows_per_band=8):
    """Split a signature into bands and assign each band to a bucket.

    Args:
        sig: a ``(hospital_pk, signature)`` record; ``signature`` is a sequence.
        rows_per_band: number of consecutive signature entries per band.
            Trailing entries that do not fill a whole band are ignored.

    Returns:
        An iterator of ``((band_index, bucket_id), hospital_pk)`` tuples, one
        per full band, suitable for ``flatMap``.

    Raises:
        ValueError: if ``rows_per_band`` is not a positive integer.
    """
    if rows_per_band <= 0:
        raise ValueError('rows_per_band must be a positive integer')

    hospital_pk, signature = sig[0], sig[1]
    buckets = []

    for band_index in range(len(signature) // rows_per_band):
        start = band_index * rows_per_band
        band_text = ' '.join(str(value) for value in signature[start:start + rows_per_band])

        # The bucket must depend only on the band's content: two hospitals
        # with an identical band have to land in the same bucket. Random
        # multipliers/offsets per call would scatter them, and the builtin
        # hash() of a str varies with PYTHONHASHSEED, so a digest is used.
        digest = hashlib.blake2b(band_text.encode('utf-8'), digest_size=8).digest()
        bucket_id = int.from_bytes(digest, 'big') % (10**6)

        buckets.append(((band_index, bucket_id), hospital_pk))

    return iter(buckets)

def print_task_output(signature_set, candidates):
    """Print up to 20 LSH candidates, with similarity, for each checkpoint hospital.

    Args:
        signature_set: RDD of ``(hospital_pk, signature)`` pairs.
        candidates: RDD of ``(hospital_pk, candidate_pks)`` pairs, where
            ``candidate_pks`` is an iterable of hospital keys.
    """
    checkpoint_c_list = ['150034', '050739', '330231', '241326', '070008']
    targets = set(checkpoint_c_list)

    # collect() gives no ordering guarantee, so results are looked up by key
    # rather than paired with checkpoint_c_list by position.
    candidates_by_pk = dict(candidates.filter(lambda x: x[0] in targets).collect())
    signatures_by_pk = dict(signature_set.filter(lambda x: x[0] in targets).collect())

    for hospital_pk in checkpoint_c_list:
        print('Hospital_pk:', hospital_pk)

        if hospital_pk not in signatures_by_pk:
            # Nothing to compare against; skip instead of failing on a lookup.
            print('\tno signature available for this hospital')
            continue

        # Set membership is O(1); the hospital itself is not its own candidate.
        wanted = set(candidates_by_pk.get(hospital_pk, ())) - {hospital_pk}
        candidate_signatures = signature_set.filter(
            lambda x, wanted=wanted: x[0] in wanted
        ).collect()

        # Show at most 20 candidates, numbered from 1.
        for nums, (candidate_pk, candidate_sig) in enumerate(candidate_signatures[:20], start=1):
            print(f'\t{nums} Hospital_pk:', candidate_pk)
            print('\tsimilarity:', calculate_jaccard_similarity(signatures_by_pk[hospital_pk], candidate_sig))
            print('\tsignature matrix:', candidate_sig[0:10], '\n')

# Task C driver: bucket the signatures with LSH and list candidates per hospital.
# The signature RDDs are the ones bound as custom_set_signatures_* in Task B.

# --- Trial data ---
length = hospital_data_trial.count()
# Group hospital keys that share a (band, bucket) key.
trail_hosp_coll = custom_set_signatures_trail.flatMap(LSH).groupByKey()
trail_coll_hosp_data = trail_hosp_coll.mapValues(list)
# For every hospital, merge the members of all buckets it appears in.
trail_candidates = trail_coll_hosp_data.flatMap(
    lambda x: [(item, x[1]) for item in x[1]]
).reduceByKey(lambda a, b: list(set(a + b)))

print('Trial C')
print_task_output(custom_set_signatures_trail, trail_candidates)
print()

# --- Test data ---
# Count the test dataset here (this section previously counted the trial data again).
length = hospital_data_test.count()
test_hosp_coll = custom_set_signatures_test.flatMap(LSH).groupByKey()
test_coll_hosp_data = test_hosp_coll.mapValues(list)
test_candidates = test_coll_hosp_data.flatMap(
    lambda x: [(item, x[1]) for item in x[1]]
).reduceByKey(lambda a, b: list(set(a + b)))

print('Test C')
print_task_output(custom_set_signatures_test, test_candidates)


                                                                                

Trial C


                                                                                

Hospital_pk:  150034


                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [497059, 74743, 64837, 65873, 39780, 263497, 164493, 32958, 30063, 18848] 

	2 Hospital_pk:  131312
	similarity:  0.005050505050505051
	signature matrix:  [71887, 4031, 118635, 397799, 22887, 10570, 321755, 30952, 254997, 181715] 

	3 Hospital_pk:  050739
	similarity:  0.0
	signature matrix:  [65138, 15548, 247612, 65133, 39738, 90205, 337774, 11434, 19207, 69726] 

	4 Hospital_pk:  330194
	similarity:  0.0
	signature matrix:  [518034, 130124, 61934, 150696, 57666, 518059, 227597, 77365, 227638, 22941] 

	5 Hospital_pk:  521357
	similarity:  0.005076142131979695
	signature matrix:  [323589, 166178, 70331, 92854, 191591, 1866, 355365, 19274, 355357, 87999] 

	6 Hospital_pk:  100051
	similarity:  0.005025125628140704
	signature matrix:  [270521, 347379, 130134, 193190, 96002, 247688, 10915, 191633, 237164, 140642] 

	7 Hospital_pk:  271323
	similarity:  0.0
	signature matrix:  [88021, 110247, 77363, 19249, 135317, 166222,

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [11784, 89594, 38634, 124746, 163422, 22044, 37648, 11372, 138718, 193096] 

	2 Hospital_pk:  131312
	similarity:  0.005076142131979695
	signature matrix:  [51458, 87956, 270478, 15481, 147396, 51477, 140680, 67208, 316923, 193172] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [28827, 151238, 169349, 21081, 22718, 32880, 241692, 272982, 214600, 29440] 

	4 Hospital_pk:  330194
	similarity:  0.0
	signature matrix:  [88903, 19206, 77351, 214641, 44068, 61871, 88913, 57597, 555475, 181638] 

	5 Hospital_pk:  521357
	similarity:  0.0
	signature matrix:  [481967, 171068, 4037, 323595, 685765, 107276, 193193, 236655, 96031, 32879] 

	6 Hospital_pk:  100051
	similarity:  0.005050505050505051
	signature matrix:  [109032, 321686, 270477, 270446, 87994, 60875, 270480, 43998, 15525, 270485] 

	7 Hospital_pk:  271323
	similarity:  0.0
	signature matrix:  [189987, 695186, 4043, 208617, 46407, 57669, 130120, 397800,

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [107444, 11806, 21963, 209130, 48054, 43961, 48075, 19334, 38649, 163419] 

	2 Hospital_pk:  131312
	similarity:  0.0
	signature matrix:  [140639, 4004, 140707, 15542, 137802, 1853, 300596, 77411, 46591, 12053] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [30459, 132201, 149421, 265960, 190009, 67219, 178482, 76842, 308954, 70331] 

	4 Hospital_pk:  050739
	similarity:  0.005025125628140704
	signature matrix:  [65103, 39764, 46480, 128245, 95994, 19208, 158748, 264391, 4015, 151236] 

	5 Hospital_pk:  330194
	similarity:  0.005025125628140704
	signature matrix:  [36022, 300620, 321730, 237222, 321755, 166209, 135317, 370821, 584514, 31019] 

	6 Hospital_pk:  521357
	similarity:  0.005025125628140704
	signature matrix:  [140612, 46455, 177660, 19249, 130208, 118602, 65681, 574707, 100207, 272995] 

	7 Hospital_pk:  100051
	similarity:  0.005025125628140704
	signature matrix:  [21057, 19532, 57600, 1352

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [43596, 54098, 79506, 54905, 164502, 54890, 128547, 120783, 191392, 163433] 

	2 Hospital_pk:  131312
	similarity:  0.01020408163265306
	signature matrix:  [140662, 227686, 34623, 227574, 45785, 22900, 193182, 35942, 237160, 584523] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [64018, 57620, 130126, 140645, 143518, 120905, 19249, 108359, 128203, 61870] 

	4 Hospital_pk:  050739
	similarity:  0.0
	signature matrix:  [96028, 108294, 39711, 69724, 1804, 39702, 110195, 22699, 110241, 109117] 

	5 Hospital_pk:  330194
	similarity:  0.0
	signature matrix:  [208584, 304390, 140695, 24027, 35969, 179788, 11641, 38397, 555416, 300619] 

	6 Hospital_pk:  521357
	similarity:  0.0
	signature matrix:  [4067, 177736, 77337, 168952, 70385, 685738, 32882, 135297, 57638, 30504] 

	7 Hospital_pk:  100051
	similarity:  0.0
	signature matrix:  [128227, 275553, 107230, 298884, 108357, 39685, 89193, 15504, 193135, 191626] 

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [134481, 28007, 43584, 65881, 128532, 142726, 21683, 95740, 12254, 12344] 

	2 Hospital_pk:  131312
	similarity:  0.005050505050505051
	signature matrix:  [23970, 22620, 319924, 208579, 130117, 10591, 3633, 24036, 208590, 45829] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [128221, 169366, 191665, 79451, 110190, 4001, 61918, 45431, 167624, 98133] 

	4 Hospital_pk:  050739
	similarity:  0.005025125628140704
	signature matrix:  [44015, 140609, 109079, 20472, 279583, 30126, 279523, 19484, 237432, 60908] 

	5 Hospital_pk:  330194
	similarity:  0.005050505050505051
	signature matrix:  [181625, 11591, 193159, 275570, 92889, 227583, 19295, 11995, 4026, 214595] 

	6 Hospital_pk:  521357
	similarity:  0.0
	signature matrix:  [193177, 43979, 151235, 63948, 65710, 140704, 77394, 11573, 51492, 140687] 

	7 Hospital_pk:  100051
	similarity:  0.0
	signature matrix:  [137776, 22891, 104333, 104324, 24751, 1860, 6951

                                                                                

Test C


                                                                                

Hospital_pk:  150034


                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [95744, 263480, 21715, 21988, 54060, 261432, 131805, 164732, 117254, 164495] 

	2 Hospital_pk:  131312
	similarity:  0.0
	signature matrix:  [469296, 181692, 65093, 407873, 15469, 110204, 130182, 193177, 214653, 87970] 

	3 Hospital_pk:  050739
	similarity:  0.005050505050505051
	signature matrix:  [319893, 22905, 56746, 96074, 178464, 109044, 24694, 9634, 56815, 92828] 

	4 Hospital_pk:  330194
	similarity:  0.0
	signature matrix:  [347340, 60895, 22929, 407903, 28814, 208645, 270463, 147389, 67251, 214654] 

	5 Hospital_pk:  521357
	similarity:  0.005050505050505051
	signature matrix:  [19223, 171059, 214634, 136529, 109046, 100283, 147393, 167621, 76808, 46450] 

	6 Hospital_pk:  100051
	similarity:  0.005050505050505051
	signature matrix:  [128395, 110207, 87999, 19199, 30991, 347418, 48014, 48003, 128388, 4056] 

	7 Hospital_pk:  271323
	similarity:  0.0
	signature matrix:  [88841, 67200, 150751, 57598, 237405, 574

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [18243, 39764, 24058, 54969, 42073, 56598, 64807, 74743, 131767, 286188] 

	2 Hospital_pk:  131312
	similarity:  0.0
	signature matrix:  [46495, 15547, 22644, 22596, 277754, 150730, 517972, 140707, 4009, 22571] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [369898, 135272, 96005, 191569, 88901, 30510, 191623, 167588, 135313, 65057] 

	4 Hospital_pk:  330194
	similarity:  0.005050505050505051
	signature matrix:  [61918, 370753, 9629, 100791, 167785, 157061, 65082, 46434, 275585, 67231] 

	5 Hospital_pk:  521357
	similarity:  0.01015228426395939
	signature matrix:  [337922, 149439, 135237, 685785, 96055, 147472, 227605, 316921, 76796, 469340] 

	6 Hospital_pk:  271323
	similarity:  0.005076142131979695
	signature matrix:  [321719, 584478, 92828, 30450, 70332, 355287, 181638, 51452, 321767, 46455] 

	7 Hospital_pk:  020024
	similarity:  0.0
	signature matrix:  [446019, 179152, 218268, 128379, 150695, 3379

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [28045, 11456, 234542, 11055, 65920, 65924, 50171, 152477, 27987, 261396] 

	2 Hospital_pk:  131312
	similarity:  0.0
	signature matrix:  [321709, 38453, 237199, 22564, 107270, 47988, 110208, 150729, 15511, 140665] 

	3 Hospital_pk:  150034
	similarity:  0.01015228426395939
	signature matrix:  [44031, 196348, 270478, 128378, 61888, 481949, 135284, 147383, 1856, 481909] 

	4 Hospital_pk:  050739
	similarity:  0.005025125628140704
	signature matrix:  [4036, 38495, 10907, 11352, 61953, 11563, 70371, 264350, 34063, 90173] 

	5 Hospital_pk:  330194
	similarity:  0.00510204081632653
	signature matrix:  [3989, 128398, 252996, 10606, 218325, 584512, 181637, 60893, 128384, 167773] 

	6 Hospital_pk:  521357
	similarity:  0.0
	signature matrix:  [104384, 24776, 177641, 469279, 92886, 128384, 88859, 214469, 107257, 67283] 

	7 Hospital_pk:  100051
	similarity:  0.005076142131979695
	signature matrix:  [30967, 10876, 316930, 634650,

                                                                                

	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [131768, 134506, 104733, 43964, 138750, 124774, 28059, 64826, 23550, 33009] 

	2 Hospital_pk:  131312
	similarity:  0.01015228426395939
	signature matrix:  [237390, 67271, 10590, 65153, 316950, 218290, 227703, 83911, 3666, 104378] 

	3 Hospital_pk:  150034
	similarity:  0.0
	signature matrix:  [104333, 407830, 308909, 147425, 149421, 685708, 24718, 169314, 265967, 218341] 

	4 Hospital_pk:  050739
	similarity:  0.0
	signature matrix:  [76817, 149468, 43991, 22710, 151233, 61865, 30071, 266014, 11631, 98170] 

	5 Hospital_pk:  330194
	similarity:  0.010256410256410256
	signature matrix:  [60864, 24770, 237425, 19444, 214572, 319975, 157069, 237162, 118617, 12045] 

	6 Hospital_pk:  521357
	similarity:  0.01020408163265306
	signature matrix:  [130181, 265982, 181640, 150776, 151214, 208631, 19270, 237185, 238428, 15509] 

	7 Hospital_pk:  100051
	similarity:  0.010101010101010102
	signature matrix:  [38440, 20472, 384975,



	1 Hospital_pk:  hospital_pk
	similarity:  0.0
	signature matrix:  [37624, 12513, 28005, 56019, 10701, 57976, 56442, 428801, 59276, 138772] 

	2 Hospital_pk:  131312
	similarity:  0.0
	signature matrix:  [96027, 119864, 227622, 119884, 45833, 275560, 300667, 518066, 255038, 70332] 

	3 Hospital_pk:  150034
	similarity:  0.005050505050505051
	signature matrix:  [136547, 32887, 9636, 28812, 196301, 337921, 128386, 3606, 65708, 88001] 

	4 Hospital_pk:  050739
	similarity:  0.0
	signature matrix:  [30438, 265989, 30936, 188969, 196326, 69755, 109030, 110268, 34036, 149436] 

	5 Hospital_pk:  330194
	similarity:  0.0
	signature matrix:  [270527, 59974, 214596, 214664, 22941, 88833, 23963, 71940, 48006, 518060] 

	6 Hospital_pk:  521357
	similarity:  0.005025125628140704
	signature matrix:  [178421, 29419, 347350, 105622, 30526, 60918, 30947, 265969, 4069, 181656] 

	7 Hospital_pk:  271323
	similarity:  0.005025125628140704
	signature matrix:  [237166, 60905, 377783, 60910, 109046, 323634, 

                                                                                