In [1]:
import pandas as pd
import pickle
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np

In [2]:
# Source collection and connection settings.
collection = 'covid_county_formatted'
db = MongoClient("lattice-100", 27018)  # NOTE: despite the name, this is the client, not a database

# NOTE(review): presumably the document count of the collection - confirm.
no_of_records = 1064950

# Single-stage aggregation: ask the server for a sample one fifth the size of
# `no_of_records`.
# NOTE(review): $sample is not seeded here, so the rows (and every downstream
# result) presumably differ between runs - confirm.
pipeline = [{"$sample": {"size": no_of_records // 5}}]

# Materialise the sampled documents into a DataFrame.
cursor = db["sustaindb"][collection].aggregate(pipeline)
df_original = pd.DataFrame([document for document in cursor])

In [3]:
# Keep only the county identifier and the case count.
df = df_original.loc[:, ["GISJOIN", "cases"]]
df

Unnamed: 0,GISJOIN,cases
0,G1800270,0
1,G5300350,24
2,G2001030,11
3,G5100310,5
4,G1300570,35
...,...,...
212985,G3600250,0
212986,G4701890,82
212987,G3700210,29
212988,G3601110,1


## Aggregate 'cases' for each GISJOIN

In [4]:
# Total cases per GISJOIN; the GISJOIN values become the index of df1.
cases_by_gisjoin = df.groupby("GISJOIN")["cases"].sum()
df1 = cases_by_gisjoin.to_frame()

# 1-D view of the summed cases, reshaped for clustering in the next cell.
dfX = df1['cases']
df1

Unnamed: 0_level_0,cases
GISJOIN,Unnamed: 1_level_1
G0100010,1232
G0100030,4460
G0100050,422
G0100070,372
G0100090,1181
...,...
G5600370,758
G5600390,688
G5600410,476
G5600430,181


In [5]:
# Turn the case counts into a single-feature matrix of shape (n_rows, 1).
dfX = np.reshape(np.array(dfX), (-1, 1))

In [6]:
# Cluster the per-county case totals into 56 groups.
# fit_predict() fits the model and returns the labels in one call, so a separate
# .fit() beforehand only repeats the same (seeded, hence identical) computation.
kmeans = KMeans(n_clusters=56, random_state=0)
df_predict = kmeans.fit_predict(dfX)

# One centroid per cluster; with a single feature each centroid is a 1-element row.
centroids = kmeans.cluster_centers_

# Label every county with the cluster it was assigned to.
df1['cluster_id'] = df_predict
df1

Unnamed: 0_level_0,cases,cluster_id
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1
G0100010,1232,54
G0100030,4460,45
G0100050,422,40
G0100070,372,40
G0100090,1181,54
...,...,...
G5600370,758,20
G5600390,688,28
G5600410,476,53
G5600430,181,37


In [7]:
# Number of centroids (rows of the cluster-centre array).
centroids.shape[0]

56

In [8]:
# Look up, for every row of df1, the centroid value of the cluster it belongs to.
# A single fancy-index lookup replaces the Python loop; this also avoids binding
# the loop variable `id`, which would shadow the builtin id() for the rest of the
# kernel session.
cluster_ids_list = df1['cluster_id']
centroids_column = centroids[cluster_ids_list.to_numpy(), 0].tolist()

print(len(centroids_column))

3115


In [9]:
# Attach each row's cluster centroid value (built in the previous cell) to df1.
df1['centroid'] = centroids_column

In [10]:
# Absolute gap between a county's case total and its cluster centroid.
df1['distance'] = (df1['cases'] - df1['centroid']).abs()
df1

Unnamed: 0_level_0,cases,cluster_id,centroid,distance
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G0100010,1232,54,1255.655172,23.655172
G0100030,4460,45,4211.925926,248.074074
G0100050,422,40,397.490196,24.509804
G0100070,372,40,397.490196,25.490196
G0100090,1181,54,1255.655172,74.655172
...,...,...,...,...
G5600370,758,20,791.022556,33.022556
G5600390,688,28,650.358974,37.641026
G5600410,476,53,515.933333,39.933333
G5600430,181,37,163.015152,17.984848


In [11]:
# Slim working frame: GISJOIN moves from the index into a regular column,
# alongside the cluster label and the distance to the centroid.
df2 = df1.loc[:, ["cluster_id", "distance"]].reset_index()
df2

Unnamed: 0,GISJOIN,cluster_id,distance
0,G0100010,54,23.655172
1,G0100030,45,248.074074
2,G0100050,40,24.509804
3,G0100070,40,25.490196
4,G0100090,54,74.655172
...,...,...,...
3110,G5600370,20,33.022556
3111,G5600390,28,37.641026
3112,G5600410,53,39.933333
3113,G5600430,37,17.984848


### Tag parent GISJOINs

In [12]:
# Smallest centroid distance per cluster (one row per cluster_id, ascending).
df2_gr = df2.groupby('cluster_id')['distance'].min().reset_index()

# The "parent" of a cluster is the GISJOIN closest to the centroid.
# idxmin() returns the label of the first row attaining the minimum within each
# group, i.e. the same single pick as filtering on the minimum and taking
# element [0]. It avoids rescanning df2 for every cluster and the int->float
# upcast that iterrows() applies to cluster_id.
closest_row_labels = df2.groupby('cluster_id')['distance'].idxmin()
parents = df2.loc[closest_row_labels, 'GISJOIN'].tolist()

print(len(parents))

56


In [13]:
# Per-cluster distance bounds: min_distance comes from df2_gr, max_distance is
# computed here; both are ordered by cluster_id and share a 0..k-1 index.
max_by_cluster = df2.groupby('cluster_id')['distance'].max().reset_index(drop=True)
df_clusters = (
    df2_gr
    .rename(columns={'distance': 'min_distance'})
    .assign(max_distance=max_by_cluster)
)
print(df_clusters.shape)
df_clusters.head()

(56, 3)


Unnamed: 0,cluster_id,min_distance,max_distance
0,0,0.973333,125.973333
1,1,0.0,0.0
2,2,258.0,258.0
3,3,32.941176,335.058824
4,4,153.333333,405.666667


In [14]:
# Flag the rows whose GISJOIN was selected as a cluster parent (1.0) versus all
# other rows (0.0). Computing the flag in one shot:
#   * makes the cell idempotent (stale flags from a previous run are overwritten),
#   * avoids a frame-wide fillna(0), which would also hide NaNs in other columns,
#   * replaces one full-column scan per parent with a single membership test.
# The float dtype matches what the NaN-then-fill approach produced.
df2 = df2.assign(is_parent=df2['GISJOIN'].isin(parents).astype(float))
df2.head()

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent
0,G0100010,54,23.655172,0.0
1,G0100030,45,248.074074,0.0
2,G0100050,40,24.509804,0.0
3,G0100070,40,25.490196,0.0
4,G0100090,54,74.655172,0.0


In [15]:
# Work on a copy: plain assignment would only alias df2, so the columns added
# from here on would silently appear on df2 as well.
df3 = df2.copy()

# Broadcast each cluster's min/max distance onto its member rows in one pass
# instead of filtering df_clusters (and df3) once per row.
cluster_bounds = df_clusters.set_index('cluster_id')
row_min_distance = df3['cluster_id'].map(cluster_bounds['min_distance'])
row_max_distance = df3['cluster_id'].map(cluster_bounds['max_distance'])
distance_span = row_max_distance - row_min_distance

# Position of the row between its cluster's closest (0) and farthest (1) member.
# Clusters where max == min have no spread; their fraction is defined as 0
# rather than dividing by zero.
df3['frac_distance'] = (
    (df3['distance'] - row_min_distance) / distance_span
).where(distance_span != 0, 0)

In [16]:
# Bounds of the per-county sampling rate.
sample_min, sample_max = 0.05, 0.25

# Linear interpolation: frac_distance 0 -> sample_min, frac_distance 1 -> sample_max.
sample_span = sample_max - sample_min
df3['sample_percent'] = df3['frac_distance'] * sample_span + sample_min
df3

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent,frac_distance,sample_percent
0,G0100010,54,23.655172,0.0,0.217618,0.093524
1,G0100030,45,248.074074,0.0,1.000000,0.250000
2,G0100050,40,24.509804,0.0,0.407112,0.131422
3,G0100070,40,25.490196,0.0,0.423729,0.134746
4,G0100090,54,74.655172,0.0,0.700163,0.190033
...,...,...,...,...,...,...
3110,G5600370,20,33.022556,0.0,0.457438,0.141488
3111,G5600390,28,37.641026,0.0,0.536232,0.157246
3112,G5600410,53,39.933333,0.0,0.595025,0.169005
3113,G5600430,37,17.984848,0.0,0.315426,0.113085


In [17]:
# Distinct GISJOINs flagged as parents in df3.
parent_rows = df3['is_parent'] == 1
unique_parents1 = df3.loc[parent_rows, 'GISJOIN'].unique().tolist()
print(len(unique_parents1))

56


In [18]:
# One row per parent, ordered by cluster_id, with a clean 0..k-1 index.
df_cluster_parents = (
    df3.loc[df3['is_parent'] == 1, ['cluster_id', 'GISJOIN']]
    .sort_values('cluster_id')
    .reset_index(drop=True)
)

# cluster_id -> GISJOIN of that cluster's parent.
cluster_parent_map = dict(
    zip(df_cluster_parents['cluster_id'], df_cluster_parents['GISJOIN'])
)

In [19]:
# Display the cluster ids that have a parent assigned.
cluster_parent_map.keys()

dict_keys([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55])

## Find Parent for each Child

In [20]:
# Every non-parent row is a "child"; map each child GISJOIN to the parent
# GISJOIN of its cluster.
# The cluster ids are read alongside the GISJOINs in a single pass, instead of
# re-filtering df3 once per child. A cluster id without a parent still raises
# KeyError, as a direct dictionary lookup should.
child_rows = df3.loc[df3['is_parent'] == 0, ['GISJOIN', 'cluster_id']]
children = child_rows['GISJOIN'].tolist()

child_parent_map = {
    child_gisjoin: cluster_parent_map[child_cluster_id]
    for child_gisjoin, child_cluster_id in zip(children, child_rows['cluster_id'])
}

In [21]:
# Number of children that received a parent.
len(child_parent_map)

3059

In [22]:
# Spot check: parent assigned to one specific child GISJOIN.
# NOTE(review): this raises KeyError if 'G5600150' is not a key of
# child_parent_map (e.g. if it was itself picked as a parent) - confirm the key
# is expected to be a child on every run.
child_parent_map['G5600150']

'G2200910'

In [23]:
# Serialize child_parent_map. The context manager guarantees the file is flushed
# and closed even if pickling fails part-way.
with open('ucc-21/covid_child_parent_map.pkl', 'wb') as pkl_file:
    pickle.dump(child_parent_map, pkl_file)

In [24]:
# Tabulate the child -> parent map (CSV export is currently disabled below).
# NOTE: `children` and `parents` are rebound here; from this cell on, `parents`
# holds one entry per child rather than one entry per cluster.
children = list(child_parent_map)
parents = list(child_parent_map.values())

df_child_parent = pd.DataFrame(
    list(child_parent_map.items()),
    columns=['child', 'parent'],
)
# df_child_parent.to_csv('ucc-21/covid_child_parent_map.csv', index=False)

In [27]:
# Child Parent Map Sanity Check
unique_children = list(set(children))
unique_parents2 = list(set(parents))

print(len(unique_children))
print(len(unique_parents2))

# NOTE(review): pickle.load can execute arbitrary code from the file; only load
# artefacts produced by a trusted run.
# The context manager closes the file handle, which a bare open() never did.
with open('ucc-21/temp_child_list.pkl', 'rb') as pkl_file:
    temp_child_list = pickle.load(pkl_file)

loaded_parents = ['G0400130', 'G0400190', 'G0400270', 'G0500590', 'G0500690', 'G0501030', 'G0600370', 'G0600590', 'G0600650', 'G0600710', 'G0600730', 'G0800050', 'G0800150', 'G0900090', 'G1200090', 'G1200110', 'G1200170', 'G1200830', 'G1201050', 'G1301350', 'G1700310', 'G1700430', 'G1701110', 'G1800970', 'G1900610', 'G1901010', 'G2405100', 'G2601210', 'G2700810', 'G2900950', 'G3000130', 'G3200030', 'G3300150', 'G3400030', 'G3500010', 'G3600510', 'G3600590', 'G3600910', 'G3900170', 'G3900350', 'G3900410', 'G4001090', 'G4200030', 'G4201010', 'G4600990', 'G4800270', 'G4800290', 'G4800850', 'G4801090', 'G4801130', 'G4801570', 'G4802010', 'G4802150', 'G4900350', 'G5100090', 'G5400550']
# Set copy for constant-time membership tests inside the loop; the list itself
# is kept because later cells reference `loaded_parents`.
loaded_parents_lookup = set(loaded_parents)

# yes:    child's parent is among loaded_parents
# no:     child has a parent, but it is not among loaded_parents
# errors: child is missing from child_parent_map
yes = 0
no = 0
errors = 0
for child in temp_child_list:
    try:
        parent = child_parent_map[child]
    except KeyError:
        # Only a missing key counts as an "error"; anything else should surface
        # instead of being swallowed by a bare except.
        errors += 1
        continue
    if parent in loaded_parents_lookup:
        yes += 1
    else:
        no += 1

print(f'#yes: {yes}')
print(f'#no: {no}')
print(f'#errors: {errors}')

3059
49
#yes: 14
#no: 1785
#errors: 18


In [29]:
# How many of the previously loaded parents are not parents in the current map.
len(set(loaded_parents).difference(unique_parents2))

49

In [None]:
# Parents flagged in df3 that never appear as a value of child_parent_map.
len(set(unique_parents1).difference(unique_parents2))

In [None]:
# Final per-county table. This is an alias of df3, not a copy, so later changes
# to either name affect both. The CSV export below is currently disabled.
df_final = df3
# df_final.to_csv('~/ucc-21/clusters-covid.csv')