In [68]:
import pandas as pd
import pickle
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np

In [51]:
# Pull a random subset of the county-level COVID collection into a DataFrame.
collection = 'covid_county_formatted'
no_of_records = 1064950  # presumably the collection's total document count -- TODO confirm

# NOTE: `db` holds the MongoClient itself; the database is picked via `.sustaindb` below.
db = MongoClient("lattice-100", 27018)

# $sample draws documents at random, so every run returns a different set of
# `no_of_records // 10` documents.
pipeline = [{"$sample": {"size": no_of_records // 10}}]

cursor = db.sustaindb[collection].aggregate(pipeline)
df_original = pd.DataFrame(list(cursor))

In [52]:
# Keep only the two columns used below: the GISJOIN identifier and the case count.
df = df_original.loc[:, ["GISJOIN", "cases"]]
df

Unnamed: 0,GISJOIN,cases
0,G2001650,0
1,G0801170,17
2,G1600870,5
3,G5500630,29
4,G3600790,67
...,...,...
106490,G1700130,0
106491,G5300730,5
106492,G0600410,10
106493,G2001030,0


## Sum 'cases' for each GISJOIN

In [77]:
# Sum the sampled case counts per GISJOIN; the result is a one-column frame
# indexed by GISJOIN.
df1 = df.groupby("GISJOIN")["cases"].sum().to_frame()
dfX = df1['cases']
df1

Unnamed: 0_level_0,cases
GISJOIN,Unnamed: 1_level_1
G0100010,546
G0100030,2751
G0100050,192
G0100070,223
G0100090,437
...,...
G5600370,397
G5600390,273
G5600410,124
G5600430,72


In [78]:
# Turn the case totals into an (n_samples, 1) column array, i.e. one feature per row.
dfX = np.reshape(np.array(dfX), (-1, 1))

In [79]:
# Cluster the per-GISJOIN case totals into 56 groups.
# fit_predict() trains the model and returns the labels in one pass; calling
# fit() first and fit_predict() afterwards (as before) trained the model twice
# for the same result, since random_state is fixed.
kmeans = KMeans(n_clusters=56, random_state=0)
df_predict = kmeans.fit_predict(dfX)
centroids = kmeans.cluster_centers_  # one row per cluster, one column per feature

# Labels come back in the row order of dfX, which was built from df1.
df1['cluster_id'] = df_predict
df1

Unnamed: 0_level_0,cases,cluster_id
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1
G0100010,546,23
G0100030,2751,37
G0100050,192,43
G0100070,223,43
G0100090,437,53
...,...,...
G5600370,397,53
G5600390,273,34
G5600410,124,9
G5600430,72,22


In [109]:
# Sanity check: number of centroid rows (one per cluster).
centroids.shape[0]

56

In [106]:
# Look up, for every row of df1, the centroid value of its assigned cluster.
cluster_ids_list = df1['cluster_id']

# One NumPy fancy-index instead of a Python loop (the old loop variable `id`
# also shadowed the builtin for the rest of the session). Column 0 is the only
# feature, the case total.
centroids_column = centroids[cluster_ids_list.to_numpy(), 0].tolist()

print(len(centroids_column))

3115

In [107]:
# Store each row's cluster-centroid value next to its case total
# (centroids_column is in df1 row order).
df1['centroid'] = centroids_column

In [114]:
# Absolute gap between each GISJOIN's case total and its cluster centroid.
df1['distance'] = (df1['cases'] - df1['centroid']).abs()
df1

Unnamed: 0_level_0,cases,cluster_id,centroid,distance
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G0100010,546,23,505.491935,40.508065
G0100030,2751,37,2857.650000,106.650000
G0100050,192,43,190.920000,1.080000
G0100070,223,43,190.920000,32.080000
G0100090,437,53,411.383648,25.616352
...,...,...,...,...
G5600370,397,53,411.383648,14.383648
G5600390,273,34,260.253275,12.746725
G5600410,124,9,126.590206,2.590206
G5600430,72,22,72.209302,0.209302


In [188]:
# Flatten to a plain table: GISJOIN moves from the index into a regular column,
# keeping only the cluster assignment and the distance to the centroid.
df2 = df1.loc[:, ["cluster_id", "distance"]].reset_index()
df2

Unnamed: 0,GISJOIN,cluster_id,distance
0,G0100010,23,40.508065
1,G0100030,37,106.650000
2,G0100050,43,1.080000
3,G0100070,43,32.080000
4,G0100090,53,25.616352
...,...,...,...
3110,G5600370,53,14.383648
3111,G5600390,34,12.746725
3112,G5600410,9,2.590206
3113,G5600430,22,0.209302


### Tag parent GISJOINs (the member closest to each cluster's centroid)

In [219]:
# Smallest centroid distance per cluster (reused below to build df_clusters).
df2_gr = df2.groupby('cluster_id')['distance'].min().reset_index()

# The "parent" of a cluster is the GISJOIN closest to its centroid.
# idxmin() returns the row label of the first minimum in each group, so ties
# resolve to the first matching row of df2 -- the same row the previous
# filter-by-equality loop picked -- and the result stays ordered by cluster_id.
closest_rows = df2.groupby('cluster_id')['distance'].idxmin()
parents = df2.loc[closest_rows, 'GISJOIN'].tolist()

print(len(parents))

56


In [221]:
# Per-cluster summary: nearest and farthest member distance from the centroid.
max_per_cluster = df2.groupby('cluster_id')['distance'].max().reset_index()

# Both frames are indexed 0..k-1 in cluster_id order, so the column aligns row by row.
df_clusters = (
    df2_gr
    .rename(columns={'distance': 'min_distance'})
    .assign(max_distance=max_per_cluster['distance'])
)
print(df_clusters.shape)
df_clusters.head()

(56, 3)


Unnamed: 0,cluster_id,min_distance,max_distance
0,0,0.447368,38.447368
1,1,0.0,0.0
2,2,0.0,0.0
3,3,106.6,180.4
4,4,241.5,241.5


In [225]:
# Flag the GISJOINs selected as cluster parents: 1 for a parent, 0 otherwise.
# A single isin() replaces one full-frame scan per parent, and no longer needs
# a frame-wide fillna(0), which would also have zeroed NaNs in any other column.
# The flag is kept as float (1.0 / 0.0), matching what the previous
# assign-then-fillna produced, so the exported CSV is unchanged.
df2['is_parent'] = df2['GISJOIN'].isin(parents).astype(float)
df2.head()

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent
0,G0100010,23,40.508065,0
1,G0100030,37,106.65,0
2,G0100050,43,1.08,0
3,G0100070,43,32.08,0
4,G0100090,53,25.616352,0


In [238]:
# NOTE: df3 is an alias of df2, not a copy -- the column added below is
# visible through both names.
df3 = df2

# Bring each row's cluster bounds alongside it in one vectorized pass instead
# of filtering df_clusters and re-scanning df3 once per row.
bounds = df_clusters.set_index('cluster_id')
row_min = df3['cluster_id'].map(bounds['min_distance'])
row_max = df3['cluster_id'].map(bounds['max_distance'])
spread = row_max - row_min

# Min-max scale the distance within its cluster: 0 for the closest member,
# 1 for the farthest. Clusters whose members are all equidistant (spread == 0,
# e.g. single-member clusters) get 0 rather than a division by zero.
df3['frac_distance'] = ((df3['distance'] - row_min) / spread).where(spread != 0, 0)

In [242]:
# Bounds of the per-GISJOIN sampling rate (stored as fractions, not percentages).
sample_min = 0.05
sample_max = 0.25

# Linear interpolation: frac_distance 0 maps to sample_min, 1 maps to sample_max.
sample_span = sample_max - sample_min
df3['sample_percent'] = df3['frac_distance'] * sample_span + sample_min
df3

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent,frac_distance,sample_percent
0,G0100010,23,40.508065,0,0.869916,0.223983
1,G0100030,37,106.650000,0,0.751708,0.200342
2,G0100050,43,1.080000,0,0.029412,0.055882
3,G0100070,43,32.080000,0,0.941176,0.238235
4,G0100090,53,25.616352,0,0.545776,0.159155
...,...,...,...,...,...,...
3110,G5600370,53,14.383648,0,0.302816,0.110563
3111,G5600390,34,12.746725,0,0.351993,0.120399
3112,G5600410,9,2.590206,0,0.070336,0.064067
3113,G5600430,22,0.209302,0,0.000000,0.050000


In [243]:
# Persist the clustering table. df_final is another name for df3, not a copy.
df_final = df3

# index=True is the pandas default, spelled out here: the row index is written
# as an unnamed first column of the CSV.
df_final.to_csv('~/ucc-21/clusters-covid.csv', index=True)