In [1]:
import pandas as pd
import pickle
from pymongo import MongoClient
from sklearn.cluster import KMeans
from sklearn.metrics import pairwise_distances_argmin_min
import numpy as np

In [2]:
# Source collection and the record count used to size the random sample.
collection = 'covid_county_formatted'
no_of_records = 1064950

# NOTE: despite its name, `db` holds the MongoClient; the database itself is
# reached through the `sustaindb` attribute below.
db = MongoClient("lattice-100", 27018)

# A single $sample stage asking for one fifth of `no_of_records` documents.
# The draw is random, so the rows returned differ from run to run.
pipeline = [{"$sample": {"size": no_of_records // 5}}]

# Materialise the whole cursor before handing it to pandas.
cursor = db.sustaindb[collection].aggregate(pipeline)
df_original = pd.DataFrame(list(cursor))

In [3]:
# Keep only the GISJOIN key and its case count; every other field is dropped.
df = df_original.loc[:, ["GISJOIN", "cases"]]
df

Unnamed: 0,GISJOIN,cases
0,G3901190,55
1,G4001390,9
2,G1302690,1
3,G2101830,2
4,G3000310,3
...,...,...
212985,G3700030,22
212986,G4801970,2
212987,G5400210,1
212988,G2801410,7


## Sum 'cases' for each GISJOIN

In [4]:
# Total the sampled case counts per GISJOIN; the resulting frame is indexed
# by GISJOIN and has a single 'cases' column.
cases_per_gisjoin = df.groupby("GISJOIN")["cases"].sum()
df1 = cases_per_gisjoin.to_frame()

# Feature series that is reshaped for clustering in the next cell.
dfX = df1['cases']
df1

Unnamed: 0_level_0,cases
GISJOIN,Unnamed: 1_level_1
G0100010,1240
G0100030,3685
G0100050,494
G0100070,574
G0100090,1391
...,...
G5600370,708
G5600390,396
G5600410,365
G5600430,170


In [5]:
# Turn the case totals into a single-column 2-D array (one row per GISJOIN).
dfX = np.reshape(np.array(dfX), (-1, 1))

In [6]:
# Cluster the per-GISJOIN case totals into 56 groups.
# fit_predict() trains the model AND returns the labels, so the estimator is
# only constructed here; calling .fit() first would train it a second time
# for the same result (random_state is fixed).
kmeans = KMeans(n_clusters=56, random_state=0)
df_predict = kmeans.fit_predict(dfX)

# One centre per cluster, in the same single-feature space as dfX.
centroids = kmeans.cluster_centers_

# df_predict is ordered like dfX, which was built from df1's rows.
df1['cluster_id'] = df_predict
df1

Unnamed: 0_level_0,cases,cluster_id
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1
G0100010,1240,39
G0100030,3685,37
G0100050,494,47
G0100070,574,22
G0100090,1391,29
...,...,...
G5600370,708,22
G5600390,396,0
G5600410,365,0
G5600430,170,49


In [7]:
# Sanity check: number of cluster centres produced by the fitted model.
len(centroids)

56

In [8]:
# Look up, for every row, the centre value of the cluster it was assigned to.
cluster_ids_list = df1['cluster_id']

# Index the centroid array with all labels at once (column 0 is the only
# feature). This replaces a Python loop whose variable was named `id`, which
# shadowed the builtin for the rest of the kernel session.
centroids_column = centroids[cluster_ids_list.to_numpy(), 0].tolist()

print(len(centroids_column))

3115


In [9]:
# Attach each row's cluster-centre value (built in the previous cell) as a new column.
df1['centroid'] = centroids_column

In [10]:
# Absolute gap between each GISJOIN's case total and its cluster centre.
df1['distance'] = (df1['cases'] - df1['centroid']).abs()
df1

Unnamed: 0_level_0,cases,cluster_id,centroid,distance
GISJOIN,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
G0100010,1240,39,1232.417476,7.582524
G0100030,3685,37,3663.722222,21.277778
G0100050,494,47,471.352941,22.647059
G0100070,574,22,627.160338,53.160338
G0100090,1391,29,1446.718750,55.718750
...,...,...,...,...
G5600370,708,22,627.160338,80.839662
G5600390,396,0,330.687679,65.312321
G5600410,365,0,330.687679,34.312321
G5600430,170,49,215.901763,45.901763


In [11]:
# Keep the cluster assignment and distance, and move the index (GISJOIN)
# back into an ordinary column so rows get a plain positional index.
cluster_distance_cols = ["cluster_id", "distance"]
df2 = df1.loc[:, cluster_distance_cols].reset_index()
df2

Unnamed: 0,GISJOIN,cluster_id,distance
0,G0100010,39,7.582524
1,G0100030,37,21.277778
2,G0100050,47,22.647059
3,G0100070,22,53.160338
4,G0100090,29,55.718750
...,...,...,...
3110,G5600370,22,80.839662
3111,G5600390,0,65.312321
3112,G5600410,0,34.312321
3113,G5600430,49,45.901763


### Tag the parent GISJOIN of each cluster (the member closest to its centroid)

In [12]:
# Smallest distance-to-centre per cluster (reused by the next cell).
df2_gr = df2.groupby('cluster_id')['distance'].min().reset_index()

# The "parent" of a cluster is the GISJOIN closest to its centre.
# idxmin() returns the index label of the first row attaining the minimum in
# each group, so ties resolve to the first matching row. This replaces an
# iterrows() loop that re-filtered df2 once per cluster.
closest_row_labels = df2.groupby('cluster_id')['distance'].idxmin()

# One GISJOIN per cluster, ordered by cluster_id.
parents = df2.loc[closest_row_labels, 'GISJOIN'].tolist()

print(len(parents))

56


In [13]:
# Largest distance-to-centre per cluster, as a frame with a positional index.
cluster_max = df2.groupby('cluster_id')['distance'].max().reset_index()

# Per-cluster table: cluster_id, min_distance, max_distance.
# Both frames come from a groupby on cluster_id followed by reset_index(),
# so the column assignment below lines up row for row.
df_clusters = df2_gr.rename(columns={'distance': 'min_distance'})
df_clusters['max_distance'] = cluster_max['distance']

print(df_clusters.shape)
df_clusters.head()

(56, 3)


Unnamed: 0,cluster_id,min_distance,max_distance
0,0,0.312321,70.312321
1,1,310.5,310.5
2,2,0.0,0.0
3,3,10.5,349.5
4,4,264.5,264.5


In [14]:
# Flag the parent rows in a single vectorised pass.
# - The column is rebuilt from `parents` on every run, so re-executing the
#   cell cannot leave stale 1-flags from an earlier parent list.
# - Only `is_parent` is written; the earlier blanket fillna(0) would also have
#   replaced missing values in every other column with 0.
# - Kept as float (0.0 / 1.0) so the values match what the loop-based version
#   produced and downstream output is unchanged.
df2 = df2.assign(is_parent=df2['GISJOIN'].isin(parents).astype(float))
df2.head()

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent
0,G0100010,39,7.582524,0.0
1,G0100030,37,21.277778,0.0
2,G0100050,47,22.647059,0.0
3,G0100070,22,53.160338,0.0
4,G0100090,29,55.71875,0.0


In [15]:
# NOTE: df3 is the same object as df2 (no copy is made), so the column added
# below is visible through both names.
df3 = df2

# Fetch every row's cluster bounds with one keyed lookup per column instead of
# filtering df_clusters (and masking the whole frame) once per row.
cluster_bounds = df_clusters.set_index('cluster_id')
row_min = df3['cluster_id'].map(cluster_bounds['min_distance'])
row_max = df3['cluster_id'].map(cluster_bounds['max_distance'])
row_span = row_max - row_min

# Min-max scale each distance within its own cluster:
#   (distance - min_distance) / (max_distance - min_distance)
# Clusters whose max equals their min have zero span; they get 0 rather than
# a division by zero.
scaled = (df3['distance'] - row_min) / row_span
df3['frac_distance'] = scaled.where(row_span != 0, 0)

In [16]:
# Bounds of the sampling fraction assigned to each row.
sample_min = 0.05
sample_max = 0.25

# Linear interpolation between the two bounds: a frac_distance of 0 maps to
# sample_min and a frac_distance of 1 maps to sample_max.
sample_span = sample_max - sample_min
df3['sample_percent'] = df3['frac_distance'] * sample_span + sample_min
df3

Unnamed: 0,GISJOIN,cluster_id,distance,is_parent,frac_distance,sample_percent
0,G0100010,39,7.582524,0.0,0.046117,0.059223
1,G0100030,37,21.277778,0.0,0.109459,0.071892
2,G0100050,47,22.647059,0.0,0.288432,0.107686
3,G0100070,22,53.160338,0.0,0.611449,0.172290
4,G0100090,29,55.718750,0.0,0.522091,0.154418
...,...,...,...,...,...,...
3110,G5600370,22,80.839662,0.0,0.930779,0.236156
3111,G5600390,0,65.312321,0.0,0.928571,0.235714
3112,G5600410,0,34.312321,0.0,0.485714,0.147143
3113,G5600430,49,45.901763,0.0,0.803571,0.210714


In [17]:
# Write the finished table to CSV. to_csv() is called with its defaults, so
# the row index is written as the first, unnamed column.
df_final = df3
output_path = '~/ucc-21/clusters-covid.csv'
df_final.to_csv(output_path)