## Classification of crowd data points using K Means Clustering Annotations

In this notebook we propose a novel technique to classify head points of data which could be classified into Sparse, Medium or Dense using K Means Algorithm.
Initially labeled data is used to generate a data set (exported in a csv file for convenience) which is then used to generate a decision tree.

Setting source data directory, along with classes and other variables

In [4]:
source_dir = "./data_subset"
classes = ["dense", "medium", "sparse"]

Handling imports

In [37]:
import numpy as np
import pandas as pd
from mat4py import loadmat
from sklearn.cluster import KMeans
import os
import math

Defining methods to handle clustering logic

In [10]:
K_CONSTANT = 5

In [52]:
# imgnum, ab, ac, ad, ae, bc, bd, be, cd, ce, de, target
IMG_NUM = []
SOURCE_DATASET = []
AB = []
AC = []
AD = []
AE = []
BC = []
BD = []
BE = []
CD = []
CE = []
DE = []
TARGET = []

In [53]:
# reads mat file and returns formatted list of points in it
def read_pts(file):
    data = loadmat(file)
    pts = []
    for loc in data['image_info']['location']:
        pts.append((int(loc[0]), int(loc[1])))
    return pts

# creates clusters of pts list
def make_clusters(pts):
    est = KMeans(K_CONSTANT)
    est.fit(pts)
    y_kmeans = est.predict(pts)
    # cluster list
    cluster_list = [[], [], [], [], []]
    for index in range(len(pts)):
        cluster_list[y_kmeans[index]].append(pts[index])
    return cluster_list

# returns dist between 2 points
def pt_dist(p1, p2):
    xx = p1[0] - p2[0]
    yy = p1[1] - p2[1]
    return math.sqrt(xx*xx + yy*yy)

# returns min dist between cluster 1 and cluster 2
def min_dist(c1, c2):
    min = 9999999
    for p1 in c1:
        for p2 in c2:
            d = pt_dist(p1, p2)
            if min > d:
                min = d
    return min

def handle_class(classname, source_dataset):
    files = os.listdir(source_dir + "/" + classname)
    for file in files:
        if file.endswith(".mat"):
            imgname = str(file)[7:-4]
            IMG_NUM.append(imgname)
            SOURCE_DATASET.append(source_dataset)
            TARGET.append(classname)
            all_clusters = make_clusters(read_pts(source_dir + "/" + classname + "/" + file))
            AB.append(min_dist(all_clusters[0], all_clusters[1]))
            AC.append(min_dist(all_clusters[0], all_clusters[2]))
            AD.append(min_dist(all_clusters[0], all_clusters[3]))
            AE.append(min_dist(all_clusters[0], all_clusters[4]))
            BC.append(min_dist(all_clusters[1], all_clusters[2]))
            BD.append(min_dist(all_clusters[1], all_clusters[3]))
            BE.append(min_dist(all_clusters[1], all_clusters[4]))
            CD.append(min_dist(all_clusters[2], all_clusters[3]))
            CE.append(min_dist(all_clusters[2], all_clusters[4]))
            DE.append(min_dist(all_clusters[3], all_clusters[4]))


handle_class(classes[0], "A")
handle_class(classes[1], "B")
handle_class(classes[2], "B")

dataset = pd.DataFrame()
dataset["IMG_NUM"] = IMG_NUM
dataset["SOURCE_DATASET"] = SOURCE_DATASET
dataset["AB"] = AB
dataset["AC"] = AC
dataset["AD"] = AD
dataset["AE"] = AE
dataset["BC"] = BC
dataset["BD"] = BD
dataset["BE"] = BE
dataset["CD"] = CD
dataset["CE"] = CE
dataset["DE"] = DE
dataset["TARGET"] = TARGET
display(dataset)

Unnamed: 0,IMG_NUM,SOURCE_DATASET,AB,AC,AD,AE,BC,BD,BE,CD,CE,DE,TARGET
0,28,A,133.454112,7.071068,105.095195,10.0,154.003247,88.617154,5.830952,9.055385,7.071068,11.661904,dense
1,42,A,62.801274,84.504438,42.059482,31.622777,152.947703,339.676317,59.3043,57.45433,385.149322,336.154726,dense
2,14,A,41.629317,29.732137,31.622777,26.172505,40.607881,35.22783,287.14108,215.520301,33.136083,368.98916,dense
3,21,A,41.868843,63.600314,69.46222,285.91782,35.22783,293.586444,368.110038,42.544095,22.472205,38.013156,dense
4,19,A,330.15148,3.162278,6.0,164.003049,169.002959,510.553621,4.472136,187.024063,4.123106,349.322201,dense
5,43,A,216.92395,11.045361,15.231546,278.145645,431.885401,22.847319,44.407207,252.097203,516.468779,32.388269,dense
6,23,A,26.925824,49.929951,62.008064,34.058773,15.811388,205.847031,241.281993,49.040799,250.337772,101.118742,dense
7,37,A,47.853944,19.104973,38.078866,18.35756,40.311289,35.902646,250.059993,158.902486,280.349782,49.193496,dense
8,30,A,225.0,6.0,502.48184,356.54593,15.0,8.485281,26.248809,265.318676,51.351728,34.928498,dense
9,34,A,59.413803,19.235384,17.262677,400.880281,261.354166,27.018512,13.0,6.324555,515.427978,266.31748,dense


In [54]:
dataset.to_csv("exported.csv")