## Classification of crowd data points using K Means Clustering Annotations

In this notebook we propose a novel technique to classify head points of data which could be classified into Sparse, Medium or Dense using K Means Algorithm.
Initially labeled data is used to generate a data set (exported in a csv file for convenience) which is then used to generate a decision tree.

Setting source data directory, along with classes and other variables

In [7]:
source_dir = "./data_subset"
classes = ["dense", "medium", "sparse"]

Handling imports

In [8]:
import numpy as np
import pandas as pd
from mat4py import loadmat
from sklearn.cluster import KMeans
import os
import math

Defining methods to handle clustering logic

In [9]:
K_CONSTANT = 5

In [10]:
# imgnum, ab, ac, ad, ae, bc, bd, be, cd, ce, de, target
IMG_NUM = []
SOURCE_DATASET = []
AB = []
AC = []
AD = []
AE = []
BC = []
BD = []
BE = []
CD = []
CE = []
DE = []
TARGET = []

In [11]:
# reads mat file and returns formatted list of points in it
def read_pts(file):
    data = loadmat(file)
    pts = []
    for loc in data['image_info']['location']:
        pts.append((int(loc[0]), int(loc[1])))
    return pts

# creates clusters of pts list
def make_clusters(pts):
    est = KMeans(K_CONSTANT)
    est.fit(pts)
    y_kmeans = est.predict(pts)
    # cluster list
    cluster_list = [[], [], [], [], []]
    for index in range(len(pts)):
        cluster_list[y_kmeans[index]].append(pts[index])
    return cluster_list

# returns dist between 2 points
def pt_dist(p1, p2):
    xx = p1[0] - p2[0]
    yy = p1[1] - p2[1]
    return math.sqrt(xx*xx + yy*yy)

# returns min dist between cluster 1 and cluster 2
def min_dist(c1, c2):
    min = 9999999
    for p1 in c1:
        for p2 in c2:
            d = pt_dist(p1, p2)
            if min > d:
                min = d
    return min

def handle_class(classname, source_dataset):
    files = os.listdir(source_dir + "/" + classname)
    for file in files:
        if file.endswith(".mat"):
            imgname = str(file)[7:-4]
            IMG_NUM.append(imgname)
            SOURCE_DATASET.append(source_dataset)
            TARGET.append(classname)
            all_clusters = make_clusters(read_pts(source_dir + "/" + classname + "/" + file))
            AB.append(min_dist(all_clusters[0], all_clusters[1]))
            AC.append(min_dist(all_clusters[0], all_clusters[2]))
            AD.append(min_dist(all_clusters[0], all_clusters[3]))
            AE.append(min_dist(all_clusters[0], all_clusters[4]))
            BC.append(min_dist(all_clusters[1], all_clusters[2]))
            BD.append(min_dist(all_clusters[1], all_clusters[3]))
            BE.append(min_dist(all_clusters[1], all_clusters[4]))
            CD.append(min_dist(all_clusters[2], all_clusters[3]))
            CE.append(min_dist(all_clusters[2], all_clusters[4]))
            DE.append(min_dist(all_clusters[3], all_clusters[4]))


handle_class(classes[0], "A")
handle_class(classes[1], "B")
handle_class(classes[2], "B")

dataset = pd.DataFrame()
dataset["IMG_NUM"] = IMG_NUM
dataset["SOURCE_DATASET"] = SOURCE_DATASET
dataset["AB"] = AB
dataset["AC"] = AC
dataset["AD"] = AD
dataset["AE"] = AE
dataset["BC"] = BC
dataset["BD"] = BD
dataset["BE"] = BE
dataset["CD"] = CD
dataset["CE"] = CE
dataset["DE"] = DE
dataset["TARGET"] = TARGET
display(dataset)

Unnamed: 0,IMG_NUM,SOURCE_DATASET,AB,AC,AD,AE,BC,BD,BE,CD,CE,DE,TARGET
0,28,A,226.2145,6.403124,174.287119,12.041595,86.815897,9.219544,6.324555,9.486833,7.071068,8.485281,dense
1,42,A,339.676317,62.801274,152.947703,70.710678,42.059482,57.45433,354.407957,84.504438,49.091751,385.149322,dense
2,341,A,71.168813,322.076078,307.483333,52.239832,234.326695,557.288076,306.778422,542.86094,486.864458,43.680659,dense
3,378,A,302.828334,21.633308,243.895469,37.48333,550.905618,728.715308,21.095023,294.183616,283.44488,493.852205,dense
4,14,A,26.172505,41.10961,287.14108,368.98916,29.732137,41.629317,31.622777,40.607881,215.520301,25.495098,dense
5,21,A,344.013081,41.868843,17.691806,295.854694,228.319951,23.769729,38.013156,63.600314,69.46222,47.707442,dense
6,275,A,464.474972,705.01773,223.18154,866.810821,15.811388,25.0,249.777901,260.37665,273.248971,494.417839,dense
7,395,A,521.92145,72.138755,272.442655,171.283391,377.690349,779.133493,41.303753,444.567205,59.908263,494.725176,dense
8,19,A,171.819673,5.385165,184.010869,2.828427,331.360831,513.658447,5.385165,18.11077,162.520768,348.0,dense
9,43,A,516.468779,22.847319,278.145645,17.029386,252.097203,11.045361,431.885401,15.231546,26.925824,216.92395,dense


In [12]:
dataset.to_csv("exported.csv")