# Cluster drawing to create samples from it
Previous attempts to learn on the entire drawing were unsuccessful, so I decided to move to a new approach.
I.e. I will try to create clusters of lines from the drawing; each cluster will be a learning sample X.
Then we will calculate some distance from each cluster to the dimensions on the drawing in order to attribute each dimension to a cluster. That attribution will be our Y.

In [1]:
# Start with basic example of unsupervised clustering with unknown number of clusters
# https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py

In [2]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [3]:
import pymongo
import pandas as pd
from pymongo import MongoClient

import matplotlib.pyplot as plt
%matplotlib inline

import torch

# from processing import *

In [4]:
# Connect to the MongoDB instance and select the `geometry` database.
client = MongoClient('mongodb://10.0.4.26:27017')
db = client.geometry

# Distinct ids of all files flagged as valid.
fileids = db.files.find({'Valid': True}).distinct('FileId')
print('files count', len(fileids))

# All the data would be too much for now, so keep only the first 300 file ids.
fileids = fileids[:300]

# Entity classes to load ('AcDbPolyline' is currently disabled).
entity_classes = ['AcDbLine', 'AcDbText', 'AcDbRotatedDimension']
query = {
    'ClassName': {'$in': entity_classes},
    'FileId': {'$in': fileids},
}

# Pull every matching document into a DataFrame.
data = pd.DataFrame(list(db.objects.find(query)))
print('data length', len(data))

files count 322
data length 877347


# Split data coordinates to columns

In [5]:
from processing import Col2Numpy

# Group the loaded entities by the file they belong to.
groupped = data.groupby(['FileId'])
column_names = ['StartPoint', 'EndPoint', 'XLine1Point', 'XLine2Point']

# All file ids present in the loaded data.
file_ids = list(groupped.groups.keys())

# FileId -> points of that file, as returned by Col2Numpy.
# The whole point here is that the index of the original dataset is kept
# together with the points.
file_points = {
    file_id: Col2Numpy(groupped.get_group(file_id), column_names)
    for file_id in file_ids
}

num_points = sum(len(points) for points in file_points.values())
print('Total number of points: ', num_points)

Total number of points:  1570386


# Implement and check clustering algorithm

In [6]:
def Clusterize(x, eps=0.2, min_samples=6):
    '''
    Cluster samples with DBSCAN when the number of clusters is unknown.

    Based on
    https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py

    x: samples, passed unchanged to DBSCAN.fit
    eps: value forwarded as DBSCAN's `eps`
    min_samples: value forwarded as DBSCAN's `min_samples`
        (defaults to 6, the value that used to be hard-coded here)

    Returns a tuple (labels, n_clusters, n_noise, core_sample_indices):
        labels - one label per sample, -1 is counted as noise
        n_clusters - number of distinct labels, not counting -1
        n_noise - number of samples labelled -1
        core_sample_indices - `core_sample_indices_` of the fitted model
    '''
    # Called `model` (not `db`) so it cannot be confused with the
    # module-level Mongo database handle of the same name.
    model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit(x)
    labels = model.labels_

    # Label -1 is the noise bucket, so it is not a cluster of its own.
    cluster_ids = set(labels)
    cluster_ids.discard(-1)
    n_clusters = len(cluster_ids)
    n_noise = int(np.count_nonzero(labels == -1))

    return labels, n_clusters, n_noise, model.core_sample_indices_

# Test and display clustering result

In [7]:
from plot_graphics import draw_set
from processing import scale_ds


def AddLabels2Dataset(points_of_one_file, df_to_export_labels, fileid_to_print="", epses=0.2):
    '''
    Cluster the points of one file and write the resulting cluster labels
    into the 'label' column of `df_to_export_labels`.

    points_of_one_file: points of a single file. The first three columns are
        used as coordinates for clustering, the remaining columns are carried
        along as integers. The code below uses column assignment and
        `groupby` on the split result, so this is presumably a pandas
        DataFrame whose index refers to rows of `df_to_export_labels`
        - TODO confirm against Col2Numpy.
    df_to_export_labels: DataFrame with an existing 'label' column. It is
        modified IN PLACE and also returned. Index values of the points are
        used as row POSITIONS (iloc), which assumes a default 0..n-1 index
        - TODO confirm.
    fileid_to_print: only used in the progress message.
    epses: a single eps value, or a list/tuple of eps values. Every value is
        clustered and reported (handy to compare settings visually), but only
        the labels of the LAST value are written to the dataset.

    Returns `df_to_export_labels` with the labels filled in.
    Raises ValueError if `epses` is an empty list/tuple.
    '''
    result = df_to_export_labels

    # split indexes from coordinates
    x1, data_indexes = np.split(points_of_one_file, [3], axis=1)
    data_indexes = data_indexes.astype('int')

    # scale the coordinates in order to clusterize them properly
    x1, scl = scale_ds(x1)

    # epses could be a number or a sequence of numbers
    if isinstance(epses, (list, tuple)):
        eps_values = list(epses)
    else:
        eps_values = [epses]
    if not eps_values:
        # Without this guard `labels` below would be undefined (NameError).
        raise ValueError("epses must contain at least one eps value")

    for eps in eps_values:
        # perform clustering of points to calculate labels
        labels, n_clusters, n_noise, _ = Clusterize(x1, eps=eps)

        print("fileid:{} points:{} clusters:{} noise pnts:{} eps:{:0.3f} scale:{:0.6f}".format(
              fileid_to_print,
              x1.shape[0],
              n_clusters,
              n_noise,
              eps,
              scl))

    data_indexes['label'] = labels

    # Several points can share one index value; keep the first label seen
    # for each index value.
    # https://stackoverflow.com/questions/22918212/fastest-way-to-drop-duplicated-index-in-a-pandas-dataframe
    data_indexes = data_indexes.groupby(data_indexes.index).first()

    # Write all labels in one positional assignment instead of row by row.
    # https://stackoverflow.com/questions/13021654/get-column-index-from-column-name-in-python-pandas
    label_col = result.columns.get_loc('label')
    result.iloc[data_indexes.index.values, label_col] = data_indexes['label'].values

    # Number of rows of the whole dataset that are still labelled -1.
    print(len(result.loc[result['label'] == -1]))
    return result

Could not import drawSvg


In [8]:
# Column names of the exported dataset: X/Y/Z coordinates of the start/end
# points (x_columns) and of the XLine1/XLine2 points (y_columns), followed by
# the columns used for grouping.
_AXES = ('X', 'Y', 'Z')

x_columns = [
    '{}.{}'.format(point, axis)
    for point in ('StartPoint', 'EndPoint')
    for axis in _AXES
]
y_columns = [
    '{}.{}'.format(point, axis)
    for point in ('XLine1Point', 'XLine2Point')
    for axis in _AXES
]
join_index = ['FileId', 'label']
groupping_columns = ['ClassName'] + join_index
dataset_columns = x_columns + y_columns + groupping_columns


In [None]:
from processing import expand_columns

# Every row starts out as unlabeled (-1); AddLabels2Dataset then fills in
# the labels one file at a time.
labeled_data = pd.DataFrame(data)
labeled_data['label'] = -1

i = 0  # number of files processed so far (skipped files are not counted)
for file_id, _x1 in file_points.items():
    if len(_x1) <= 20000:
        labeled_data = AddLabels2Dataset(
            _x1,
            labeled_data,
            fileid_to_print=file_id,
            epses=[0.03],
        )

        i += 1
        # Write a checkpoint of the dataset after every 10 processed files.
        if i % 10 == 0:
            df = expand_columns(pd.DataFrame(labeled_data), column_names)[dataset_columns]
            df.to_pickle('test_dataset_cluster_labeled.pickle')
    else:
        # Files with more than 20000 points are skipped.
        print(file_id, " len>20000")
        

fileid:006cb2dd-6f18-4203-8b70-fc865c08105a points:790 clusters:6 noise pnts:40 eps:0.030 scale:0.001698
876968
fileid:006f290c-7280-491b-b7d7-971ed82de1a5 points:662 clusters:5 noise pnts:52 eps:0.030 scale:0.001698
876662
fileid:0100c16b-c7c9-4265-92b9-950bf5d2c9ec points:362 clusters:7 noise pnts:36 eps:0.030 scale:0.001698
876496
fileid:013edfac-9865-4422-a5f2-3220d142116d points:2130 clusters:32 noise pnts:137 eps:0.030 scale:0.002410
875493
fileid:01ea3780-b297-4075-8a26-77461e995b7d points:7114 clusters:2 noise pnts:0 eps:0.030 scale:0.000004
871936
fileid:01f9cbe6-c14f-4dc0-817e-4d80380d4e26 points:2558 clusters:18 noise pnts:44 eps:0.030 scale:0.001197
870674
fileid:02401882-a7cc-47ad-b8c2-c38eb9be323b points:1534 clusters:12 noise pnts:36 eps:0.030 scale:0.001698
869926
fileid:024be99c-0302-43b3-92b3-aa0aba5a376a points:1416 clusters:12 noise pnts:26 eps:0.030 scale:0.001196
869232
fileid:02580ab5-a831-4845-a46a-874a6fce6230 points:1008 clusters:11 noise pnts:15 eps:0.030 sca

In [None]:
# Report how many rows exist in total and how many still carry label -1.
unlabeled_rows = labeled_data.loc[labeled_data['label'] == -1]
print("total data length:", len(labeled_data))
print("unlabeled data length:", len(unlabeled_rows))

In [None]:
# Run expand_columns over the point columns, then keep only the columns
# listed in dataset_columns.
df = expand_columns(pd.DataFrame(labeled_data), column_names)[dataset_columns]

In [None]:
# Per-file counts after dropping every index value that has label -1.
noise_index = df.index[df["label"] == -1]
df.drop(noise_index).groupby(['FileId']).count()

In [None]:
# Save the prepared dataset to disk; the cells below load it from this file.
df.to_pickle('test_dataset_cluster_labeled.pickle')

# Test classes to work with data

In [None]:
from torch.utils.data import Dataset, SubsetRandomSampler
from dataset import EntityDataset


In [None]:
# Load the saved dataset back from disk.
d = pd.read_pickle('test_dataset_cluster_labeled.pickle')

# Smoke test of EntityDataset: print x's shape and y for whatever the
# [:22] slice yields.
entity_dataset = EntityDataset(d)
for x, y in entity_dataset[:22]:
    print(x.shape, y)
    

In [None]:
from dataset import DwgDataset

In [None]:
# Build the DwgDataset from the saved pickle, with a batch size of 4.
dwg_dataset = DwgDataset('test_dataset_cluster_labeled.pickle', batch_size = 4)

In [None]:
# Per-file maximum of every column of the dataset's underlying data frame
# (the notebook displays the result of this expression).
dwg_dataset.entities.data_frame.groupby(['FileId']).max()

In [None]:
from IPython.display import Image
from IPython.display import clear_output
from plot_graphics import generate_file
    
# Render every (FileId, label) cluster that belongs to the selected file(s).
selected_file_ids = {'006f290c-7280-491b-b7d7-971ed82de1a5'}

g = dwg_dataset.entities.data_frame.groupby(['FileId', 'label'])
for group_key in list(g.groups.keys()):
    if group_key[0] not in selected_file_ids:
        continue
    drawing = generate_file(g.get_group(group_key), save_file=False, verbose=False)
    display(drawing)

In [None]:
from plot_graphics import draw_sample

# Draw training samples batch by batch; stop once more than 3 samples have
# been drawn (the check runs after a whole batch has been displayed).
j = 0
for x, y in dwg_dataset.train_loader:
    for sample, target in zip(x, y):
        display(draw_sample(sample, target))
        j += 1
    if j > 3:
        break