# Cluster drawing to create samples from it
Previous attempts to learn on the entire drawing were unsuccessful, so I decided to move to a new approach.
I.e. I will try to create a cluster of lines from the drawing that will be a learning sample x.
Then we will calculate some distance from the cluster to the dimensions on the drawing to attribute each dimension to a cluster. That attribution will be our Y.

In [1]:
# Start with basic example of unsupervised clustering with unknown number of clusters
# https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py

In [2]:
import numpy as np
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn import metrics

In [3]:
import pymongo
import pandas as pd
from pymongo import MongoClient

import matplotlib.pyplot as plt
%matplotlib inline

import torch

# from processing import *

In [4]:
# Connect to MongoDB and use its `geometry` database.
# NOTE(review): the host address is hard-coded; consider moving it to a config
# cell or an environment variable.
client = MongoClient('mongodb://192.168.1.49:27017')
db = client.geometry

# Distinct FileId values of all documents in `files` flagged Valid.
fileids  = db.files.find({'Valid':True}).distinct('FileId')
print('files count', len(fileids))

# All the data would be too much for now, so only a slice of the ids is used.
# The slice [12:125] keeps up to 113 ids (an earlier version of this comment said 42).
fileids = fileids[12:125]

# Restrict to the entity classes of interest and to the selected files.
query = {
    'ClassName':
    {
        '$in':[
            'AcDbLine',
            #'AcDbPolyline',
            'AcDbText',
            'AcDbRotatedDimension'
        ]
    },
    'FileId':{
        '$in':fileids
    }
}

# Materialise the whole cursor into one DataFrame (one row per returned document).
data = pd.DataFrame(list(db.objects.find(query)))
print('data length', len(data))

files count 322
data length 307920


# Split data coordinates to columns

In [5]:
from processing import Col2Numpy

# Group the rows by file. A scalar key (instead of the one-element list
# ['FileId']) keeps the group keys plain file ids, so get_group(file_id)
# works the same way on older and newer pandas releases.
groupped = data.groupby('FileId')
column_names = ['StartPoint', 'EndPoint', 'XLine1Point', 'XLine2Point']

# Whole point here is that we're keeping index from the original dataset in file_points

# all file ids
file_ids = list(groupped.groups.keys())

# dict[FileId] -> points that Col2Numpy extracts from that file's rows
file_points = {
    file_id: Col2Numpy(groupped.get_group(file_id), column_names)
    for file_id in file_ids
}

num_points = sum(len(pnts) for pnts in file_points.values())
print('Total number of points: ', num_points)

Total number of points:  530780


# Implement and check clustering algorithm

In [6]:
def Clusterize(x, eps = 0.2, min_samples = 6):
    '''
    The very basic clustering with unknown number of clusters (DBSCAN).
    https://scikit-learn.org/stable/auto_examples/cluster/plot_dbscan.html#sphx-glr-auto-examples-cluster-plot-dbscan-py

    Parameters
    ----------
    x : points to cluster, passed unchanged to ``DBSCAN.fit``.
    eps : neighbourhood radius forwarded to DBSCAN.
    min_samples : ``min_samples`` forwarded to DBSCAN (defaults to 6, the
        value that used to be hard-coded here).

    Returns
    -------
    (labels, n_clusters, n_noise, core_sample_indices)
        ``labels`` holds one label per input point, -1 marking noise;
        ``n_clusters`` is the number of distinct labels excluding -1;
        ``n_noise`` is the number of points labelled -1;
        ``core_sample_indices`` is DBSCAN's ``core_sample_indices_``.
    '''
    # algorithm='kd_tree'
    # Named `model` so the local does not shadow the notebook-level Mongo handle `db`.
    model = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=-1).fit(x)
    labels = model.labels_

    # Count noise points once and reuse the count for the cluster total.
    n_noise = int((labels == -1).sum())
    n_clusters = len(set(labels)) - (1 if n_noise else 0)

    return labels, n_clusters, n_noise, model.core_sample_indices_

# Test and display clustering result

In [7]:
from plot_graphics import draw_set
from processing import scale_ds


def AddLabels2Dataset(points_of_one_file, df_to_export_labels, fileid_to_print="", epses=0.2):
    '''
    Cluster the points of one file and store the resulting cluster labels in
    the ``label`` column of ``df_to_export_labels``.

    Parameters
    ----------
    points_of_one_file : table of points for a single file. The first three
        columns are clustered; the remaining columns and the index identify
        the source rows. Presumably a DataFrame produced by Col2Numpy whose
        index holds row positions of ``df_to_export_labels`` - TODO confirm.
    df_to_export_labels : DataFrame with an existing ``label`` column.
        It is modified IN PLACE and also returned.
    fileid_to_print : identifier used only in the progress message.
    epses : one eps value, or a list/tuple of eps values. Every value is
        clustered and reported; the labels of the LAST value are the ones
        written to the dataset.

    Returns
    -------
    ``df_to_export_labels`` with the ``label`` column updated.

    Raises
    ------
    ValueError
        If ``epses`` is an empty list/tuple (there would be no labels to write).
    '''
    result = df_to_export_labels

    # split indexes from coordinates
    # NOTE(review): np.split is kept as-is; the code below relies on the second
    # part still being a DataFrame (it uses .index / .groupby) - confirm this
    # holds on the pandas version in use.
    x1, data_indexes = np.split(points_of_one_file,[3], axis = 1)
    data_indexes = data_indexes.astype('int')

    # scale dataframe in order to clusterize it properly
    x1, scl = scale_ds(x1)

    # epses could be a number or a sequence of numbers; a sequence is handy
    # to visually assess clustering at several radii.
    if isinstance(epses, (list, tuple)):
        eps_values = list(epses)
    else:
        eps_values = [epses]
    if not eps_values:
        raise ValueError("epses must contain at least one eps value")

    for eps in eps_values:
        # perform clustering of points to calculate labels
        labels, n_clusters, n_noise, _ = Clusterize(x1, eps=eps)

        print("fileid:{} points:{} clusters:{} noise pnts:{} eps:{:0.3f} scale:{:0.6f}".format(
              fileid_to_print,
              x1.shape[0],
              n_clusters,
              n_noise,
              eps,
              scl))

    data_indexes['label'] = labels

    # Several points can share one index value; keep the first label seen for each.
    # https://stackoverflow.com/questions/22918212/fastest-way-to-drop-duplicated-index-in-a-pandas-dataframe
    data_indexes = data_indexes.groupby(data_indexes.index).first()

    # Write all labels with a single positional assignment instead of one
    # .iloc write per row; the index values are used as row positions.
    label_col = result.columns.get_loc('label')
    result.iloc[data_indexes.index.to_numpy(), label_col] = data_indexes['label'].to_numpy()

    # Number of rows in the whole dataset that still carry label -1.
    print(len(result.loc[result['label'] == -1]))
    return result

Could not import drawSvg


In [8]:
# Work on an independent copy: pd.DataFrame(data) does not copy a DataFrame by
# default, so adding the 'label' column through it could leak into `data`.
labeled_data = data.copy()

# -1 is the "no cluster" value: it is what Clusterize counts as noise, and
# rows that never receive a label keep it.
labeled_data['label'] = -1

for file_id, _x1 in file_points.items():
    labeled_data = AddLabels2Dataset(
        points_of_one_file=_x1,
        df_to_export_labels=labeled_data,
        fileid_to_print=file_id,
        epses=[0.03]
    )

fileid:032ea8f0-ea69-41f8-9263-ec8b965f9a55 points:1524 clusters:16 noise pnts:51 eps:0.030 scale:0.001698
307178
fileid:033199cc-1ef8-49f5-a92e-58a50ea8b9a8 points:8540 clusters:22 noise pnts:17 eps:0.030 scale:0.001202
302918
fileid:04451949-017f-4fdb-a8d2-6ad06b73dfe7 points:12582 clusters:9 noise pnts:3 eps:0.030 scale:0.000002
296628
fileid:046a75c3-b749-4ab3-930a-b9e8b27924a8 points:1848 clusters:15 noise pnts:15 eps:0.030 scale:0.001196
295711
fileid:046f716f-fbd5-4825-9b19-f4a10e60d544 points:500 clusters:1 noise pnts:0 eps:0.030 scale:0.000002
295461
fileid:048d04a5-31e4-4adf-8e00-8257b3d128f2 points:2266 clusters:7 noise pnts:45 eps:0.030 scale:0.001285
294342
fileid:0511c359-ab90-4395-a866-f0c29a25036f points:5848 clusters:4 noise pnts:0 eps:0.030 scale:0.000169
291418
fileid:05147389-06aa-4db9-bd95-07358472e89c points:454 clusters:1 noise pnts:0 eps:0.030 scale:0.000004
291191
fileid:05d14556-8391-4d92-be3b-501c9d4aee20 points:20792 clusters:22 noise pnts:13 eps:0.030 scale

In [9]:
# Report the dataset size and how many rows still carry label -1.
unlabeled_mask = labeled_data['label'] == -1
print("total data length:", len(labeled_data))
print("unlabeled data length:", len(labeled_data[unlabeled_mask]))

total data length: 307920
unlabeled data length: 44159


In [10]:
from processing import expand_columns

# NOTE(review): pd.DataFrame(labeled_data) does not copy by default, so `df`
# may share data with `labeled_data` until expand_columns returns a new frame.
df = pd.DataFrame(labeled_data)
# expand_columns comes from the project's `processing` module; judging by the
# column names used below it presumably splits each point column into
# '<name>.X', '<name>.Y', '<name>.Z' - TODO confirm.
df = expand_columns(df, column_names)
# Coordinates of line start/end points.
x_columns = ['StartPoint.X', 'StartPoint.Y', 'StartPoint.Z',
             'EndPoint.X', 'EndPoint.Y', 'EndPoint.Z']
# Coordinates of the two XLine points.
y_columns = ['XLine1Point.X', 'XLine1Point.Y','XLine1Point.Z', 
    'XLine2Point.X', 'XLine2Point.Y', 'XLine2Point.Z']
# Defined here but not referenced anywhere else in the visible notebook.
join_index = ['FileId','label']
groupping_columns = [ 'ClassName', 'FileId','label']
dataset_columns = x_columns + y_columns + groupping_columns 

# Keep only the columns that make up the exported dataset.
df = df[dataset_columns]

In [11]:
# Per-file count of non-null values in each column, after dropping the rows labelled -1.
df.drop(df[df["label"] == -1].index).groupby(['FileId']).count()

Unnamed: 0_level_0,StartPoint.X,StartPoint.Y,StartPoint.Z,EndPoint.X,EndPoint.Y,EndPoint.Z,XLine1Point.X,XLine1Point.Y,XLine1Point.Z,XLine2Point.X,XLine2Point.Y,XLine2Point.Z,ClassName,label
FileId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
032ea8f0-ea69-41f8-9263-ec8b965f9a55,741,741,741,741,741,741,1,1,1,1,1,1,742,742
033199cc-1ef8-49f5-a92e-58a50ea8b9a8,4179,4179,4179,4179,4179,4179,81,81,81,81,81,81,4260,4260
04451949-017f-4fdb-a8d2-6ad06b73dfe7,6122,6122,6122,6122,6122,6122,168,168,168,168,168,168,6290,6290
046a75c3-b749-4ab3-930a-b9e8b27924a8,917,917,917,917,917,917,0,0,0,0,0,0,917,917
046f716f-fbd5-4825-9b19-f4a10e60d544,157,157,157,157,157,157,93,93,93,93,93,93,250,250
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25dcd640-97cb-47cf-b93d-6cf9a59eda7a,235,235,235,235,235,235,47,47,47,47,47,47,282,282
2639ef82-e1fc-4da8-b905-12a1466a4213,162,162,162,162,162,162,8,8,8,8,8,8,170,170
2726deaf-21b1-4d10-bf74-d02cf747e18b,1715,1715,1715,1715,1715,1715,3,3,3,3,3,3,1718,1718
27e05dc6-5d43-4a80-91d5-92b06ba953e0,3153,3153,3153,3153,3153,3153,0,0,0,0,0,0,3153,3153


In [12]:
# Persist the labelled dataset; the cells below load it back from this file.
df.to_pickle('test_dataset_cluster_labeled.pickle')

# Test classes to work with data

In [13]:
from torch.utils.data import Dataset, SubsetRandomSampler
from dataset import EntityDataset


In [20]:
# Smoke test: reload the pickle written earlier in this notebook and print the
# x shape and the y value of the first 22 items returned by EntityDataset.
# NOTE(review): the recorded output shows y printed both as a 1x4 nested
# tensor and as a flat 4-element tensor - worth checking in EntityDataset.
d = pd.read_pickle('test_dataset_cluster_labeled.pickle')

for x,y in EntityDataset(d)[:22]:
    print(x.shape,y)
    

torch.Size([52, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([118, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([16, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([105, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([70, 4]) tensor([0.9101, 0.3378, 0.9101, 0.3616])
torch.Size([264, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([11, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([13, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([14, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([38, 4]) tensor([[0., 0., 0., 0.]])
torch.Size([50, 4]) tensor([0.8916, 0.2506, 0.8937, 0.1639])
torch.Size([50, 4]) tensor([0.8937, 0.1639, 0.8918, 0.1726])
torch.Size([50, 4]) tensor([0.8795, 0.1726, 0.8816, 0.1856])
torch.Size([50, 4]) tensor([0.8915, 0.2506, 0.8932, 0.1986])
torch.Size([50, 4]) tensor([0.8809, 0.2289, 0.8917, 0.2419])
torch.Size([50, 4]) tensor([0.8917, 0.2419, 0.8917, 0.2506])
torch.Size([50, 4]) tensor([0.8717, 0.2289, 0.8899, 0.2159])
torch.Size([1193, 4]) tensor([0.3732, 0.7188, 0.3900, 0.7464])
torch.Size([1193, 4]) t

In [15]:
from dataset import DwgDataset

In [16]:
# Build the project's DwgDataset from the pickled frame; the cells below use
# its `entities.data_frame` and `train_loader` attributes.
dwg_dataset = DwgDataset('test_dataset_cluster_labeled.pickle', batch_size = 4)

In [17]:
# Per-file maximum of every column of the frame held by the dataset.
dwg_dataset.entities.data_frame.groupby(['FileId']).max()

Unnamed: 0_level_0,StartPoint.X,StartPoint.Y,StartPoint.Z,EndPoint.X,EndPoint.Y,EndPoint.Z,XLine1Point.X,XLine1Point.Y,XLine1Point.Z,XLine2Point.X,XLine2Point.Y,XLine2Point.Z,ClassName,label
FileId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
032ea8f0-ea69-41f8-9263-ec8b965f9a55,4.817245e+02,4.028817e+02,0.000000e+00,5.890000e+02,4.028817e+02,0.000000e+00,2.787783e+02,1.034826e+02,0.0,2.787783e+02,1.107732e+02,0.0,AcDbRotatedDimension,15
033199cc-1ef8-49f5-a92e-58a50ea8b9a8,7.741505e+02,5.743989e+02,4.897269e+01,7.741505e+02,5.801517e+02,4.897269e+01,7.741505e+02,5.611339e+02,0.0,7.551211e+02,5.611339e+02,0.0,AcDbRotatedDimension,21
04451949-017f-4fdb-a8d2-6ad06b73dfe7,2.196463e+05,5.744830e+04,2.220317e-27,2.196463e+05,5.851009e+04,2.220317e-27,1.291549e+05,6.014069e+03,0.0,1.291649e+05,6.014069e+03,0.0,AcDbRotatedDimension,8
046a75c3-b749-4ab3-930a-b9e8b27924a8,8.159994e+02,5.472760e+02,0.000000e+00,8.359994e+02,5.472760e+02,0.000000e+00,,,,,,,AcDbLine,14
046f716f-fbd5-4825-9b19-f4a10e60d544,5.502873e+05,1.643399e+05,0.000000e+00,5.527224e+05,1.647805e+05,0.000000e+00,5.464979e+05,1.631076e+05,0.0,5.464984e+05,1.631076e+05,0.0,AcDbRotatedDimension,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25dcd640-97cb-47cf-b93d-6cf9a59eda7a,1.726941e+06,1.134777e+06,0.000000e+00,1.726941e+06,1.134777e+06,0.000000e+00,1.726841e+06,1.134237e+06,0.0,1.726941e+06,1.134777e+06,0.0,AcDbRotatedDimension,1
2639ef82-e1fc-4da8-b905-12a1466a4213,5.890000e+02,3.950000e+02,0.000000e+00,4.040000e+02,3.950000e+02,0.000000e+00,2.329783e+02,3.828990e+02,0.0,2.354783e+02,3.828990e+02,0.0,AcDbRotatedDimension,7
2726deaf-21b1-4d10-bf74-d02cf747e18b,5.191892e+03,7.041083e+02,1.687086e-14,5.193026e+03,7.041083e+02,1.687086e-14,4.801235e+03,3.510328e+02,0.0,4.801235e+03,2.910328e+02,0.0,AcDbRotatedDimension,0
27e05dc6-5d43-4a80-91d5-92b06ba953e0,8.283497e+02,5.606607e+02,0.000000e+00,8.283497e+02,5.578466e+02,0.000000e+00,,,,,,,AcDbLine,3


In [18]:
from IPython.display import Image
from IPython.display import clear_output
from plot_graphics import generate_file
    
# File ids whose clusters should be rendered (the original list held the same id twice).
# NOTE(review): this id does not show up among the FileIds in the recorded
# outputs above, which may be why the cell renders nothing - confirm.
files_to_draw = {'006f290c-7280-491b-b7d7-971ed82de1a5'}

g = dwg_dataset.entities.data_frame.groupby(['FileId', 'label'])
for kkk in g.groups:
    # kkk is a (FileId, label) key; skip clusters of files we are not interested in
    if kkk[0] not in files_to_draw:
        continue
    chunk = g.get_group(kkk)
    dr = generate_file(chunk, save_file=False, verbose=False)
    display(dr)

In [19]:
from plot_graphics import draw_sample

# Draw every sample of the first training batches and stop after the first
# batch that brings the number of drawn samples above 3.
# NOTE(review): the recorded run ended with "NameError: name 'draw' is not
# defined"; an earlier cell printed "Could not import drawSvg", so draw_sample
# presumably depends on that optional import - confirm.
j = 0
for x, y in dwg_dataset.train_loader:
    batch_len = len(x)
    for i in range(batch_len):
        display(draw_sample(x[i], y[i]))
    j += batch_len
    if j > 3:
        break

NameError: name 'draw' is not defined