In [15]:
# Put the parent directory on the import path so the local `src` package
# (imported just below) can be resolved from this notebook's folder.
import sys; sys.path.append('../')

from src.data_loader import load_data
import matplotlib.pyplot as plt

from scipy.stats import pearsonr
import numpy as np

# NOTE(review): this silences every warning for the whole session, including
# numerical ones raised by the statistics below; consider narrowing the filter.
import warnings
warnings.filterwarnings('ignore')

In [13]:
# Select the interactive `notebook` backend, so every figure below is rendered
# as a live JavaScript widget instead of a static image.
%matplotlib notebook
# Default size (width, height) applied to every figure created afterwards.
plt.rcParams["figure.figsize"] = (8,5)

# Analysis of the final datasets of clusters

In [3]:
# load_data returns two values; only the first one (the clusters table) is
# used in this notebook, the second is discarded.
clusters, _ = load_data('../data/huge_sample_input_classified.txt')

In [4]:
# Turn the placeholder string 'None' into NaN, then keep only the columns that
# contain no missing value at all (dropna's default is how='any').
clusters_with_nan = clusters.replace('None', np.nan)
data = clusters_with_nan.dropna(axis='columns')

# Integer and floating-point dtypes treated as "numerical" in this analysis.
numerics = [
    'int16', 'int32', 'int64',
    'float16', 'float32', 'float64',
]
numerical_columns = data.select_dtypes(include=numerics).columns

## Some numbers:

- Number of instances: 5958
- Number of features: 55
- Number of usable features (no missing values): 42
- Number of numerical features: 40

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns that survived the missing-value filter.
data.describe()

Unnamed: 0,x,y,z,n_points,n_order,volume,positive_volume,negative_volume,area,confidence,...,coplanararity_index_mean_origin,coplanararity_index_sigma_origin,colinearity_index_mean_origin,colinearity_index_sigma_origin,coplanararity_index_mean_destination,coplanararity_index_sigma_destination,colinearity_index_mean_destination,colinearity_index_sigma_destination,angles_mean,angles_sigma
count,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,...,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0,5958.0
mean,12.682515,229.123232,32.979224,16265.735817,52.319235,0.081702,0.0337,0.048002,0.134139,100.0,...,1.957878,0.292065,0.821941,0.823488,2.085826,0.319902,0.806173,0.796824,0.360787,0.191988
std,36.840365,28.159572,30.995349,9029.966511,154.029769,0.274385,0.138491,0.156565,0.382508,0.0,...,0.980074,0.155084,0.973092,1.598243,0.991183,0.173816,0.948205,1.785389,0.296962,0.146856
min,-56.0026,177.373485,-24.307235,16.0,10.0,0.000627,0.0,0.0,1.8e-05,100.0,...,0.419699,0.022959,0.006282,0.001521,0.509398,0.012059,0.006961,0.003005,0.003637,0.00234
25%,-20.220991,205.727522,6.795726,8518.5,13.0,0.008574,0.001023,0.002968,0.02155,100.0,...,1.236568,0.189283,0.195195,0.099307,1.349461,0.20346,0.205576,0.098899,0.106548,0.053398
50%,12.289346,225.156643,31.244101,16322.5,20.0,0.022135,0.005782,0.011283,0.051703,100.0,...,1.661714,0.261343,0.5269,0.326393,1.777331,0.288994,0.553701,0.340897,0.2939,0.168439
75%,44.828669,250.218369,54.311898,24198.75,40.0,0.060498,0.023518,0.035502,0.120783,100.0,...,2.465741,0.353387,1.080846,0.925946,2.598506,0.395648,1.073101,0.874672,0.546358,0.313778
max,87.307455,320.017,110.874583,31700.0,4702.0,9.045179,5.182637,3.862542,13.3735,100.0,...,6.388965,1.952617,13.73716,40.191173,6.282414,2.04465,19.310895,54.604309,1.532109,0.662483


## Target variable distribution

In [6]:
# Histogram of the class labels, read from the unfiltered clusters table.
clusters['classification'].hist()

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x7f3b22d5dbe0>

As we can see, the dataset is highly imbalanced: more than 95% of the instances do not belong to the classes of interest, Candidate or Spread.

In [7]:
# Absolute Pearson correlation between every pair of numerical columns, as a
# square NumPy array whose rows and columns follow `numerical_columns`.
#
# DataFrame.corr builds the whole matrix in one vectorised call, replacing a
# Python double loop that ran pearsonr once per ordered pair of columns (and
# therefore computed every symmetric entry twice). `data` holds no missing
# values at this point, so the pairwise-complete handling of corr() does not
# change the result. Constant columns still produce NaN entries.
correlations = data[numerical_columns].corr(method='pearson').abs().values



## Pearson correlation coefficient among numerical variables

In [8]:
# Heat map of the correlation matrix: one row and one column per numerical
# feature, labelled with the feature names.
tick_positions = np.arange(len(numerical_columns))

fig, ax = plt.subplots()
im = ax.imshow(correlations, interpolation='nearest')
ax.set_title('Pearson Correlation Coefficient among numeric features')

ax.set_xticks(tick_positions)
ax.set_xticklabels(numerical_columns, rotation='vertical')
ax.set_yticks(tick_positions)
ax.set_yticklabels(numerical_columns)

# Kept as the last expression so the cell still displays the Colorbar object.
fig.colorbar(im, ax=ax)

<IPython.core.display.Javascript object>

<matplotlib.colorbar.Colorbar at 0x7f3a9d6eb5f8>

Several variables are highly correlated, for example the features that are the mean and the standard deviation of the same measurement. Some variables, such as confidence, always take the same value.

## PCA

In [9]:
from sklearn.decomposition import PCA


# Feature matrix for the projection: every numerical column except three that
# are excluded by hand. `confidence` is constant (std 0 in the summary above),
# so it cannot be standardised.
# NOTE(review): the reason for dropping the two texture_code columns is not
# visible here -- confirm whether they are constant or categorical codes.
X = data[numerical_columns].drop(
    ['confidence', 'texture_code_origin', 'texture_code_destination'],
    axis=1
)
# Standardise each feature to zero mean and unit (sample) standard deviation
# so that no single feature dominates the principal components.
X = (X - X.mean()) / X.std()
y = data['classification']

pca = PCA(n_components=2)
transformed_data = pca.fit_transform(X)

# One scatter series per class so that each class gets its own colour and
# legend entry.
fig, ax = plt.subplots()
for classification in y.unique():
    # Convert the pandas boolean Series to a plain array before using it to
    # index the NumPy result of the PCA.
    in_class = (y == classification).values
    ax.scatter(*transformed_data[in_class].T, label=classification)

# Label the figure so it can be read on its own.
ax.set_xlabel('First principal component')
ax.set_ylabel('Second principal component')
ax.set_title('PCA projection of the standardised numerical features')
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f3ae14a3278>

In [14]:
# Axes3D is not referenced by name: importing it is what makes the '3d'
# projection available on older matplotlib releases.
from mpl_toolkits.mplot3d import Axes3D

# Raw x, y, z columns of every cluster, as a plain (n_clusters, 3) array.
X = data[['x', 'y', 'z']].values
y = data['classification']

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
for classification in y.unique():
    # Convert the pandas boolean Series to a plain array before using it to
    # index the NumPy coordinate array.
    in_class = (y == classification).values
    ax.scatter(*X[in_class].T, label=classification)

# Name the axes after the plotted columns so the figure can be read on its own.
ax.set_xlabel('x')
ax.set_ylabel('y')
ax.set_zlabel('z')
ax.set_title('x, y, z of the clusters by classification')
ax.legend()

<IPython.core.display.Javascript object>

<matplotlib.legend.Legend at 0x7f3ade00c160>