In [1]:
import numpy as np
import os
from mpl_toolkits.mplot3d import Axes3D
from matplotlib import pyplot as plt

# Import the data from the Gygi dataset folder.
# NOTE(review): this is an absolute, machine-specific path — consider making
# the data directory configurable.
npzfilename = os.path.join('/home/max/data/GygiAnalysis', 'AnalysisOfGygiDataset.npz')

# 'arr_0' is unwrapped with .item() and then indexed by string keys, so it is
# presumably a 0-d object array wrapping a dict. Object arrays are pickled, and
# NumPy >= 1.16.3 refuses to unpickle them unless allow_pickle=True is given.
# Only load trusted files this way: unpickling can execute arbitrary code.
# The context manager closes the underlying .npz file handle once the payload
# has been extracted.
with np.load(npzfilename, allow_pickle=True) as npzfile:
    PTNdata = npzfile['arr_0'].item()

# Pull out the three relevant value series (Pulse, Tone, Noise).
# They are later indexed with index collections, so they are presumably
# NumPy arrays rather than plain lists — TODO confirm.
xs = PTNdata['P_logsumE_off_FS']
ys = PTNdata['T_logsumE_off_FS']
zs = PTNdata['N_logsumE_off_FS']

# Visualization 1

This is a 3D representation of the classification according to the Gygi dataset. The <span style = "color:green"> **green triangles** </span> indicate the sounds classified as **Pulse sounds**, the <span style = "color:red"> **red circles** </span> indicate the sounds classified as **Noise sounds**, and finally the <span style = "color:blue"> **blue squares** </span> indicate the sounds classified as **Tone sounds**.

In [None]:
# Create a figure holding a single subplot with a 3D projection
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Draw one scatter series per classification, each with its own color and
# marker. PTNdata['typecolors'][color] is used to index the P/T/N series, so
# it presumably holds the indices (or a mask) of the points in that class.
for typecolor, typemarker in [('r', 'o'), ('g', '^'), ('b', 's')]:
    typerange = PTNdata['typecolors'][typecolor]
    ax.scatter(xs[typerange], ys[typerange], zs[typerange],
               c=typecolor, marker=typemarker, s=48)

# Set title and axis labels and show the 3D graph
ax.set_title('3D representation of PTN values of Gigydata')
ax.set_xlabel('Pulse ')
ax.set_ylabel('Tone ')
ax.set_zlabel('Noise ')
plt.show()

# Visualization 2

In this visualization three histograms can be seen. Each represents the number of occurrences of certain values of the <span style = "color:green">Pulse </span>, <span style = "color:blue"> Tone </span> or <span style = "color:red"> Noise </span> datapoints. This representation of the data provides an overview of how each variable is distributed. 

In [None]:
# Create a figure with three vertically stacked histograms sharing one y-axis
# scale, so the bar heights are directly comparable between the subplots.
f, (ax1, ax2, ax3) = plt.subplots(3, sharey=True)

# One histogram per variable; the colors match the classification plots
# (green = pulse, blue = tone, red = noise).
for hist_ax, hist_values, hist_color, hist_name in [
        (ax1, xs, 'g', 'Pulse'),
        (ax2, ys, 'b', 'Tone'),
        (ax3, zs, 'r', 'Noise')]:
    hist_ax.hist(hist_values, color=hist_color)
    hist_ax.set_title('Occurrences of ' + hist_name.lower() + ' amplitudes')
    hist_ax.set_xlabel(hist_name)
    hist_ax.set_ylabel('Occurrences')

# Keep titles and x-labels of the stacked subplots from overlapping
f.tight_layout()

# Show the figure. plt.show() is used instead of f.show() because Figure.show
# only works with GUI backends and emits a warning on the inline notebook one.
plt.show()

# Clustering

In this script the unclustered data is fed into a clustering script based on the k-medoids method, using the Manhattan metric. The input data consists of the Pulse and Tone values of the datapoints. The output of this script is an object containing the clustered data.

In [None]:
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
# script from > Bauckhage C. Numpy/scipy Recipes for Data Science: k-Medoids Clustering[R]. Technical Report, University of Bonn, 2015.
import kmedoids 

# Combine the Pulse and Tone values into one array with a row per datapoint
# (shape (n_points, 2), assuming xs and ys are 1-D — TODO confirm).
data = np.array([xs, ys]).T

# Pairwise Manhattan (L1) distance matrix between all datapoints
D = pairwise_distances(data, metric='manhattan')

# Make the clustering reproducible across "Restart & Run All".
# NOTE(review): this assumes kmedoids.kMedoids draws its initial medoids from
# NumPy's global random state, as the referenced recipe does — verify against
# the local kmedoids.py.
np.random.seed(0)

# Split into 3 clusters. C[i] is used below to index `data`, i.e. it holds the
# row indices of the members of cluster i; M presumably holds the medoids.
M, C = kmedoids.kMedoids(D, 3)

# Split the clustered data object C into three arrays of clustered datapoints
cluster1, cluster2, cluster3 = (data[C[cluster_index]] for cluster_index in range(3))

# Visualization 3

This visualization is a 2D scatter plot of the clustered data. The clusters are indicated by the same markers as in visualization 1. 

In [None]:
# Create a figure with a single 2D subplot
fig = plt.figure()
ax1 = fig.add_subplot(111)

ax1.set_title('k-medoids clusters of Pulse vs. Tone values')
ax1.set_xlabel('Pulse')
ax1.set_ylabel('Tone')

# Add the three clusters with the same color/marker pairs as the
# classification plots (red circle, green triangle, blue square), so the
# figures can be compared side by side. The cluster numbering itself is
# arbitrary: a given color here does not imply the same class there.
for cluster_points, cluster_color, cluster_marker, cluster_label in [
        (cluster1, 'b', 's', 'cluster 1'),
        (cluster2, 'r', 'o', 'cluster 2'),
        (cluster3, 'g', '^', 'cluster 3')]:
    ax1.scatter(cluster_points[:, 0], cluster_points[:, 1],
                c=cluster_color, marker=cluster_marker, label=cluster_label)

ax1.legend(loc='upper left')
plt.show()

# Visualization 4

This visualization can be used to compare the **original classification** of the Gygi dataset with the output of the **clustering algorithm** above, shown in visualization 3. When comparing the two ... (**TODO:** complete this comparison.)

In [None]:
# 16x6 inch figure with a single 2D subplot
f = plt.figure(figsize=(16, 6))
ax = f.add_subplot(1, 1, 1)

# One scatter series per original classification, each with its own
# color/marker pair; PTNdata['typecolors'][color] selects the class members.
class_styles = (('r', 'o'), ('g', '^'), ('b', 's'))
for class_color, class_marker in class_styles:
    members = PTNdata['typecolors'][class_color]
    ax.scatter(xs[members], ys[members],
               c=class_color, marker=class_marker, s=48)

# Annotate every point with the first three characters of its label
for name, px, py in zip(PTNdata['labels'], xs, ys):
    ax.text(px, py, name[:3])

ax.set_xlabel('Pulse ')
ax.set_ylabel('Tone ')
plt.show()