In [1]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

In [28]:
from random import randrange
import datetime 


def random_date(start,l):
   current = start
   while l >= 0:
      curr = current + datetime.timedelta(minutes=randrange(3600))
      yield curr
      l-=1



startDate = datetime.datetime(2013, 9, 20,13,00)
epoch = datetime.datetime(2013, 1, 1,00,00)

times = [[(x - epoch).total_seconds(), int(1)] for x in random_date(startDate,100)]
X = np.array(times)
print(min(times)[0], max(times)[0])

22687080.0 22899180.0


In [5]:
bandwidth = estimate_bandwidth(X, quantile=0.03)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_ # array of labels which assigns a cluster to each elt of X
cluster_centers = ms.cluster_centers_ # array of all cluster centers

labels_unique = np.unique(labels) # array of labels that designate a cluster
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
print(labels)
assert len(labels) == len(X)

number of estimated clusters : 35
[14  7 27  4 10 16  0 10  5  9  1 12 11  2 11 13 15 20 23 24 18 15 34  0 31
  8 17  1  4 24 19  0  2  1  4  2  6 30 22  7  4  3 16  0 23 12  7 13 12  2
  0  5 26 27  5  5  3  4 17 10  7 14  5 11 32 20 19 21 33 26  9  5 21 16 13
  6  1  8 25  0 29 12 22 25  1 14  6  3  2  8 15  9  4  6 14  0 17  1 28  8
 18]


In [4]:
import matplotlib.pyplot as plt
import matplotlib
from itertools import cycle
matplotlib.rcParams['figure.figsize'] = (20, 20)

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k # array of booleans for 
    cluster_center = cluster_centers[k]
    clustered = np.sort(X[my_members, 0])
    start, stop = datetime.timedelta(seconds=clustered[0]), datetime.timedelta(seconds=clustered[-1])
    try:
        gap = np.diff(clustered).max()
        gap = datetime.timedelta(seconds=gap)
    except:
        gap = stop-start
    print("Cluster {} starts at {} and ends at {}".format(k, epoch+start, epoch+stop))
    print("Duration: {}, comments: {}".format(stop - start, len(clustered)))
    print("Largest gap: {}".format(gap))
    plt.plot(X[my_members, 0], X[my_members, 1], col + '|')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=4)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

Cluster 0 starts at 2013-09-21 18:03:00 and ends at 2013-09-21 18:50:00
Duration: 0:47:00, comments: 7
Largest gap: 0:13:00
Cluster 1 starts at 2013-09-20 18:30:00 and ends at 2013-09-20 19:29:00
Duration: 0:59:00, comments: 6
Largest gap: 0:23:00
Cluster 2 starts at 2013-09-21 15:50:00 and ends at 2013-09-21 16:41:00
Duration: 0:51:00, comments: 5
Largest gap: 0:23:00
Cluster 3 starts at 2013-09-22 17:27:00 and ends at 2013-09-22 18:16:00
Duration: 0:49:00, comments: 5
Largest gap: 0:26:00
Cluster 4 starts at 2013-09-21 22:49:00 and ends at 2013-09-22 00:00:00
Duration: 1:11:00, comments: 6
Largest gap: 0:29:00
Cluster 5 starts at 2013-09-22 20:31:00 and ends at 2013-09-22 21:20:00
Duration: 0:49:00, comments: 5
Largest gap: 0:34:00
Cluster 6 starts at 2013-09-20 15:26:00 and ends at 2013-09-20 16:43:00
Duration: 1:17:00, comments: 6
Largest gap: 0:33:00
Cluster 7 starts at 2013-09-21 09:15:00 and ends at 2013-09-21 09:24:00
Duration: 0:09:00, comments: 3
Largest gap: 0:07:00
Cluster 

In [5]:
np.diff(X[:,0]).max()

191400.0

In [6]:
ms.labels_

array([25,  8,  8,  0, 24,  7, 15,  6, 18,  6, 15,  0, 22,  1, 12,  0, 23,
       11,  5, 11,  4,  9, 30, 17, 22, 34, 27, 12, 28, 26,  6,  8, 10,  1,
       32,  7,  2,  3,  4,  2, 13, 21,  1,  2, 19,  3,  3,  5, 25, 19,  7,
       20,  9, 24,  0, 19, 10, 23,  1, 13,  0,  3,  5,  5, 26,  6,  6, 14,
        6, 14, 10,  0,  4,  3, 31,  2,  4, 16,  1,  0, 17, 35, 16, 13, 11,
       33,  5,  2,  1, 14,  4, 21, 15, 16, 20,  9, 18, 17, 12, 29,  4])

In [7]:
cluster_centers


array([[  2.27894914e+07,   1.00000000e+00],
       [  2.27050400e+07,   1.00000000e+00],
       [  2.27816880e+07,   1.00000000e+00],
       [  2.28728040e+07,   1.00000000e+00],
       [  2.28084720e+07,   1.00000000e+00],
       [  2.28851160e+07,   1.00000000e+00],
       [  2.26957920e+07,   1.00000000e+00],
       [  2.27568200e+07,   1.00000000e+00],
       [  2.28244200e+07,   1.00000000e+00],
       [  2.28695400e+07,   1.00000000e+00],
       [  2.27211400e+07,   1.00000000e+00],
       [  2.27949600e+07,   1.00000000e+00],
       [  2.27261800e+07,   1.00000000e+00],
       [  2.27106000e+07,   1.00000000e+00],
       [  2.28424600e+07,   1.00000000e+00],
       [  2.27428200e+07,   1.00000000e+00],
       [  2.27169600e+07,   1.00000000e+00],
       [  2.27749500e+07,   1.00000000e+00],
       [  2.27698200e+07,   1.00000000e+00],
       [  2.28482700e+07,   1.00000000e+00],
       [  2.27000100e+07,   1.00000000e+00],
       [  2.27375400e+07,   1.00000000e+00],
       [  

In [8]:
b = np.array([955920,   956940, 1023240, 1028280, 1044720, 1047660, 1052160,
  1053120, 1073820, 1074060, 1088580, 1089180, 1089600, 1089960,
  1093140, 1105140, 1105380, 1107300, 1119660, 1125600, 1125720,
  1125960, 1126320, 1129380, 1130040, 1131780, 1132320, 1132980,
  1133280, 1133280, 1133640, 1133820, 1133940, 1134960, 1136820,
  1137660, 1138380, 1138680, 1138800, 1139640, 1139820, 1139880,
  1140120, 1140780, 1142340, 1142580, 1142940, 1143180, 1144080,
  1146420, 1147920, 1148640, 1152600, 1155960, 1156620, 1157760,
  1158360, 1159560, 1159800, 1159920, 1160400, 1160460, 1161120,
  1161360, 1162980, 1165320, 1181520, 1187100, 1194840, 1194840,
  1195500, 1209900, 1210860, 1216080, 1216620, 1221540, 1222440,
  1226520, 1227300, 1231440, 1233180, 1235220, 1235820, 1240800,
  1242540, 1247340])

In [9]:
datetime.timedelta(seconds=np.diff(b).max())

TypeError: unsupported type for timedelta seconds component: numpy.int64

In [None]:
list(enumerate(list('abc'), start=1))

In [101]:
from scipy.stats import norm
from sklearn.neighbors import KernelDensity
import matplotlib.pyplot as plt

In [102]:
startDate = datetime.datetime(2013, 9, 20,13,00)
epoch = datetime.datetime(2013, 1, 1,00,00)

times = [(x - epoch).total_seconds() for x in random_date(startDate,100)]
X = np.array(times)
a, b = int(min(times)), int(max(times))

In [103]:
#----------------------------------------------------------------------
# Plot the progression of histograms to kernels
#X = np.concatenate((np.random.normal(0, 1, 0.3 * N),
#                    np.random.normal(5, 1, 0.7 * N)))[:, np.newaxis]
X = X[:, np.newaxis]
X_plot = np.linspace(a, b, 5000)[:, np.newaxis]
bins = np.linspace(a, b, 50)

In [104]:
fig, ax = plt.subplots()
kde = KernelDensity(kernel='gaussian', bandwidth=5).fit(X)
log_dens = kde.score_samples(X_plot)
ax.fill(X_plot[:, 0], np.exp(log_dens), fc='#AAAAFF')
plt.show()

In [105]:
log_dens

array([  -7.14349696,  -44.40030027, -156.1707102 , ..., -156.1707102 ,
        -44.40030027,   -7.14349696])

In [108]:
from scipy.cluster.hierarchy import cophenet, dendrogram, linkage
from scipy.spatial.distance import pdist

In [110]:
Z = linkage(X, method='ward', metric='euclidean')
c, _ = cophenet(Z, pdist(X))
print("Cophenetic Correlation Coefficient with {}: {}".format('ward', c))

Cophenetic Correlation Coefficient with ward: 0.7660745582100776


In [111]:
Z

array([[  1.30000000e+01,   6.40000000e+01,   0.00000000e+00,
          2.00000000e+00],
       [  0.00000000e+00,   2.20000000e+01,   6.00000000e+01,
          2.00000000e+00],
       [  4.00000000e+00,   4.00000000e+01,   6.00000000e+01,
          2.00000000e+00],
       [  3.50000000e+01,   6.10000000e+01,   6.00000000e+01,
          2.00000000e+00],
       [  2.00000000e+00,   4.90000000e+01,   1.20000000e+02,
          2.00000000e+00],
       [  5.00000000e+00,   2.80000000e+01,   1.20000000e+02,
          2.00000000e+00],
       [  6.00000000e+00,   2.40000000e+01,   1.20000000e+02,
          2.00000000e+00],
       [  1.90000000e+01,   8.20000000e+01,   1.20000000e+02,
          2.00000000e+00],
       [  5.20000000e+01,   7.90000000e+01,   1.20000000e+02,
          2.00000000e+00],
       [  5.40000000e+01,   9.10000000e+01,   2.40000000e+02,
          2.00000000e+00],
       [  7.70000000e+01,   1.09000000e+02,   2.77128129e+02,
          3.00000000e+00],
       [  8.00000000e

In [112]:
ddata = dendrogram(Z, color_threshold=.07,
                              no_plot=False)