In [3]:
import numpy as np
from sklearn.cluster import MeanShift, estimate_bandwidth

In [4]:
from random import randrange
import datetime 


def random_date(start,l):
   current = start
   while l >= 0:
      curr = current + datetime.timedelta(minutes=randrange(3600))
      yield curr
      l-=1



startDate = datetime.datetime(2013, 9, 20,13,00)
epoch = datetime.datetime(2013, 1, 1,00,00)

times = [[(x - epoch).total_seconds(), int(1)] for x in random_date(startDate,100)]
X = np.array(times)

In [6]:
bandwidth = estimate_bandwidth(X, quantile=0.03)

ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
ms.fit(X)
labels = ms.labels_ # array of labels which assigns a cluster to each elt of X
cluster_centers = ms.cluster_centers_ # array of all cluster centers

labels_unique = np.unique(labels) # array of labels that designate a cluster
n_clusters_ = len(labels_unique)

print("number of estimated clusters : %d" % n_clusters_)
print(labels)
assert len(labels) == len(X)

number of estimated clusters : 37
[17 29 13 10  0 16  1  7 27  9 19 18  5  0  7 33  4 14  2 15  4  8  0 23 17
 14 17 26 15 22  8 32 36 12 25  2 20 27  6  0 18  3  7 21  7  1  2 11  0 21
  1  3  9 34  5 13 10  6  1 14 24  4 31 12 35  5 26  9 11 20 19  6  1 28 26
  3 16  8  4 24 13 27 15  5  7  3  1  9 10  8 16  6  3 10 22  0 12 30  0 25
  4]


In [None]:
import matplotlib.pyplot as plt
import matplotlib
from itertools import cycle
matplotlib.rcParams['figure.figsize'] = (20, 20)

plt.figure(1)
plt.clf()

colors = cycle('bgrcmykbgrcmykbgrcmykbgrcmyk')
for k, col in zip(range(n_clusters_), colors):
    my_members = labels == k # array of booleans for 
    cluster_center = cluster_centers[k]
    clustered = np.sort(X[my_members, 0])
    start, stop = datetime.timedelta(seconds=clustered[0]), datetime.timedelta(seconds=clustered[-1])
    try:
        gap = np.diff(clustered).max()
        gap = datetime.timedelta(seconds=gap)
    except:
        gap = stop-start
    print("Cluster {} starts at {} and ends at {}".format(k, epoch+start, epoch+stop))
    print("Duration: {}, comments: {}".format(stop - start, len(clustered)))
    print("Largest gap: {}".format(gap))
    plt.plot(X[my_members, 0], X[my_members, 1], col + '|')
    plt.plot(cluster_center[0], cluster_center[1], 'o', markerfacecolor=col,
             markeredgecolor='k', markersize=4)
plt.title('Estimated number of clusters: %d' % n_clusters_)
plt.show()

In [None]:
np.diff(X[:,0]).max()

In [None]:
ms.labels_

In [None]:
cluster_centers


In [None]:
b = np.array([955920,   956940, 1023240, 1028280, 1044720, 1047660, 1052160,
  1053120, 1073820, 1074060, 1088580, 1089180, 1089600, 1089960,
  1093140, 1105140, 1105380, 1107300, 1119660, 1125600, 1125720,
  1125960, 1126320, 1129380, 1130040, 1131780, 1132320, 1132980,
  1133280, 1133280, 1133640, 1133820, 1133940, 1134960, 1136820,
  1137660, 1138380, 1138680, 1138800, 1139640, 1139820, 1139880,
  1140120, 1140780, 1142340, 1142580, 1142940, 1143180, 1144080,
  1146420, 1147920, 1148640, 1152600, 1155960, 1156620, 1157760,
  1158360, 1159560, 1159800, 1159920, 1160400, 1160460, 1161120,
  1161360, 1162980, 1165320, 1181520, 1187100, 1194840, 1194840,
  1195500, 1209900, 1210860, 1216080, 1216620, 1221540, 1222440,
  1226520, 1227300, 1231440, 1233180, 1235220, 1235820, 1240800,
  1242540, 1247340])

In [None]:
datetime.timedelta(seconds=np.diff(b).max())

In [None]:
list(enumerate(list('abc'), start=1))