There was a mistake in how I recorded my computations last time (in the EDA file). So, I am going to redo the computations here and then carry out the final analysis.

# 0 Import Libraries

In [1]:
import numpy as np
import pandas as pd

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics import silhouette_score
from sklearn.metrics import davies_bouldin_score
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition

# 1 Standardization and feature reduction 

In [2]:
# Load the cleaned delay-events table from the working directory.
# The whole frame is later handed to preprocessing.scale inside get_sample,
# so every column is presumably numeric already -- TODO confirm.
DE = pd.read_csv('cleaned_delay_events.csv')

In [3]:
def get_sample(dataset, frac=0.1, random_state=None):
    """Sample `dataset`, standardize it and reduce it with PCA, ready for clustering.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to draw rows from. It is passed whole to ``preprocessing.scale``.
    frac : float, default 0.1
        Fraction of rows to draw at random. Any value >= 1 skips sampling and
        uses `dataset` itself (the same object, not a copy).
    random_state : int or None, default None
        Seed forwarded to ``DataFrame.sample`` so that the draw can be
        reproduced. ``None`` keeps the previous unseeded behaviour.

    Returns
    -------
    tuple
        ``(X_reduced, df, X_s, model)`` where ``X_reduced`` is the PCA
        projection of the standardized rows, ``df`` is the frame that was
        actually used, ``X_s`` is the standardized array and ``model`` is the
        fitted PCA object.
    """
    # Sample from the frame that was passed in, not from the notebook-level
    # DE, so the function honours its `dataset` argument.
    if frac < 1.0:
        df = dataset.sample(frac=frac, random_state=random_state)
    else:
        df = dataset

    X_s = preprocessing.scale(df)

    # Dimensionality reduction: n_components=.85 with svd_solver='full' asks
    # PCA for the components needed to explain 85% of the variance.
    pca = decomposition.PCA(n_components=.85, svd_solver='full')
    model = pca.fit(X_s)

    return (model.transform(X_s), df, X_s, model)

In [4]:
# get_sample runs decomposition.PCA(n_components=.85, svd_solver='full') internally.
# frac=1 means no sub-sampling: every row of DE is standardized and projected.
sample = get_sample(dataset=DE, frac=1)
X = sample[0]  # first tuple element is the PCA-reduced matrix
X.shape

(551132, 25)

From previous calculations, I know that the best results with KMeans happen at one of $k \in \{4, 5, 6\}$. So, I do the computations one last time with these values (the code below also includes $k = 3$), and increase n_init to 25 for a better result.

# 2 KMeans on hand-picked values

In [5]:
hand_picked = [3, 4, 5, 6]

In [7]:
# Fixed seed so that cluster labels, centroids and the sub-sampled silhouette
# score are the same on every Restart & Run All.
RANDOM_STATE = 42

sample_size = 100000  # rows drawn at random for the silhouette score computation
kmeans_ch, kmeans_db, kmeans_s = [], [], []   # (k, score) pairs per metric
kmeans_cluster = pd.DataFrame(index=DE.index)  # one label column per k
centroid = []                                  # (k, cluster_centers_) pairs

for k in hand_picked:
    kmeans = KMeans(n_clusters=k, n_init=25, random_state=RANDOM_STATE)
    y_pred = kmeans.fit_predict(X)

    # Assign the label array positionally. Wrapping it in pd.Series would
    # align on index labels and produce NaN if DE did not have a default
    # 0..n-1 index.
    kmeans_cluster['cluster_kmeans_%d' % k] = y_pred

    centroid.append((k, kmeans.cluster_centers_))

    kmeans_db.append((k, davies_bouldin_score(X, y_pred)))
    kmeans_ch.append((k, metrics.calinski_harabasz_score(X, y_pred)))
    kmeans_s.append(
        (k, silhouette_score(X, y_pred, sample_size=sample_size,
                             random_state=RANDOM_STATE))
    )

In [8]:
kmeans_cluster

Unnamed: 0,cluster_kmeans_3,cluster_kmeans_4,cluster_kmeans_5,cluster_kmeans_6
0,1,0,1,5
1,0,3,0,5
2,1,0,1,5
3,2,1,4,3
4,2,1,4,3
...,...,...,...,...
551127,1,0,1,1
551128,0,3,0,2
551129,1,0,1,1
551130,1,0,1,1


In [10]:
centroid

[(3,
  array([[ 1.84113858e+00,  2.63865736e+00,  1.45923347e-02,
           5.14572967e-01,  1.90909364e-01,  5.83818213e-02,
          -1.75245881e-01, -2.33223091e-01, -3.62287760e-01,
           3.32721421e-02, -4.37975059e-02, -4.78868171e-02,
          -2.86343983e-02, -1.92862473e-02,  2.34325295e-03,
           3.08328217e-03, -3.68796078e-02,  3.58009640e-02,
          -4.33931824e-02,  1.15637130e-02, -4.33499409e-02,
          -3.70734742e-02, -9.24890554e-03,  4.69913401e-02,
          -3.00462699e-02],
         [-8.43181081e-01, -7.33458974e-02, -2.24950283e-01,
          -2.33599840e-01,  6.82565695e-03, -1.15873844e-01,
           9.29199922e-02,  5.59178349e-02, -4.39139022e-03,
          -1.37280620e-02,  3.57510008e-02,  7.57137412e-02,
           1.40269862e-02,  1.65770814e-02, -1.65025556e-03,
          -4.17123287e-03,  4.15138755e-02, -1.69142950e-03,
          -1.86535196e-02, -5.08649801e-03, -1.12360849e-02,
           5.24992624e-03, -1.24391542e-02, -1.63578

In [11]:
kmeans_s

[(3, 0.18630579907780637),
 (4, 0.14843831250741757),
 (5, 0.15595014190621193),
 (6, 0.1285603926236869)]

In [12]:
kmeans_db

[(3, 2.6282135459818137),
 (4, 2.218079058937408),
 (5, 2.0396017354497338),
 (6, 2.036568623641768)]

In [13]:
kmeans_ch

[(3, 34392.163177648814),
 (4, 34406.13708213662),
 (5, 34219.98700908306),
 (6, 34467.80889510806)]

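The silhouette score is highest for $k = 3$, followed by $k = 5$ and $k = 4$. The db score is essentially tied for lowest between $k = 5$ (2.040) and $k = 6$ (2.037), and of those two $k = 5$ has the clearly better silhouette score. So, I am going with $k = 5$. Admittedly, one could also argue that $k=4$ has a better ch score than $k=5$, and choose $k=4$ instead.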

# 3 Final Analysis

## 3.1 Find distance to centroid and anomalies

In [18]:
centroid[2]

(5,
 array([[ 1.48929717e+00,  2.34106151e+00,  5.64730590e-02,
          1.30348031e-02, -5.00342370e-01,  2.62761080e-01,
         -1.32334142e-01, -1.87947769e-01, -4.74656851e-01,
          4.82839668e-02, -4.38978573e-02, -6.69097131e-02,
         -1.86469619e-02, -2.49238654e-02, -1.21064617e-02,
          2.74110739e-02, -7.07043550e-02,  6.44750125e-02,
         -5.76227922e-02, -1.04972554e-02, -7.76678694e-02,
         -6.78752206e-02, -2.49828648e-02,  9.57250025e-02,
         -2.74931259e-02],
        [-8.94952834e-01, -1.44924489e-01, -2.36708217e-01,
         -2.32828898e-01,  2.57022160e-02, -1.22694273e-01,
          1.05282736e-01,  5.06381023e-02,  4.16791898e-03,
         -1.64842918e-02,  4.28393180e-02,  9.37856892e-02,
          1.50379878e-04,  2.17233514e-02,  2.86053636e-03,
         -1.02381179e-02,  5.22227204e-02, -6.16520146e-03,
         -2.05852736e-02, -3.46294454e-03, -7.82722002e-03,
          1.14390234e-02, -1.22788276e-02, -7.01274067e-03,
         

In [20]:
# centroid is a list of (k, cluster_centers_) pairs; look the entry up by k
# so the result does not depend on the position of 5 inside hand_picked.
centroid_5 = dict(centroid)[5]

In [25]:
cluster = kmeans_cluster['cluster_kmeans_5']
cluster

0         1
1         0
2         1
3         4
4         4
         ..
551127    1
551128    0
551129    1
551130    1
551131    1
Name: cluster_kmeans_5, Length: 551132, dtype: int32

In [28]:
# Rebind `cluster` from the single label Series to a one-column DataFrame so
# that further columns can be attached to it in later cells.
cluster = pd.DataFrame(cluster)

In [36]:
# Squared Euclidean distance (no square root is taken) from every row of X to
# the centroid of the cluster it was assigned to for k = 5.
# centroid_5[labels] picks one centroid row per observation, so the whole
# computation is a single vectorized expression instead of a per-row Python
# loop with .loc lookups; rows are matched by position throughout.
assigned_labels = np.asarray(cluster['cluster_kmeans_5'])
distance = np.sum(np.square(X - centroid_5[assigned_labels]), axis=1)

In [40]:
# distance is already ordered like the rows of `cluster`; assign the array
# positionally rather than through pd.Series, which would align on index
# labels and yield NaN for any label outside 0..n-1.
cluster['distance'] = distance

In [42]:
# Re-read the cleaned events from disk, append the label and distance columns
# side by side, and write the combined table out.
(
    pd.concat(
        [pd.read_csv('cleaned_delay_events.csv'), cluster],
        axis=1,
        sort=False,
    )
    .to_csv('cleaned_DE_kmeans.csv')
)

Let's update DE to include cluster information as well.

In [49]:
# Column-wise join of the original events with their k = 5 label and distance.
up_DE = pd.concat(objs=[DE, cluster], axis=1, sort=False)
up_DE

Unnamed: 0,OperationDate,avg_ArriveLoad,avg_Temp,avg_Humidity,avg_Visibility,avg_WindSpeed,avgTravelTime,duration,mean_delay,median_delay,...,Proportion_BusBunchingFlag,ProportionBusGappingFlag,Time,DayType_0,DayType_1,DayType_2,Direction_E,Direction_N,cluster_kmeans_5,distance
0,9.03,17,12.7,95.0,48.3,9.0,36.0000,0,23,30.0,...,0.0,0.000,6.5,1,0,0,1,0,1,15.158619
1,9.03,17,16.4,78.0,24.1,16.0,28.4667,625,50,63.0,...,0.0,0.000,8.3,1,0,0,1,0,0,16.714720
2,9.03,32,16.4,78.0,24.1,16.0,17.0000,49,52,63.0,...,0.0,0.000,8.3,1,0,0,1,0,1,9.598607
3,9.03,27,15.8,82.0,24.1,16.0,93.2500,286,77,31.0,...,0.0,1.000,9.6,1,0,0,1,0,4,18.722744
4,9.03,25,16.0,84.0,24.1,9.0,26.5000,298,37,58.0,...,0.0,0.625,17.1,1,0,0,1,0,4,16.334885
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
551127,11.90,7,3.8,92.0,32.2,4.0,41.5000,16,19,71.5,...,0.0,0.000,5.6,0,0,1,1,0,1,25.467024
551128,11.90,21,4.8,94.0,32.2,6.0,23.9355,861,57,71.5,...,0.0,0.000,5.6,0,0,1,1,0,0,34.343156
551129,12.55,23,-6.5,75.0,32.2,13.0,28.8889,267,157,56.5,...,0.0,0.000,5.2,0,1,0,1,0,1,27.796294
551130,12.77,21,1.1,97.0,16.1,5.0,43.5000,75,48,78.0,...,0.0,0.000,5.2,0,1,0,1,0,1,46.270117


In [55]:
# Index labels of the rows whose distance value lies more than two standard
# deviations above the mean of `distance`.
is_far = up_DE['distance'] > (distance.mean() + 2 * distance.std())
up_DE.loc[is_far].index

Int64Index([    12,    111,    384,    886,   1415,   1722,   1733,   2119,
              4302,   4541,
            ...
            543897, 544558, 546533, 547130, 547250, 548395, 549590, 549984,
            550911, 551034],
           dtype='int64', length=1254)

In [56]:
# Anomaly cutoff: mean + 2 standard deviations of the distance values,
# computed from the data instead of being typed in by hand.
# NOTE(review): the previous literal 29 + 2*280 looks like hand-rounded values
# of distance.mean() and distance.std() -- confirm that this was the intent.
anomaly_cutoff = distance.mean() + 2 * distance.std()
a_index = up_DE[up_DE['distance'] > anomaly_cutoff].index

In [59]:
# Mark the rows selected in a_index with 1 in a new 'anamoly' column (spelling
# kept as is because the column name is data). Only those rows are assigned,
# so every other row is left without a value (NaN) in that column.
up_DE.loc[a_index, 'anamoly'] = 1

In [63]:
# Keep only the rows that carry the anomaly flag.
is_flagged = up_DE['anamoly'] == 1
df_anamoly = up_DE.loc[is_flagged]

In [65]:
df_anamoly.to_csv('anamoly.csv')

## 3.2 Find 'important' features

I will consider a feature 'important' if the absolute value of its mean in at least one cluster, rounded to one decimal, is at least 1 (note that 1 is the std of each standardized feature).

In [99]:
X_s = sample[2]

In [104]:
X_s.shape

(551132, 43)

In [106]:
# Partition the standardized rows of X_s by their k = 5 cluster label (0..4).
Xs_0, Xs_1, Xs_2, Xs_3, Xs_4 = (
    X_s[up_DE['cluster_kmeans_5'] == label] for label in range(5)
)

In [119]:
def _rounded_feature_means(rows):
    """Return the mean of every column of `rows`, rounded to one decimal, as a Series."""
    n_features = X_s.shape[1]
    column_means = [rows[:, col].mean() for col in range(n_features)]
    return pd.Series(np.array(np.round(column_means, 1)))


# One Series of per-feature means for each of the five clusters.
mean_0, mean_1, mean_2, mean_3, mean_4 = (
    _rounded_feature_means(rows) for rows in (Xs_0, Xs_1, Xs_2, Xs_3, Xs_4)
)

In [120]:
mean_s = pd.DataFrame([mean_0, mean_1, mean_2, mean_3, mean_4])

In [121]:
mean_s

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,40,41,42
0,0.1,-0.1,-0.0,-0.0,-0.0,0.0,-0.0,1.5,0.8,0.6,...,0.2,0.2,0.0,-0.2,-0.1,0.1,-0.1,-0.1,-0.1,0.1
1,-0.0,-0.2,0.0,0.0,-0.0,-0.0,-0.3,-0.4,-0.3,-0.2,...,0.3,0.1,0.0,-0.3,0.0,-0.1,0.0,0.0,0.0,-0.0
2,-0.0,1.4,0.1,-0.1,0.1,0.0,2.7,-0.1,0.7,0.6,...,-0.2,-0.0,0.1,0.3,0.1,0.2,-0.1,-0.1,0.1,0.2
3,-0.1,0.1,0.2,0.0,0.0,0.0,0.2,1.0,0.5,0.4,...,0.0,0.1,-0.0,-0.0,0.0,-0.1,0.0,0.1,-0.0,-0.4
4,0.2,0.3,-0.1,-0.0,-0.0,-0.0,-0.0,-0.1,-0.2,-0.2,...,-2.8,-1.5,-0.2,3.1,-0.0,0.1,-0.1,-0.1,-0.0,-0.0


In [172]:
# Column positions whose largest absolute (rounded) cluster mean reaches 1.
important = [
    col
    for col in range(mean_s.shape[1])
    if mean_s[col].abs().max() >= 1
]

In [173]:
important

[1, 6, 7, 10, 13, 14, 15, 16, 17, 20, 21, 24, 25, 33, 34, 36]

In [174]:
mean_s[important]

Unnamed: 0,1,6,7,10,13,14,15,16,17,20,21,24,25,33,34,36
0,-0.1,-0.0,1.5,1.5,-0.0,-0.1,-0.0,-0.2,-0.2,-0.0,0.1,-0.0,1.4,0.2,0.2,-0.2
1,-0.2,-0.3,-0.4,-0.3,-0.0,-0.1,-0.1,-0.1,-0.2,-0.1,0.1,-0.2,-0.4,0.3,0.1,-0.3
2,1.4,2.7,-0.1,-0.6,-0.0,-0.1,-0.1,1.2,2.1,1.3,-1.4,2.5,0.6,-0.2,-0.0,0.3
3,0.1,0.2,1.0,0.8,3.8,9.0,8.3,0.1,0.1,0.2,0.0,-0.0,0.9,0.0,0.1,-0.0
4,0.3,-0.0,-0.1,-0.2,-0.0,-0.1,-0.1,0.2,0.1,0.2,0.0,-0.1,-0.2,-2.8,-1.5,3.1


In [176]:
core = mean_s[important]

In [178]:
# Swap the positional feature numbers for the DE column names at those positions.
core = core.rename(columns=dict(zip(important, DE.columns[important])))

In [182]:
core =core.reset_index()