In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import AgglomerativeClustering
from scipy.cluster.hierarchy import dendrogram
import datetime
from itertools import islice

In [None]:
# Notebook-wide plotting defaults: large figures, drawn with the 'ggplot' style sheet.
matplotlib.rcParams.update({'figure.figsize': (16.0, 12.0)})
plt.style.use('ggplot')

In [None]:
def parse_timestamp(el, timestamp_format=None):
    """Parse a timestamp string into a ``datetime.datetime``.

    Parameters
    ----------
    el : str
        Timestamp text to parse.
    timestamp_format : str, optional
        ``strptime`` format string. When omitted, the notebook-level ``fmt``
        constant is looked up at call time (the behaviour callers such as
        ``Series.map(parse_timestamp)`` rely on).

    Returns
    -------
    datetime.datetime
        The parsed timestamp.

    Raises
    ------
    ValueError
        If ``el`` does not match the format.
    """
    if timestamp_format is None:
        # fall back to the module-level constant; it must be defined before the first call
        timestamp_format = fmt
    return datetime.datetime.strptime(el, timestamp_format)

In [None]:
def create_date_col(df_m):
    """Build ``'year-month-day'`` strings from the ``ride_begin`` column.

    Parameters
    ----------
    df_m : pandas.DataFrame
        Frame with a ``ride_begin`` column whose values expose ``year``,
        ``month`` and ``day`` attributes (e.g. parsed timestamps).

    Returns
    -------
    list of str
        One entry per row, in row order. Month and day are not zero-padded
        (``'2021-7-1'``), matching the historical output of this helper.
    """
    # Iterate over the column values directly instead of iterrows() plus a
    # label lookup: this avoids building a Series per row and still works when
    # the index contains duplicate labels.
    return [
        str(ts.year) + '-' + str(ts.month) + '-' + str(ts.day)
        for ts in df_m['ride_begin']
    ]

In [None]:
# strptime pattern read by parse_timestamp, e.g. '2021-07-01 08:15:30'
fmt = '%Y-%m-%d %H:%M:%S'

In [None]:
# Load the ride table from a semicolon-separated CSV.
# low_memory=False: let pandas infer column types from the whole file at once.
# NOTE(review): absolute path on a specific machine — consider a configurable data directory.
url = "/bigdata/jelicicna/mobility_data_2021/anlysis_data_nextbike_2021-07.csv"
df = pd.read_csv(url, sep=';', low_memory=False)

In [None]:
# Load the table of locations that is clustered below (semicolon-separated CSV).
# NOTE(review): absolute path on a specific machine — consider a configurable data directory.
url2 = "/bigdata/jelicicna/clustering_datasets/locations_for_clustering_2021-07.csv"
df_locations = pd.read_csv(url2, sep=';', low_memory=False)

In [None]:
# Remove the 'old_index' column. Rebinding instead of inplace=True, together with
# errors='ignore', keeps this cell re-runnable: a second execution no longer raises
# KeyError because the column is already gone.
df = df.drop(columns='old_index', errors='ignore')

In [None]:
# Convert both ride timestamp columns from text to datetimes in one vectorised call
# per column (same pattern as `fmt`). Unlike mapping strptime over every row, this is
# fast on large files and safe to re-run: already-converted columns pass through unchanged.
for ts_col in ('ride_begin', 'ride_end'):
    df[ts_col] = pd.to_datetime(df[ts_col], format=fmt)

In [None]:
# separate data for clustering: keep only the columns at positions 2 and 3
# NOTE(review): presumably these are the coordinate columns ('lng' / 'lat' are what the
# plots further down use) — confirm against the column order of the locations file.
x = df_locations.iloc[:,2:4]

In [None]:
# preview the first rows of the clustering input
x.head()

In [None]:
# NumPy array version of the selected columns; this is what gets passed to fit()
X = x.to_numpy()

## Agglomerative clustering

In [None]:
# A method for generating dendrogram
def plot_dendrogram(model, **kwargs):
    """Draw a dendrogram for a fitted hierarchical clustering model.

    The model must expose ``children_``, ``distances_`` and ``labels_``.
    Extra keyword arguments are forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``.
    """
    merges = model.children_
    leaf_total = len(model.labels_)

    # Number of original samples contained in the cluster formed at each merge:
    # ids below `leaf_total` are single samples, higher ids refer to the cluster
    # created by merge number (id - leaf_total).
    subtree_sizes = np.zeros(merges.shape[0])
    for step, pair in enumerate(merges):
        subtree_sizes[step] = sum(
            1 if node < leaf_total else subtree_sizes[node - leaf_total]
            for node in pair
        )

    # Assemble the (child_a, child_b, distance, size) rows scipy expects
    columns = [merges, model.distances_, subtree_sizes]
    linkage_matrix = np.column_stack(columns).astype(float)

    dendrogram(linkage_matrix, **kwargs)

In [None]:
# instantiate Agglomerative Clustering instance; the number of clusters is left open
# (n_clusters=None) and a merge-distance threshold of 0 is given instead
clustering_model = AgglomerativeClustering(distance_threshold=0, n_clusters=None)

In [None]:
# call fit method with the array of sample coordinates passed as a parameter;
# the value returned by fit() is kept as `categories` and inspected in the next cells
categories = clustering_model.fit(X)

In [None]:
# cluster label of every sample
categories.labels_

In [None]:
# merge tree: one pair of child node ids per merge step
clustering_model.children_

In [None]:
# largest merge distance in the tree
clustering_model.distances_.max()

In [None]:
# smallest merge distance in the tree
clustering_model.distances_.min()

In [None]:
# Re-create the estimator with a merge-distance cutoff of 0.3 and fit it on the
# sample coordinates; both notebook names are overwritten by this cell.
clustering_model = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.3,
)
categories = clustering_model.fit(X)

# display the label array of this run
categories.labels_

In [None]:
# store the labels of the 0.3-threshold run next to the locations
df_locations['cluster'] = categories.labels_

In [None]:
#plot dendrogram to visualize clusters
#plot_dendrogram(categories)

In [None]:
# Second estimator with a tighter merge-distance cutoff of 0.2, fitted on the
# same sample coordinates and kept under separate names.
clustering_model2 = AgglomerativeClustering(
    n_clusters=None,
    distance_threshold=0.2,
)
categories2 = clustering_model2.fit(X)

# display the label array of this run
categories2.labels_

In [None]:
# store the labels of the 0.2-threshold run next to the locations
df_locations['cluster2'] = categories2.labels_

In [None]:
# number of distinct clusters found with the 0.3 threshold
df_locations['cluster'].nunique()

In [None]:
# number of distinct clusters found with the 0.2 threshold
df_locations['cluster2'].nunique()

In [None]:
# 42 hex colours: seven hue families, six shades each (dark to light).
# The cluster plots index into this list by cluster label.
customPalette = [
    '#7b241c', '#c0392b', '#cd6155', '#d98880', '#e6b0aa', '#f2d7d5',  # reds
    '#633974', '#76448a', '#884ea0', '#9b59b6', '#af7ac5', '#c39bd3',  # purples
    '#1f618d', '#2471a3', '#2980b9', '#5499c7', '#7fb3d5', '#a9cce3',  # blues
    '#117864', '#148f77', '#17a589', '#1abc9c', '#48c9b0', '#76d7c4',  # teals
    '#9c640c', '#b9770e', '#d68910', '#f39c12', '#f5b041', '#f8c471',  # ambers
    '#873600', '#a04000', '#ba4a00', '#d35400', '#dc7633', '#e59866',  # burnt oranges
    '#626567', '#797d7f', '#909497', '#a6acaf', '#bdc3c7', '#cacfd2',  # greys
]

# make it the seaborn default and render a preview strip
sns.set_palette(customPalette)
sns.palplot(customPalette)

In [None]:
# distinct cluster ids of each run, used to loop over the clusters when plotting
labels = set(df_locations['cluster'].to_list())     # 0.3-threshold run
labels2 = set(df_locations['cluster2'].to_list())   # 0.2-threshold run

In [None]:
# Scatter plot of the locations, coloured and annotated by the 0.3-threshold
# clustering stored in df_locations['cluster'].
plt.figure(figsize=(18,18))

# loop through labels and plot each cluster
for l in labels:
    # rows of the current cluster; 'cluster' is the column the clustering cell
    # writes (filtering on a non-existent 'cluster_agg' column raised KeyError)
    members = df_locations.loc[df_locations['cluster'] == l, ['lng', 'lat']]

    # wrap around the palette so having more clusters than colours cannot
    # raise IndexError
    colour = customPalette[l % len(customPalette)]

    # add data points
    plt.scatter(x=members['lng'], y=members['lat'], color=colour, alpha=1)

    # add the cluster id at the centroid of its points
    plt.annotate(str(l),
                 (members['lng'].mean(), members['lat'].mean()),
                 horizontalalignment='center',
                 verticalalignment='center',
                 size=20, weight='bold',
                 color='white',
                 backgroundcolor=colour)

plt.xlabel('lng')
plt.ylabel('lat')
plt.title('Agglomerative clustering (distance_threshold=0.3)')
plt.show()

In [None]:
# Scatter plot of the locations, coloured and annotated by the 0.2-threshold
# clustering stored in df_locations['cluster2'].
# NOTE(review): this cell used to be a verbatim copy of the previous plot, while
# labels2 / 'cluster2' were never drawn anywhere — it is assumed the second
# clustering was meant here; confirm.
plt.figure(figsize=(18,18))

# loop through labels and plot each cluster
for l in labels2:
    # rows of the current cluster (the former 'cluster_agg' filter raised
    # KeyError because no such column is created in this notebook)
    members = df_locations.loc[df_locations['cluster2'] == l, ['lng', 'lat']]

    # wrap around the palette so having more clusters than colours cannot
    # raise IndexError
    colour = customPalette[l % len(customPalette)]

    # add data points
    plt.scatter(x=members['lng'], y=members['lat'], color=colour, alpha=1)

    # add the cluster id at the centroid of its points
    plt.annotate(str(l),
                 (members['lng'].mean(), members['lat'].mean()),
                 horizontalalignment='center',
                 verticalalignment='center',
                 size=20, weight='bold',
                 color='white',
                 backgroundcolor=colour)

plt.xlabel('lng')
plt.ylabel('lat')
plt.title('Agglomerative clustering (distance_threshold=0.2)')
plt.show()