In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
from itertools import islice
import statsmodels.api as sm
sns.set()
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
from kneed import KneeLocator
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

In [2]:
# Notebook-wide plotting defaults: a large canvas and the ggplot look.
matplotlib.rcParams.update({'figure.figsize': (16.0, 12.0)})
matplotlib.style.use('ggplot')

In [3]:
def parse_timestamp(el, fmt='%Y-%m-%d %H:%M:%S'):
    """Parse a timestamp string into a ``datetime.datetime``.

    Parameters
    ----------
    el : str
        Timestamp text, e.g. ``'2020-01-31 23:59:58'``.
    fmt : str, optional
        ``strptime`` pattern describing ``el``. The default matches the
        pattern used throughout this notebook, so the function no longer
        depends on a module-level ``fmt`` being defined before it is called.

    Returns
    -------
    datetime.datetime

    Raises
    ------
    ValueError
        If ``el`` does not match ``fmt``.
    """
    return datetime.datetime.strptime(el, fmt)

In [4]:
# strptime pattern (date, space, 24h time) used when parsing timestamp strings.
fmt = '%Y-%m-%d %H:%M:%S'

In [5]:
def trim_latlng(x, ndigits=4):
    """Round a coordinate value to a fixed number of decimal places.

    Parameters
    ----------
    x : float
        Latitude or longitude value.
    ndigits : int, optional
        Number of decimal places to keep. Defaults to 4, the precision the
        rest of the notebook uses for its coordinate columns.

    Returns
    -------
    float
        ``x`` rounded with the built-in ``round``.
    """
    return round(x, ndigits)

In [6]:
# Load the ride records from a semicolon-separated CSV.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
url = "/bigdata/jelicicna/mobility_data_2020/anlysis_data_nextbike_2020-01.csv"
df = pd.read_csv(url, sep=';', low_memory=False)

In [7]:
# Load the table of locations to be clustered (semicolon-separated CSV).
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
url2 = "/bigdata/jelicicna/clustering_datasets/locations_for_clustering_2020-01.csv"
df_locations = pd.read_csv(url2, sep=';', low_memory=False)

In [None]:
# Parse the ride start/end strings into datetimes with one vectorised call per
# column instead of mapping strptime over every row. The explicit format keeps
# parsing strict; unlike the row-wise version, missing values become NaT and
# re-running the cell on already-parsed columns does not fail.
df['ride_begin'] = pd.to_datetime(df['ride_begin'], format=fmt)
df['ride_end'] = pd.to_datetime(df['ride_end'], format=fmt)

In [None]:
# Round all four coordinate columns with trim_latlng.
for coord_col in ('lat_begin', 'lng_begin', 'lat_end', 'lng_end'):
    df[coord_col] = df[coord_col].map(trim_latlng)

In [None]:
# Keep the original row index as an 'org_index' column so results can be
# traced back to the source rows during evaluation.
df = df.reset_index().rename(columns={'index': 'org_index'})

In [None]:
# Lookup table with two columns: a surrogate integer bike_id and the bike name,
# one row per distinct name.
bikes = (
    df[["name"]]
    .drop_duplicates()
    .reset_index(drop=True)
    .rename_axis("bike_id")
    .reset_index()
)
bikes.head()

In [None]:
# Attach bike_id to every ride by joining on the bike name.
df = df.merge(bikes, how='outer', on='name')

In [None]:
# Inspect the loaded locations table.
df_locations

## Deciding on an optimal number of clusters

### Elbow method

In [None]:
# Feature matrix for clustering: the columns at positions 2 and 3 of df_locations.
# NOTE(review): presumably these are the lat/lng columns — confirm against the CSV layout.
x = df_locations.iloc[:,2:4]

In [None]:
# Preview the first rows of the feature matrix.
x.head()

In [None]:
# Candidate cluster counts: 10, 20, ..., 160.
potential_k = list(range(10, 170, 10))

In [None]:
# Shared KMeans settings for the model-selection runs (fixed seed for repeatability).
kmeans_kwargs = dict(init="random", n_init=10, max_iter=300, random_state=42)

# Sum of squared errors (inertia) of a fitted model for each candidate k.
sse = []
for k in potential_k:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(x)
    sse.append(kmeans.inertia_)

In [None]:
# Elbow plot: SSE against the number of clusters.
plt.style.use("fivethirtyeight")
fig, ax = plt.subplots()
ax.plot(potential_k, sse)
ax.set_xticks(potential_k)
ax.set_xlabel("Number of Clusters")
ax.set_ylabel("SSE")
plt.show()

In [None]:
# Locate the elbow of the decreasing, convex SSE curve over the candidate k values.
kl = KneeLocator(potential_k, sse, direction="decreasing", curve="convex")
kl.elbow

In [None]:
# Use the elbow found above as the number of clusters for the clustering below.
# KneeLocator leaves `elbow` as None when no elbow is detected; stop here with a
# clear message instead of failing later inside the clustering estimators.
if kl.elbow is None:
    raise ValueError("KneeLocator found no elbow in the SSE curve; set clust_k manually")
clust_k = int(kl.elbow)

### Silhouette coefficient

In [None]:
# Silhouette coefficient for each candidate k, to cross-check the elbow result.
silhouette_coefficients = []
for k in potential_k:
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs).fit(x)
    score = silhouette_score(x, kmeans.labels_)
    silhouette_coefficients.append(score)

# Silhouette coefficient against the number of clusters.
plt.style.use("fivethirtyeight")
plt.plot(potential_k, silhouette_coefficients)
plt.xticks(potential_k)
plt.ylabel("Silhouette Coefficient")
plt.xlabel("Number of Clusters")
plt.show()

## Clustering

In [None]:
# Convert the feature frame to a plain NumPy array for the estimators below.
# np.asarray returns an existing ndarray unchanged, so this cell can be re-run
# safely (x.to_numpy() would raise AttributeError on the second run).
x = np.asarray(x)

In [None]:
# Final KMeans model with the chosen k. The seed used for the model-selection
# runs is reused so the cluster assignment is reproducible across kernel restarts.
kmeans = KMeans(n_clusters=clust_k, random_state=kmeans_kwargs["random_state"])
kmeans.fit(x)

In [None]:
# Cluster label of every location, taken from the model fitted in the previous
# cell. Calling fit_predict here would refit the model a second time (and, for
# an unseeded estimator, could yield a different assignment).
identified_clusters = kmeans.labels_
identified_clusters

In [None]:
# Number of cluster labels produced.
len(identified_clusters)

In [None]:
# Store the KMeans label of each location in a new 'cluster' column.
df_locations['cluster'] = identified_clusters

In [None]:
# Preview the first two rows, now including the cluster column.
df_locations.head(2)

In [None]:
# Scatter of the locations (longitude on x, latitude on y), coloured by KMeans cluster.
plt.scatter(
    x=df_locations['lng'],
    y=df_locations['lat'],
    c=df_locations['cluster'],
    cmap='rocket',
)
# Restrict the view to a fixed longitude/latitude window.
plt.xlim((6.80, 7.15))
plt.ylim((50.85, 51.06))
plt.show()

In [None]:
# One "[lat, lng]" string per ride for the start position, in df's row order.
# Zipping the two columns avoids the slow row-by-row iterrows loop and builds
# the same strings.
start_loc = [
    str([lat, lng])
    for lat, lng in zip(df['lat_begin'], df['lng_begin'])
]

In [None]:
# One "[lat, lng]" string per ride for the end position, in df's row order.
# Zipping the two columns avoids the slow row-by-row iterrows loop and builds
# the same strings.
end_loc = [
    str([lat, lng])
    for lat, lng in zip(df['lat_end'], df['lng_end'])
]

In [None]:
# Sanity check: number of start-location strings.
len(start_loc)

In [None]:
# Sanity check: number of end-location strings.
len(end_loc)

In [None]:
# Inspect the rides table before attaching the location keys.
df

In [None]:
# Attach the start-location strings as the merge key; start_loc was built by
# iterating df, so it is in df's current row order.
df['latlng'] = start_loc #setting starting locations as a joint column

In [None]:
# Slim lookup of location id, location key string and KMeans cluster for the merges below.
df_merge_loc = df_locations[['loc_id', 'latlng', 'cluster']]

In [None]:
# Inspect the lookup table.
df_merge_loc

In [None]:
# Look up location id and cluster for each ride's start position (left join keeps every ride).
df_clusters = df.merge(df_merge_loc, how='left', on='latlng')

In [None]:
# Inspect the rides with start-location cluster information attached.
df_clusters

In [None]:
# Mark the columns just merged in as describing the start of the ride.
cls = dict(latlng='latlng_start', loc_id='loc_id_start', cluster='cluster_start')
df_clusters = df_clusters.rename(columns=cls)

In [None]:
# Row count after the start-location merge.
len(df_clusters)

In [None]:
# Attach the end-location strings as the key for the second merge.
# NOTE(review): end_loc was built from df, so this assumes df_clusters still has
# the same rows in the same order as df — verify the previous merge added no rows.
df_clusters['latlng'] = end_loc #setting ending locations as a joint column

In [None]:
# Look up location id and cluster for each ride's end position (left join keeps every ride).
df_clusters = df_clusters.merge(df_merge_loc, how='left', on='latlng')

In [None]:
# Inspect the rides with both start and end cluster information attached.
df_clusters

In [None]:
# Mark the columns just merged in as describing the end of the ride.
cls = dict(latlng='latlng_end', loc_id='loc_id_end', cluster='cluster_end')
df_clusters = df_clusters.rename(columns=cls)

In [None]:
# Keep only the columns needed to count rides between cluster pairs.
df_g = df_clusters[['ride_time','cluster_start','cluster_end']]

In [None]:
# Count the non-null ride_time entries for each (start cluster, end cluster) pair.
# NOTE(review): rides whose start or end location found no cluster presumably
# drop out here (NaN group keys) — confirm this is intended.
df_g = df_g.groupby(['cluster_start','cluster_end']).count()

In [None]:
# Turn the group keys back into ordinary columns.
df_g = df_g.reset_index()

In [None]:
# The counted column holds the number of rides per cluster pair; name it accordingly.
df_g = df_g.rename(columns={'ride_time': 'ride_count'})
df_g

In [None]:
# For each start cluster, the number of rows it has in df_g.
df_g['cluster_start'].value_counts()

In [None]:
# Rename the KMeans label column so it can sit next to the agglomerative labels.
df_locations = df_locations.rename(columns={'cluster': 'cluster_kmns'})

In [None]:
# Preview the locations table after the rename.
df_locations.head()

### Agglomerative Clustering with chosen k

In [None]:
# Number of rows in the feature matrix.
len(x)

In [None]:
from sklearn.cluster import AgglomerativeClustering

In [None]:
# Agglomerative clustering of the same features with the same number of clusters.
clustering = AgglomerativeClustering(n_clusters=clust_k)
clustering.fit(x)
clustering

In [None]:
# Cluster labels assigned by the agglomerative model.
clustering.labels_

In [None]:
# Store the agglomerative label of each location in a new 'cluster_agg' column.
df_locations['cluster_agg'] = clustering.labels_

In [None]:
# Scatter of the locations (longitude on x, latitude on y), coloured by agglomerative cluster.
plt.scatter(
    x=df_locations['lng'],
    y=df_locations['lat'],
    c=df_locations['cluster_agg'],
    cmap='rocket',
)
# Restrict the view to a fixed longitude/latitude window.
plt.xlim((6.80, 7.10))
plt.ylim((50.85, 51.05))
plt.show()

In [None]:
# Preview the first 20 locations with both label columns.
df_locations.head(20)

In [None]:
# Check the Python type of the object returned for the latlng column.
type(df_locations['latlng'])

In [None]:
# Number of locations.
len(df_locations)

In [None]:
# Number of locations per KMeans cluster (missing labels counted too).
df_locations['cluster_kmns'].value_counts(dropna=False)

In [None]:
# Number of locations per agglomerative cluster (missing labels counted too).
df_locations['cluster_agg'].value_counts(dropna=False)