## Clustering Python notebook

In [1]:
# sanity check: show the current working directory of the kernel
import os 
os.getcwd()

'C:\\Users\\cex\\Documents\\Smart Cities and Urban Analytics\\Term 2\\SDC\\Assessment'

In [None]:
# importing libraries required
import pandas as pd
import plotly.express as px
import sklearn.preprocessing as preprocessing
from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import random
from sklearn.cluster import AgglomerativeClustering

# raise the pandas display limit so that up to 200 rows are shown instead of the default
pd.options.display.max_rows = 200
# IPython magic: makes sure matplotlib graphs are displayed properly inside the notebook
%matplotlib inline

In [None]:
# define the filepath of the input data
# NOTE(review): the value is an empty placeholder - presumably it should point at
# the CSV file that is loaded with pd.read_csv further down; fill it in before running
file_path = ""

In [None]:
# read the CSV file at file_path into a dataframe
# (the path has to be passed explicitly: pd.read_csv() without it raises a TypeError)
data = pd.read_csv(file_path)
# preview the first rows to check that it has been read correctly
data.head()

In [None]:
# to merge by origin: group the rows by their "Area of residence" value,
# then add up the columns within each group
rows_by_origin = data.groupby("Area of residence")
msoa = rows_by_origin.sum()

In [None]:
# automatically scale the dataframe
#
# preprocessing.scale takes the arguments below (written out with their default values):
#   X         - the data to centre and scale
#   axis      - int, axis used to compute the means and standard deviations along:
#               0 standardises each feature independently, 1 standardises each sample
#   with_mean - boolean, if True centre the data before scaling
#   with_std  - boolean, if True scale the data to unit variance
#   copy      - boolean, set to False to perform in-place normalisation and avoid a copy
#
# NaNs are treated as missing values: disregarded
#
# NOTE(review): transport_percentage is not created in any of the cells shown
# above - put your own dataframe name here
scaled = preprocessing.scale(
    transport_percentage,
    axis=0,
    with_mean=True,
    with_std=True,
    copy=True,
)

# the result is a numpy array
scaled

In [None]:
# turn the numpy array back into a dataframe: one named column per array column
# NOTE(review): 'Public _transport' has a space before the underscore - it looks
# like a typo, but the name is kept as-is; confirm before renaming
scaled_column_names = ['Private_transport', 'Public _transport', 'People_power']
scaled_df = pd.DataFrame(
    {name: scaled[:, position] for position, name in enumerate(scaled_column_names)}
)
print(scaled_df)

## DBSCAN

In [None]:
# create the DBSCAN cluster object and run .fit() on the scaled dataset in one step
#
# eps: the maximum distance between two samples for one to be considered as in
#   the neighbourhood of the other
# min_samples: the number of samples within that distance needed to make a point
#   a core point (this count includes the point itself)
# DBSCAN also takes several other arguments, e.g.
#   metric='euclidean', metric_params=None, algorithm='auto', leaf_size=30, p=None, n_jobs=None
# algorithm is the one used by the NearestNeighbors module to compute pointwise
# distances and find nearest neighbours - see the NearestNeighbors documentation
dbscan = DBSCAN(eps=0.1, min_samples=5).fit(scaled)

# show the fitted estimator as the cell output
dbscan

In [None]:
# get the cluster label of every sample from the fitted DBSCAN model
dbscan_labels = dbscan.labels_

In [None]:
# check the silhouette score of the DBSCAN labelling on the scaled data
# NOTE(review): the labels are passed as they are, so any points labelled -1
# (outside every cluster) take part in the score like a cluster of their own -
# confirm that this is intended
metrics.silhouette_score(X=scaled, labels=dbscan_labels)

In [None]:
# assign the DBSCAN labels to a copy of the original dataframe
# (a separate name is used so that transport_percentage itself never gains a
# 'label' column - otherwise re-running the scaling cell would scale the
# cluster labels as if they were a feature)
transport_percentage_dbscan = transport_percentage.assign(label=dbscan_labels)

# and check the value counts: how many samples received each label
transport_percentage_dbscan["label"].value_counts()

In [None]:
# distinct labels produced by DBSCAN; -1 marks the points outside every cluster
distinct_labels = set(dbscan_labels)

# number of clusters: every distinct label except the -1 one
n_clusters_ = len(distinct_labels - {-1})
# number of points outside the clusters: how many samples carry the -1 label
n_noise_ = sum(1 for sample_label in dbscan_labels if sample_label == -1)

# print the number of clusters, then the number of points outside the clusters
print(n_clusters_, n_noise_, sep="\n")

## K-means clustering

In [None]:
# set the number of clusters to explore
k_cluster = 5
# set the random seed
random_seed = 1
# the seed is passed to KMeans as random_state (int, default = None):
# it determines the random number generation for centroid initialisation;
# use an int to make the randomness deterministic

In [None]:
# build the k-means model with the chosen number of clusters and seed,
# and fit it to the scaled dataset in one step
kmeans_method = KMeans(n_clusters=k_cluster, random_state=random_seed).fit(scaled)

# show the fitted estimator as the cell output
kmeans_method

In [None]:
# assign the k-means labels to a copy of the original dataframe
transport_percentage_kmeans = transport_percentage.assign(label=kmeans_method.labels_)
# the earlier, misspelled name is kept as an alias so existing references keep working
transport_percenatege_kmeans = transport_percentage_kmeans

In [None]:
# elbow plot over multiple k's:
# fit k-means for a range of cluster counts and record the SSE of each fit

min_k, max_k = 1, 10
range_k = range(min_k, max_k + 1)

# settings shared by every fit in the sweep
elbow_kmeans_options = dict(
    init='random',
    n_init=10,
    max_iter=300,
    tol=1e-04,
    random_state=0,
)

list_SSE = []
for k in range_k:
    km = KMeans(n_clusters=k, **elbow_kmeans_options).fit(scaled)
    # inertia_ is used here as the SSE of the clustering
    list_SSE.append(km.inertia_)

# plot the SSE against the number of clusters
plt.plot(range_k, list_SSE, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('SSE')
plt.show()

In [None]:
# silhouette score over multiple k's
#
# for every candidate number of clusters, k-means is fitted several times with
# different seeds and the silhouette scores of those fits are averaged
#
# all names used inside the loops are specific to this cell, so the k_cluster,
# random_seed and kmeans_method values set up for the main k-means fit are left untouched

# candidate numbers of clusters: 2 to 20 inclusive
silhouette_k_values = range(2, 21)
# number of k-means fits averaged for each k
silhouette_repeats = 10
# dedicated seeded generator, so the seeds drawn below are the same on every run
seed_generator = random.Random(1)

# mean silhouette score for each k, in the same order as silhouette_k_values
silhouette = []
for trial_k in silhouette_k_values:
    # silhouette scores of the repeated fits for this k
    trial_scores = []
    for repeat in range(silhouette_repeats):
        # draw a random integer between 1 and 101 to seed this fit
        trial_seed = seed_generator.randint(1, 101)
        # run the k-means analysis on the scaled dataset
        trial_model = KMeans(n_clusters=trial_k, random_state=trial_seed)
        trial_model.fit(scaled)
        # score the labels of this fit
        trial_scores.append(metrics.silhouette_score(scaled, trial_model.labels_))
    # average over the repeats
    silhouette.append(sum(trial_scores) / len(trial_scores))

# plot the mean silhouette score against the number of clusters
plt.plot(silhouette_k_values, silhouette, marker='o')
plt.xlabel('Number of clusters')
plt.ylabel('Mean silhouette score')
plt.show()

## Agglomerative clustering

In [None]:
# set the number of clusters expected
n_clusters = 4

# build the hierarchy (agglomerative) model and fit it to the scaled data in one step
hierarchy = AgglomerativeClustering(n_clusters=n_clusters).fit(scaled)

# show the fitted estimator as the cell output
hierarchy

In [None]:
# assign the agglomerative clustering labels to the original data:
# the result is a new dataframe whose 'label' column holds hierarchy.labels_
transport_percentage_hierarchy = transport_percentage.assign(label = hierarchy.labels_)

In [None]:
# labels given to each sample by the fitted agglomerative model
hierarchy_labels = hierarchy.labels_

# check the silhouette score of that labelling on the scaled data
metrics.silhouette_score(X=scaled, labels=hierarchy_labels)