<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# DBSCAN Practice

---

You're now familiar with how DBSCAN works. Let's practice it in sklearn.

In [1]:
import pandas as pd
import numpy as np

from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import DistanceMetric

from sklearn import cluster
from sklearn.cluster import DBSCAN
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist, squareform

from sklearn import datasets
from math import radians, cos, sin, asin, sqrt

import matplotlib.pyplot as plt
import seaborn as sns

plt.style.use('ggplot')
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

## UK postcodes

Now we will do clustering on UK postcodes based on their coordinates. Adjust the parameters of DBSCAN - different values will allow you to see different structures. What would you expect to see?

### 1. Read in the following CSV file containing the UK outward postcodes (outcodes) together with their longitude and latitude coordinates

In [2]:
# A plain `pd.read_csv(url)` on this address was rejected with
# "HTTP Error 403: Forbidden".  Servers often refuse urllib's default
# User-Agent, so the file is requested with a browser-like header and then
# parsed from memory.
# NOTE(review): assumes the 403 is User-Agent based — if the site still
# refuses, download the CSV by hand and pass the local path to pd.read_csv.
import io
from urllib.request import Request, urlopen

postcode_url = (
    'https://www.freemaptools.com/download/outcode-postcodes/postcode-outcodes.csv')
postcode_request = Request(postcode_url, headers={'User-Agent': 'Mozilla/5.0'})
with urlopen(postcode_request) as response:
    X = pd.read_csv(io.BytesIO(response.read()))

HTTPError: HTTP Error 403: Forbidden

In [None]:
# Remove, in place, every row whose latitude or longitude is exactly 0
# (presumably placeholder coordinates — confirm against the data source).
has_zero_coordinate = (X.latitude == 0) | (X.longitude == 0)
X.drop(X.index[has_zero_coordinate], inplace=True)

### 2. Load the following function. It calculates the great-circle distance in kilometres between any two points on the Earth's surface, each given as a (latitude, longitude) pair in degrees.

In [None]:
def haversine(lonlat1, lonlat2):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees).

    Despite the parameter names, each point is unpacked as a
    ``(latitude, longitude)`` pair, in that order.

    Parameters
    ----------
    lonlat1, lonlat2 : sequence of two numbers
        ``(latitude, longitude)`` of each point, in decimal degrees.

    Returns
    -------
    float
        Distance in kilometres on a sphere of radius 6371 km.
    """
    # convert decimal degrees to radians
    lat1, lon1 = lonlat1
    lat2, lon2 = lonlat2
    lon1, lat1, lon2, lat2 = map(radians, (lon1, lat1, lon2, lat2))

    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2.)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2.)**2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points, which would make asin() raise "math domain error"; clamp it.
    a = min(1.0, a)
    c = 2 * asin(sqrt(a))
    r = 6371  # Radius of earth in kilometers. Use 3956 for miles
    return c * r

In [None]:
# (rows, columns) of the table after the zero-coordinate rows were dropped.
X.shape

### 3. Calculate the distance matrix between each pair of points

In [None]:
# Condensed (1-D) vector of pairwise great-circle distances in km.
# The columns are passed as (latitude, longitude) because that is the order
# in which haversine() unpacks each point.  The function is handed to pdist
# directly: wrapping it in a lambda only adds one extra call per pair.
my_metric = pdist(X.loc[:, ['latitude', 'longitude']], haversine)

In [None]:
# Expand the condensed pdist vector into the full square distance matrix.
distance_matrix = squareform(my_metric)

In [None]:
# Preview the first rows of the postcode table.
X.head()

In [None]:
# The square matrix has one row and one column per row of X.
print(distance_matrix.shape)

In [None]:
# distribution of mean distances from any given point
plt.hist(distance_matrix.mean(axis=1), bins=np.logspace(2, 2.9, 20))
plt.show()

In [None]:
# Histogram of the standard deviation of each point's distances, using
# 20 log-spaced bin edges from 10**2 to 10**2.4.
std_distance_per_point = distance_matrix.std(axis=1)
std_bin_edges = np.logspace(2, 2.4, 20)
plt.hist(std_distance_per_point, bins=std_bin_edges)
plt.show()

### 4. Do DBSCAN clustering on the distance matrix with the precomputed metric. Adjust the values of eps and min_samples to obtain a suitable number of clusters

In [None]:
# Cluster on the precomputed distance matrix.  eps is expressed in the same
# units as the matrix entries; min_samples is the neighbourhood size needed
# for a point to count as a core point.
db = DBSCAN(metric='precomputed', eps=10, min_samples=20)
db.fit(distance_matrix)  # do your fit on the distance matrix
y_db = db.labels_

In [None]:
# Store the DBSCAN label of every row (DBSCAN marks noise points with -1).
X['cluster'] = y_db

In [None]:
# Preview the table with the new 'cluster' column.
X.head()

In [None]:
# NOTE(review): this assigns the 'cluster' column to itself, so the frame is
# unchanged — looks like a leftover cell; confirm before deleting.
X['cluster'] = X.cluster

In [None]:
# Number of distinct labels (any -1 noise label is counted too), followed by
# the size of each label group.
distinct_labels = X['cluster'].unique()
print(distinct_labels.size)
X['cluster'].value_counts()

In [None]:
# Work on a copy so the extra cluster columns added later do not touch X.
X_postcode = X.copy()

### 5. Produce a scatter plot of the angular variables (longitude and latitude) with points coloured according to their cluster label

In [None]:
from matplotlib import cm  # kept: later cells still refer to `cm`

# Count the clusters from the labels instead of hard-coding the number, so
# the title stays correct when eps / min_samples are changed.  DBSCAN labels
# noise points -1, which is not a cluster.
n_dbscan_clusters = X.loc[X['cluster'] != -1, 'cluster'].nunique()

plt.figure(figsize=(12, 16))
# The colormap is passed by name: cm.get_cmap() is no longer available in
# recent matplotlib releases, while the name works in old and new ones.
plt.scatter(X['longitude'], X['latitude'], c=X['cluster'],
            cmap='rainbow', s=40)
plt.ylim([50, 60])
plt.xlabel('Longitude', fontsize=24)
plt.ylabel('Latitude', fontsize=24)
plt.title('DBScan clustering ({} clusters)'.format(n_dbscan_clusters),
          fontsize=24)
plt.show()

## Bonus:

### Try k-means and hierarchical clustering on the given datasets. What differences do you observe? Which algorithm do you find most suitable in each case?

In [None]:
from sklearn.cluster import KMeans, AgglomerativeClustering

### K-means and hierarchical clustering on the postcode dataset

In [None]:
# K-means with 14 clusters.  fit() receives the full distance matrix, so each
# row (one point's distances to every point) serves as that point's feature
# vector.
k = 14
kmeans = KMeans(n_clusters=k).fit(distance_matrix)
labels = kmeans.labels_

In [None]:
# Attach the k-means labels to the postcode copy and show each cluster's size.
X_postcode['cluster_kmeans'] = labels
X_postcode.cluster_kmeans.value_counts()

In [None]:
# The count in the title comes from the labels actually stored on the frame,
# so it stays right even if 'cluster_kmeans' was refitted with another k.
n_kmeans_clusters = X_postcode['cluster_kmeans'].nunique()

plt.figure(figsize=(7, 9))
# The colormap is passed by name: cm.get_cmap() is no longer available in
# recent matplotlib releases, while the name works in old and new ones.
plt.scatter(X_postcode['longitude'], X_postcode['latitude'],
            c=X_postcode['cluster_kmeans'],
            cmap='rainbow', s=10)
plt.ylim([50, 60])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('K-Means clustering ({} clusters)'.format(n_kmeans_clusters))
plt.show()

In [None]:
# Same k-means experiment with a much finer partition (60 clusters); the
# labels overwrite the previous 'cluster_kmeans' column.
k = 60
kmeans = KMeans(n_clusters=k)
kmeans.fit(distance_matrix)
labels = kmeans.labels_
X_postcode['cluster_kmeans'] = labels
plt.figure(figsize=(7, 9))
# The colormap is passed by name: cm.get_cmap() is no longer available in
# recent matplotlib releases, while the name works in old and new ones.
plt.scatter(X_postcode['longitude'], X_postcode['latitude'],
            c=X_postcode['cluster_kmeans'],
            cmap='rainbow', s=10)
plt.ylim([50, 60])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
# Title is built from k so it cannot drift from the value used for the fit.
plt.title('K-Means clustering ({} clusters)'.format(k))
plt.show()

In [None]:
# Average-linkage agglomerative clustering on the precomputed distances.
# scikit-learn renamed the `affinity` keyword to `metric` and later removed
# the old name, so try the new spelling first and fall back for older
# installs (an unknown keyword raises TypeError in the constructor).
hier_options = dict(n_clusters=14, linkage='average')
try:
    agg = cluster.AgglomerativeClustering(metric='precomputed',
                                          **hier_options)
except TypeError:
    agg = cluster.AgglomerativeClustering(affinity='precomputed',
                                          **hier_options)
cluster_hiersk = agg.fit_predict(distance_matrix)
X_postcode['cluster_hiersk'] = cluster_hiersk
print(X_postcode.cluster_hiersk.value_counts())

In [None]:
# The count in the title is taken from the labels being plotted.
n_hier_clusters = len(np.unique(cluster_hiersk))

plt.figure(figsize=(7, 9))
# The colormap is passed by name: cm.get_cmap() is no longer available in
# recent matplotlib releases, while the name works in old and new ones.
plt.scatter(X_postcode['longitude'], X_postcode['latitude'],
            c=cluster_hiersk,
            cmap='rainbow', s=10)
plt.ylim([50, 60])
plt.xlabel('Longitude')
plt.ylabel('Latitude')
plt.title('Hierarchical clustering with sklearn ({} clusters)'.format(
    n_hier_clusters))
plt.show()

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, cophenet, fcluster
from scipy.spatial.distance import pdist

In [None]:
# scipy needs flattened upper triangular part of the distance matrix
Z = linkage(distance_matrix[np.triu_indices(
    distance_matrix.shape[1], k=1)], 'average')
c, coph_dists = cophenet(Z, my_metric)

print(c)
print(len(coph_dists))
print(coph_dists.mean())

In [None]:
# Draw only the top levels of the tree (truncate_mode='level', p=4).
dendrogram_options = {
    'truncate_mode': 'level',
    'p': 4,
    'show_leaf_counts': True,
    'leaf_rotation': 90.,
    'leaf_font_size': 12.,
    'show_contracted': True,
    'color_threshold': 200,
}

plt.title('Truncated Dendrogram')
plt.xlabel('Index Numbers')
plt.ylabel('Distance')
dendrogram(Z, **dendrogram_options)
plt.show()

In [None]:
# Cut the tree into flat clusters, asking for at most 14 of them, and report
# how many distinct labels came out.
clusters = fcluster(Z, t=14, criterion='maxclust')
print(np.unique(clusters).size)