# Weather Station Clustering using DBSCAN

<b> DBSCAN ( Density-based spatial clustering of applications with noise ) </b> is especially very good for tasks like class identification on a spatial context. The wonderful attribute of DBSCAN algorithm is that it can find out any arbitrary shape cluster without getting affected by noise. For example, this following example cluster the location of weather stations in Canada.DBSCAN can be used here, for instance, to find the group of stations which show the same weather condition. As you can see, it not only finds different arbitrary shaped clusters but can find the denser part of data-centred samples by ignoring less-dense areas or noises. 

<b> About the DataSet </b>
<h4 align = "center"> Environment Canada
Monthly Values for July - 2015
</h4>

Name in the table Meaning Prov Province Tm Mean Temperature (°C) DwTm Days without Valid Mean Temperature D Mean Temperature difference from Normal (1981-2010) (°C) DwTx Days without Valid Maximum Temperature DwTn Days without Valid Minimum Temperature S Snowfall (cm) DwS Days without Valid Snowfall S%N Percent of Normal (1981-2010) Snowfall DwP Days without Valid Precipitation P%N Percent of Normal (1981-2010) Precipitation S_G Snow on the ground at the end of the month (cm) Pd Number of days with Precipitation 1.0 mm or more BS Bright Sunshine (hours) DwBS Days without Valid Bright Sunshine BS% Percent of Normal (1981-2010) Bright Sunshine HDD Degree Days below 18 °C CDD Degree Days above 18 °C Stn_No Climate station identifier (first 3 digits indicate drainage basin, last 4 characters are for sorting alphabetically). NA Not Available

In [128]:
import numpy as np
import pandas as pd
csv_file = 'weather-stations20140101-20141231.csv'
df = pd.read_csv(csv_file)
df

Unnamed: 0,Stn_Name,Lat,Long,Prov,Tm,DwTm,D,Tx,DwTx,Tn,...,DwP,P%N,S_G,Pd,BS,DwBS,BS%,HDD,CDD,Stn_No
0,CHEMAINUS,48.935,-123.742,BC,8.2,0.0,,13.5,0.0,1.0,...,0.0,,0.0,12.0,,,,273.3,0.0,1011500
1,COWICHAN LAKE FORESTRY,48.824,-124.133,BC,7.0,0.0,3.0,15.0,0.0,-3.0,...,0.0,104.0,0.0,12.0,,,,307.0,0.0,1012040
2,LAKE COWICHAN,48.829,-124.052,BC,6.8,13.0,2.8,16.0,9.0,-2.5,...,9.0,,,11.0,,,,168.1,0.0,1012055
3,DISCOVERY ISLAND,48.425,-123.226,BC,,,,12.5,0.0,,...,,,,,,,,,,1012475
4,DUNCAN KELVIN CREEK,48.735,-123.728,BC,7.7,2.0,3.4,14.5,2.0,-1.0,...,2.0,,,11.0,,,,267.7,0.0,1012573
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,NAIN A,56.550,-61.683,NL,-22.6,0.0,-5.2,-6.8,0.0,-33.5,...,0.0,66.0,74.0,5.0,,,,1136.5,0.0,8502800
1337,NAIN A,56.551,-61.682,NL,-19.2,24.0,,-7.5,17.0,,...,17.0,,,4.0,,,,148.7,0.0,8502801
1338,SAGLEK,58.333,-62.586,NL,-24.4,2.0,,-13.5,1.0,-32.3,...,,,,,,,,1101.2,0.0,8503249
1339,TUKIALIK BAY,54.716,-58.358,NL,-22.8,2.0,,-5.8,1.0,-32.5,...,,,,,,,,1060.0,0.0,8503992


### Cleaning
Now lets remove rows that don't have any value in the TM field

In [129]:
df = df.dropna(subset=['Tm'],axis=0)
df

Unnamed: 0,Stn_Name,Lat,Long,Prov,Tm,DwTm,D,Tx,DwTx,Tn,...,DwP,P%N,S_G,Pd,BS,DwBS,BS%,HDD,CDD,Stn_No
0,CHEMAINUS,48.935,-123.742,BC,8.2,0.0,,13.5,0.0,1.0,...,0.0,,0.0,12.0,,,,273.3,0.0,1011500
1,COWICHAN LAKE FORESTRY,48.824,-124.133,BC,7.0,0.0,3.0,15.0,0.0,-3.0,...,0.0,104.0,0.0,12.0,,,,307.0,0.0,1012040
2,LAKE COWICHAN,48.829,-124.052,BC,6.8,13.0,2.8,16.0,9.0,-2.5,...,9.0,,,11.0,,,,168.1,0.0,1012055
4,DUNCAN KELVIN CREEK,48.735,-123.728,BC,7.7,2.0,3.4,14.5,2.0,-1.0,...,2.0,,,11.0,,,,267.7,0.0,1012573
5,ESQUIMALT HARBOUR,48.432,-123.439,BC,8.8,0.0,,13.1,0.0,1.9,...,8.0,,,12.0,,,,258.6,0.0,1012710
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1336,NAIN A,56.550,-61.683,NL,-22.6,0.0,-5.2,-6.8,0.0,-33.5,...,0.0,66.0,74.0,5.0,,,,1136.5,0.0,8502800
1337,NAIN A,56.551,-61.682,NL,-19.2,24.0,,-7.5,17.0,,...,17.0,,,4.0,,,,148.7,0.0,8502801
1338,SAGLEK,58.333,-62.586,NL,-24.4,2.0,,-13.5,1.0,-32.3,...,,,,,,,,1101.2,0.0,8503249
1339,TUKIALIK BAY,54.716,-58.358,NL,-22.8,2.0,,-5.8,1.0,-32.5,...,,,,,,,,1060.0,0.0,8503992


### Visualization
Visualization of stations on map

In [130]:
import folium

In [131]:
map_can = folium.Map(
    location = [51.507351, -0.127758],
    zoom_start = 4
)
for lat,lng in zip(df['Lat'][0:1000],df['Long'][0:1000]):
    folium.CircleMarker(
    [lat,lng],
    radius=1,
    fill = True,
    color = 'red'
    ).add_to(map_can)


In [132]:
map_can

### a. Clustering of stations based on their location i-e Latitude and Longitude

In [133]:
from sklearn.cluster import  DBSCAN
from sklearn.preprocessing import StandardScaler

In [134]:
Cluster = df[['Lat','Long']]
Cluster = StandardScaler().fit_transform(Cluster)
Cluster

array([[-0.38421049, -1.14248704],
       [-0.40207206, -1.15922563],
       [-0.40126748, -1.15575805],
       ...,
       [ 1.12806886,  1.4755829 ],
       [ 0.54603924,  1.65658231],
       [ 0.25751861,  1.2924002 ]])

In [135]:
#Computer DBSCAN
db = DBSCAN(eps=0.15,min_samples = 10)
db.fit(Cluster)

DBSCAN(algorithm='auto', eps=0.15, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [136]:
db.labels_

array([ 0,  0,  0, ..., -1, -1, -1], dtype=int64)

In [137]:
set(db.labels_)

{-1, 0, 1, 2, 3}

In [138]:
core_samples_mask = np.zeros_like(db.labels_,dtype = bool)

In [139]:
core_samples_mask

array([False, False, False, ..., False, False, False])

In [140]:
core_samples_mask[db.core_sample_indices_] = True

In [141]:
db.core_sample_indices_

array([   0,    1,    2,    3,    4,    5,    6,    7,    8,    9,   10,
         11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
         22,   23,   24,   25,   26,   27,   28,   29,   30,   31,   32,
         33,   34,   35,   36,   37,   38,   39,   40,   41,   42,   43,
         44,   45,   46,   47,   48,   49,   50,   51,   52,   53,   54,
         55,   56,   57,   58,   59,   60,   61,   62,   63,   64,   65,
         66,   67,   68,   69,   70,   71,   74,   75,   76,   77,   81,
         85,   86,   91,   92,   93,   94,   96,   97,   98,   99,  100,
        102,  103,  104,  105,  106,  107,  108,  109,  110,  111,  113,
        114,  115,  116,  117,  118,  119,  120,  121,  122,  123,  124,
        125,  126,  127,  128,  129,  130,  131,  132,  133,  134,  135,
        136,  137,  138,  139,  140,  141,  142,  143,  144,  145,  146,
        147,  148,  149,  150,  151,  152,  153,  154,  155,  156,  157,
        158,  159,  160,  161,  162,  163,  164,  1

In [142]:
core_samples_mask

array([ True,  True,  True, ..., False, False, False])

In [143]:
df['Labels'] = db.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [144]:
df['Labels']

0       0
1       0
2       0
4       0
5       0
       ..
1336   -1
1337   -1
1338   -1
1339   -1
1340   -1
Name: Labels, Length: 1256, dtype: int64

In [147]:
df[['Stn_Name','Tx','Tm','Labels']].head()

Unnamed: 0,Stn_Name,Tx,Tm,Labels
0,CHEMAINUS,13.5,8.2,0
1,COWICHAN LAKE FORESTRY,15.0,7.0,0
2,LAKE COWICHAN,16.0,6.8,0
4,DUNCAN KELVIN CREEK,14.5,7.7,0
5,ESQUIMALT HARBOUR,13.1,8.8,0


In [148]:
df['Labels'].unique()

array([ 0, -1,  1,  2,  3], dtype=int64)


As you can see for outliers, the cluster label is -1

### Visualization of Clusters based on Location

In [149]:
map_can = folium.Map(
    location = [56,-106],
    zoom_start = 4
)

In [150]:
color_map  = {0:'red',1:'blue',2:'green',3:'yellow',-1:'gray'}

In [154]:
for lat,lng,lab in zip(df['Lat'],df['Long'],df['Labels']):
    label = folium.Popup('Cluster:'+str(lab),parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        radius = 1,
        fill = True,
        color = color_map[lab],
        popup= label
    ).add_to(map_can)

In [155]:
map_can

### b. Clustering of stations based on their location, mean, max, and min Temperature

In [156]:
Cluster = df[['Lat','Long','Tx','Tm','Tn']]

In [157]:
Cluster = np.nan_to_num(Cluster)
Cluster = StandardScaler().fit_transform(Cluster)
Cluster

array([[-0.38421049, -1.14248704,  1.2287316 ,  1.94601574,  2.16527732],
       [-0.40207206, -1.15922563,  1.39822298,  1.83076653,  1.84789839],
       [-0.40126748, -1.15575805,  1.51121724,  1.81155832,  1.88757076],
       ...,
       [ 1.12806886,  1.4755829 , -1.82211339, -1.1849213 , -0.47690227],
       [ 0.54603924,  1.65658231, -0.9520576 , -1.03125568, -0.49277122],
       [ 0.25751861,  1.2924002 , -1.83341282, -1.50185664, -1.62740089]])

In [159]:
#Compute DBSCAN
db = DBSCAN(eps = 0.3,min_samples= 10)
db.fit(Cluster)

DBSCAN(algorithm='auto', eps=0.3, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [163]:
core_samples_mask = np.zeros_like(db.labels_,dtype= bool)
core_samples_mask[db.core_sample_indices_] = True

In [164]:
df['Labels'] = db.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [165]:
df[["Stn_Name","Tx","Tm","Tn","Labels"]].head()

Unnamed: 0,Stn_Name,Tx,Tm,Tn,Labels
0,CHEMAINUS,13.5,8.2,1.0,0
1,COWICHAN LAKE FORESTRY,15.0,7.0,-3.0,0
2,LAKE COWICHAN,16.0,6.8,-2.5,0
4,DUNCAN KELVIN CREEK,14.5,7.7,-1.0,0
5,ESQUIMALT HARBOUR,13.1,8.8,1.9,0


In [166]:
df['Labels'].unique()


array([ 0, -1,  2,  1,  3,  4,  5,  6,  7], dtype=int64)

In [167]:
df.columns

Index(['Stn_Name', 'Lat', 'Long', 'Prov', 'Tm', 'DwTm', 'D', 'Tx', 'DwTx',
       'Tn', 'DwTn', 'S', 'DwS', 'S%N', 'P', 'DwP', 'P%N', 'S_G', 'Pd', 'BS',
       'DwBS', 'BS%', 'HDD', 'CDD', 'Stn_No', 'Labels'],
      dtype='object')

### C. Visualization Based on Location and Temperature

In [168]:
map_can = folium.Map(
    location=[56,-106],
    zoom_start = 4
)

color_map = {0:'red',1:'blue',2:'green',3:'yellow',4:'orange',-1:'gray',
             5:'pink',6:'purple',7:'brown'
            }

for lat,lng,lbl in zip(df['Lat'],df['Long'],df['Labels']):
    label = folium.Popup('Cluster : '+str(lbl),parse_html=True)
    folium.CircleMarker(
        [lat,lng],
        color=color_map[lbl],
        fill=True,
        radius=1,
        popup=label
    ).add_to(map_can)

map_can

In [175]:
Cluster = df[['Lat','Long','Tm']]
Cluster = StandardScaler().fit_transform(Cluster)
Cluster

array([[-0.38421049, -1.14248704,  1.94601574],
       [-0.40207206, -1.15922563,  1.83076653],
       [-0.40126748, -1.15575805,  1.81155832],
       ...,
       [ 1.12806886,  1.4755829 , -1.1849213 ],
       [ 0.54603924,  1.65658231, -1.03125568],
       [ 0.25751861,  1.2924002 , -1.50185664]])

In [178]:
db = DBSCAN(eps=0.6,min_samples=10)


In [180]:
db.fit(Cluster)

DBSCAN(algorithm='auto', eps=0.6, leaf_size=30, metric='euclidean',
       metric_params=None, min_samples=10, n_jobs=None, p=None)

In [181]:
core_samples_mask = np.zeros_like(db.labels_,dtype = bool)
core_samples_mask[db.core_sample_indices_] = True

In [182]:
core_samples_mask

array([ True,  True,  True, ..., False,  True,  True])

In [183]:
df['Labels'] = db.labels_

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [184]:
df['Labels']

0       0
1       0
2       0
4       0
5       0
       ..
1336    0
1337    0
1338    0
1339    0
1340    0
Name: Labels, Length: 1256, dtype: int64

In [185]:
df.Labels.unique()

array([ 0, -1,  1], dtype=int64)