# E-commerce Sales Hotspots and Outliers Detection

In [49]:
## for data
import numpy as np
import pandas as pd
## for plotting
import matplotlib.gridspec as gridspec
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px
## for outlier detection
from sklearn.neighbors import LocalOutlierFactor
##for clustering
import hdbscan
from sklearn.cluster import DBSCAN
from sklearn.cluster import OPTICS
## for metrics
from sklearn.neighbors import NearestNeighbors
from sklearn import metrics

In [94]:
# Read the Olist geolocation CSV (relative path, resolved against the working directory).
geolocation = pd.read_csv(filepath_or_buffer='olist_geolocation_dataset.csv')

In [95]:
# State code to analyse. Named STATE_CODE rather than `filter` so the built-in
# filter() is not shadowed for the rest of the notebook.
STATE_CODE = "MT"
# Columns carried forward into the per-state frame.
STATE_COLUMNS = [
    "geolocation_zip_code_prefix",
    "geolocation_city",
    "geolocation_state",
    "geolocation_lng",
    "geolocation_lat",
]
# Rows of the selected state only, re-indexed from 0.
state = (
    geolocation.loc[geolocation["geolocation_state"] == STATE_CODE, STATE_COLUMNS]
    .reset_index(drop=True)
)

# Anomalies

In [52]:
# Flag geographic anomalies: cluster the (lng, lat) points with DBSCAN and treat
# everything outside the dominant cluster as anomalous.
anomaly = DBSCAN(eps=3, min_samples=5).fit(state[['geolocation_lng', 'geolocation_lat']])
anomaly_labels = anomaly.labels_
# DBSCAN numbers clusters in the order they are discovered, so label 0 is not
# guaranteed to be the main cluster. Pick the most populated non-noise cluster
# instead of hard-coding 0 (ties resolve to the lowest label).
cluster_ids, cluster_sizes = np.unique(anomaly_labels[anomaly_labels != -1], return_counts=True)
main_cluster = cluster_ids[cluster_sizes.argmax()] if cluster_ids.size else None
# Noise (-1) and every secondary cluster are marked 'yes', as before.
state['anomaly'] = [
    'no' if main_cluster is not None and label == main_cluster else 'yes'
    for label in anomaly_labels
]

In [53]:
# Scatter of the state's points in lng/lat space, coloured by the anomaly flag.
fig1 = px.scatter(
    data_frame=state,
    x="geolocation_lng",
    y="geolocation_lat",
    color="anomaly",
    labels={"anomaly": "anomaly"},
)
fig1.show()

In [54]:
# Keep only the non-anomalous points. The explicit .copy() makes `state` an
# independent frame, so adding columns to it later does not write into a filtered
# slice (which triggers pandas' SettingWithCopyWarning).
state = state[state['anomaly'] == 'no'].copy()

# Heatmap

In [55]:
# "sales" = number of geolocation rows recorded for each city (the heat-map weight).
# Cities are matched by name only, so counting over the whole `geolocation` table
# would also add rows of any same-named city in another state. Restrict the count
# to the state(s) actually present in `state`.
state_rows = geolocation[geolocation['geolocation_state'].isin(state['geolocation_state'].unique())]
city_counts = state_rows['geolocation_city'].value_counts()
# assign() returns a new frame instead of writing a column into a filtered slice.
state = state.assign(sales=state['geolocation_city'].map(city_counts))
# Point-level snapshot (one row per coordinate), taken before de-duplication.
clean = state.copy()
# One row per city; reassignment instead of inplace=True.
state = state.drop_duplicates(subset=['geolocation_city'])

In [89]:
# Density map of the per-city frame, weighted by the 'sales' column.
fig2 = px.density_mapbox(
    state,
    lat='geolocation_lat',
    lon='geolocation_lng',
    z='sales',
    radius=50,
    zoom=4.4,
    center={'lat': -13, 'lon': -55},
    hover_name="geolocation_city",
    mapbox_style="stamen-toner",
)
fig2.show()

# DBSCAN

In [57]:
# Coordinate matrix (lng, lat) of the point-level frame, fed to every clusterer below.
coord = clean.loc[:, ['geolocation_lng', 'geolocation_lat']]

In [58]:
# Configure DBSCAN first, then fit it on the coordinates (fit() returns the estimator itself).
clustering2 = DBSCAN(min_samples=25, eps=0.51)
clustering2.fit(coord)
def x(label):
    """Return "residue" for the label -1, otherwise the label converted to a string."""
    return str(label) if label != -1 else "residue"
# Store the DBSCAN labels as strings (-1 becomes "residue").
# NOTE(review): the column is called 'clusters_0.01' although the model above uses eps=.51.
clean['clusters_0.01'] = [x(label) for label in clustering2.labels_]

In [59]:
# Plot the DBSCAN clusters without the noise points. Noise was relabelled from -1 to
# "residue" when the column was built, so the previous comparison against "-1"
# never matched and filtered nothing out.
fig3 = px.scatter_mapbox(clean[clean['clusters_0.01'] != "residue"], lat="geolocation_lat", lon="geolocation_lng",
                        color="clusters_0.01", title='DBSCAN 0.5eps',
                        hover_name="geolocation_city", center=dict(lat=-13, lon=-55),zoom=4.5,
                        width=1000, height=500, mapbox_style="stamen-toner")
fig3.show()

# HDBSCAN

In [60]:
# Fit HDBSCAN on the same coordinates and store its labels as strings (-1 becomes "residue").
clusterer = hdbscan.HDBSCAN(gen_min_span_tree=True, min_cluster_size=180)
clusterer.fit(coord)
clean['clusters_HD'] = [x(label) for label in clusterer.labels_]

In [61]:
# Plot the HDBSCAN clusters without the noise points. Noise is stored as "residue"
# (not "-1") in 'clusters_HD', so the previous "-1" comparison filtered nothing out.
fig4 = px.scatter_mapbox(clean[clean['clusters_HD'] != "residue"], lat="geolocation_lat", lon="geolocation_lng",
                        color="clusters_HD", title="HDBSCAN",
                        hover_name="geolocation_city", hover_data=["sales"], zoom=4, center=dict(lat=-13, lon=-55),
                        width=1000, height=500, mapbox_style="stamen-toner")
fig4.show()

# OPTICS

In [72]:
# Fit OPTICS on the same coordinates and store its labels as strings (-1 becomes "residue").
clust = OPTICS(xi=0.03, min_samples=50, min_cluster_size=0.06)
clust.fit(coord)
clean['clusters_opt'] = [x(label) for label in clust.labels_]

In [73]:
# Plot the OPTICS clusters without the noise points. Noise is stored as "residue"
# (not "-1") in 'clusters_opt', so the previous "-1" comparison filtered nothing out.
fig5 = px.scatter_mapbox(clean[clean['clusters_opt'] != "residue"], lat="geolocation_lat", lon="geolocation_lng",
                        color="clusters_opt", title="OPTICS",
                        hover_name="geolocation_city", hover_data=["sales"], zoom=4, center=dict(lat=-13, lon=-55),
                        width=1000, height=500, mapbox_style="stamen-toner")
fig5.show()

# Evaluate models

In [150]:
# Per-model evaluation subsets: drop the points each model labelled as "residue".
clusters_DBSCAN = clean.loc[clean['clusters_0.01'].ne('residue')]
clusters_HDBSCAN = clean.loc[clean['clusters_HD'].ne('residue')]
clusters_OPTICS = clean.loc[clean['clusters_opt'].ne('residue')]

In [151]:
# Silhouette score of each model, computed on its non-residue points.
# NOTE(review): the DBSCAN caption says 0.01eps, but the model was fitted with eps=.51;
# the text is kept as-is so it matches the recorded output.
xy_columns = ['geolocation_lng', 'geolocation_lat']
print("Silhouette Score")
for caption, frame, label_column in (
    ("DBSCAN (0.01eps): ", clusters_DBSCAN, 'clusters_0.01'),
    ("HDBSCAN: ", clusters_HDBSCAN, 'clusters_HD'),
    ("OPTICS: ", clusters_OPTICS, 'clusters_opt'),
):
    print(caption, metrics.silhouette_score(frame[xy_columns], frame[label_column]))

Silhouette Score
DBSCAN (0.01eps):  0.4192988637545242
HDBSCAN:  0.8741467184747194
OPTICS:  0.7161451521719372


In [152]:
# Davies-Bouldin score of each model, computed on its non-residue points.
# NOTE(review): the DBSCAN caption says 0.6eps, but the model was fitted with eps=.51;
# the text is kept as-is so it matches the recorded output.
xy_columns = ['geolocation_lng', 'geolocation_lat']
print("Davies Bouldin Score")
for caption, frame, label_column in (
    ("DBSCAN (0.6eps): ", clusters_DBSCAN, 'clusters_0.01'),
    ("HDBSCAN: ", clusters_HDBSCAN, 'clusters_HD'),
    ("OPTICS: ", clusters_OPTICS, 'clusters_opt'),
):
    print(caption, metrics.davies_bouldin_score(frame[xy_columns], frame[label_column]))

Davies Bouldin Score
DBSCAN (0.6eps):  0.4925354193289456
HDBSCAN:  0.24801193321383944
OPTICS:  0.4880455918122158


# Reachability

In [66]:
# Reachability distances and cluster labels, both re-ordered by OPTICS' ordering_,
# plus a 0..n-1 position axis for plotting.
ordering = clust.ordering_
space = np.arange(len(coord))
reachability = clust.reachability_[ordering]
labels = [x(label) for label in clust.labels_[ordering]]

In [70]:
# Constant 0.51 reference level, one value per plotted position.
line = [0.51 for _ in range(len(space))]
# Reachability plot: position in the OPTICS ordering vs. reachability, coloured by label.
fig6 = px.scatter(x=space, y=reachability, color=labels)

In [71]:
# Overlay the constant 0.51 level as a line on the reachability plot.
eps_trace = go.Scatter(y=line, mode='lines', name='0.51 eps')
fig6.add_trace(eps_trace)

References:

Lawson, A.B. Hotspot detection and clustering: ways and means (2010), Environ Ecol Stat 17 https://doi.org/10.1007/s10651-010-0142-z

Olist. Brazilian E-Commerce Public Dataset by Olist (2018), Kaggle https://www.kaggle.com/olistbr/brazilian-ecommerce



In [24]:
import datapane as dp

In [25]:
import os

# Never hard-code the API token in the notebook: read it from the DATAPANE_TOKEN
# environment variable (a KeyError is raised if it is not set).
# The token that was previously committed in this cell should be treated as
# compromised and revoked.
dp.login(token=os.environ["DATAPANE_TOKEN"])

Connected successfully to https://datapane.com as lfsouza25


'lfsouza25'

In [90]:
# Wrap the sales heat map in a captioned plot block and build a one-element report from it.
heatmap_plot = dp.Plot(fig2, caption="Sales Heatmap")
report = dp.Report(heatmap_plot)

In [91]:
# Publish the report under the name 'Heatmap' with public visibility.
# open=True presumably opens the published report afterwards — confirm against the datapane docs.
report.publish(name='Heatmap', open=True, visibility=dp.Visibility.PUBLIC)

Publishing document and associated data - *please wait...*

Your report doesn't contain any text - did you know you can add text to your report once published?

Your report only contains a single element - did you know you can add multiple plots and tables to a report, add text to it and export directly to Medium once published?

Report successfully published at https://datapane.com/u/lfsouza25/reports/heatmap/ - you can edit and add additional text from the link