# Data analysis
This notebook must NOT modify the given input data (the files under `data/` and `json/`); it should only read them.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import altair as alt
import json

import src.visualize as visualize
import src.analysis as analysis

# Folder names used to build every path in this notebook. Paths are assembled
# by plain string concatenation, so each value keeps its trailing slash.
DATA_DIR = "data/"      # input CSV files
JSON_DIR = "json/"      # GeoJSON map outlines
OUTPUT_DIR = "output/"  # root folder for generated artefacts
HTML_DIR = "html/"      # appended to OUTPUT_DIR when charts are saved

# Classic Jupyter Notebook only (not needed for JupyterLab): run once per
# session to select the notebook renderer.
alt.renderers.enable("notebook")
# max_rows=None lifts Altair's row limit so large frames can be charted.
alt.data_transformers.enable("default", max_rows=None)

In [None]:
# Load the prepared dataset and keep only the columns needed for plotting,
# dropping every row where the address or a coordinate is missing.
geo_columns = ['address', 'lat', 'lng']
df = pd.read_csv(f"{DATA_DIR}final.csv")
geoData = df.loc[:, geo_columns].dropna()
geoData

In [None]:
# Country-level overview: draw all geocoded points on top of the China outline.
with open(f"{JSON_DIR}china.geo.json", encoding="utf-8") as geojson_fh:
    chinaMap = json.load(geojson_fh)

# Altair takes the GeoJSON feature list as inline data.
data_china_geo = alt.Data(values=chinaMap["features"])
china_map = visualize.draw_map(data_china_geo)
vis = visualize.draw_points(geoData, china_map)
vis
# To export the chart, uncomment:
# vis.save(OUTPUT_DIR + HTML_DIR + 'china.html')

In [None]:
# Province-level detail: load the Guangdong GeoJSON and let the analysis helper
# derive the Guangdong subset of the points from it.
with open(f"{JSON_DIR}guangdong.json", encoding="utf-8") as geojson_fh:
    guangDong = json.load(geojson_fh)

guangDongData = analysis.extract_map_from_geojson(geoData, guangDong)

In [None]:
# Template cell: extract and plot the data of one or more cities.
# Names that can be listed in specificLocations:
#   '清远市', '韶关市', '湛江市', '梅州市', '河源市', '肇庆市', '惠州市',
#   '茂名市', '江门市', '阳江市', '云浮市', '广州市', '汕尾市', '揭阳市',
#   '珠海市', '佛山市', '潮州市', '汕头市', '东莞市', '中山市', '深圳市'
specificLocations = ['深圳市']

# The helper returns the matching points and the matching GeoJSON subset.
city_result = analysis.get_cities_data(geoData, specificLocations, guangDong)
citiesData, cities = city_result

data_geo = alt.Data(values=cities["features"])
citiesMap = visualize.draw_map(data_geo)
vis = visualize.draw_points(citiesData, citiesMap)
vis

In [None]:
# Draw the Guangdong points on the province outline, write the chart to an
# HTML file, and display it inline.
data_geo = alt.Data(values=guangDong["features"])
guangdong_map = visualize.draw_map(data_geo)
vis = visualize.draw_points(guangDongData, guangdong_map)

guangdong_html = f"{OUTPUT_DIR}{HTML_DIR}guangdong.html"
vis.save(guangdong_html)
vis

In [None]:
# From here on work with the Guangdong subset only.
# Take an explicit copy: later cells add a `cluster_id` column to geoData, and
# with a plain assignment that write would also land on guangDongData (both
# names would refer to the same object), which is still used for the heatmap
# export. The copy keeps guangDongData unmodified.
geoData = guangDongData.copy()

# Augment data (disabled): repeat two fixed points many times in the dataset.
# fixed_cluster = [['Fixed cluster 1', 23.186364, 113.418813], ['Fixed cluster 2', 22.569243, 114.108307]]
# fixedDf = pd.DataFrame(fixed_cluster, columns=['address', 'lat', 'lng'])
# geoData = geoData.append([fixedDf]*2000,ignore_index=True)

# Plain (lat, lng) array consumed by the clustering cells below.
coords = geoData[['lat', 'lng']].values
coords

## Heatmap

In [None]:
# Export the Guangdong points for the website's heatmap.
analysis.outputHeatmap(guangDongData, filename="heatmap.csv")

# Build the heatmap in Python as well and write it to disk.
# View the result in Chrome; Edge won't work here.
# Open 'heatmap.html' to see it: the heatmap cannot be shown inline when the
# dataset is too large, which is why the `hmap` display line stays disabled.
heatmap_html = f"{OUTPUT_DIR}{HTML_DIR}heatmap.html"
hmap = visualize.draw_heatmap(geoData)
# hmap
hmap.save(heatmap_html)

## DBSCAN

In [None]:
# DBSCAN run (currently disabled; every line is commented out).
# When re-enabled it would cluster `coords` through analysis.dbscan, store the
# returned labels in geoData['cluster_id'], and count the rows per cluster.
# labels, cluster_pos = analysis.dbscan(coords, km=50, num_of_pts=100, plot=True)
# geoData['cluster_id'] = labels
# clusterSize = geoData.groupby('cluster_id').size().reset_index(name='count')

In [None]:
# DBSCAN visualisation (currently disabled; every line is commented out).
# It would build a table of cluster centres, a bar chart of cluster sizes, and
# a cluster map stacked above that bar chart.
# NOTE(review): the encodings below reference a `cluster` field, while the
# clusterSize frame from the disabled DBSCAN cell would carry `cluster_id` and
# `count` -- rename the field before re-enabling this cell.
# centersDf = pd.DataFrame(cluster_pos, columns=['lat', 'lng'])
# centersDf = centersDf.reset_index().rename(columns={'index': 'cluster_id'})

# clusterBarChart = alt.Chart(clusterSize).mark_bar().encode(
#     x='cluster:Q',
#     y='count:Q',
#     color=alt.Color('cluster:N', scale=alt.Scale(scheme='category20')),
# )

# vis = visualize.draw_cluster(geoData, centersDf, guangdong_map)
# vis & clusterBarChart

## K-means clustering

In [None]:
# K-means over the (lat, lng) pairs in `coords`. Two fixed positions are handed
# to the helper through its fixCluster argument.
fixed_cluster = np.array([
    [23.186364, 113.418813],
    [22.569243, 114.108307],
])
n_clusters = 17
labels, cluster_pos = analysis.k_means(
    coords,
    n_clusters=n_clusters,
    max_iter=1000,
    n_init=50,
    fixCluster=fixed_cluster,
)

# Tag every row with its cluster label, then count the rows per cluster.
geoData['cluster_id'] = labels
points_per_cluster = geoData.groupby('cluster_id').size()
clusterSize = points_per_cluster.reset_index(name='count')

In [None]:
# Turn the k-means centres into a table keyed by cluster id: reset_index()
# exposes the row position as an `index` column, which is renamed to
# `cluster_id`.
centersDf = pd.DataFrame(cluster_pos, columns=['lat', 'lng'])
centersDf = centersDf.reset_index().rename(columns={'index': 'cluster_id'})

# Bar chart of the number of points per cluster.
# clusterSize comes from groupby('cluster_id').size().reset_index(name='count'),
# so its columns are `cluster_id` and `count`. The encodings must therefore
# reference `cluster_id`; a `cluster` field does not exist in that frame and
# would leave the x and colour channels without data.
clusterBarChart = alt.Chart(clusterSize).mark_bar().encode(
    x='cluster_id:Q',
    y='count:Q',
    color=alt.Color('cluster_id:N', scale=alt.Scale(scheme='category20')),
)

# Cluster map stacked vertically (`&`) above the bar chart.
vis = visualize.draw_cluster(geoData, centersDf, guangdong_map)
vis = vis & clusterBarChart
vis.save(OUTPUT_DIR + HTML_DIR + 'kmean.html')

# Save
analysis.exportResult(geoData, cluster_pos, resultFileName='kmeans.csv', clusterFileName='cluster.csv')

# Plot
vis

In [None]:
# Zoom in on selected cities and show only a chosen subset of cluster centres.
specificLocations = ['深圳市']
specificCluster = [1, 5, 11]

city_result = analysis.get_cities_data(geoData, specificLocations, guangDong)
citiesData, cities = city_result

data_geo = alt.Data(values=cities["features"])
citiesMap = visualize.draw_map(data_geo)

# Keep only the centres whose id is listed in specificCluster.
is_selected = centersDf['cluster_id'].isin(specificCluster)
vis = visualize.draw_cluster(citiesData, centersDf[is_selected], citiesMap)
vis

## Phase 2

In [None]:
# Phase 2: run the sub-clustering helper on the current clusters.
# method can be k_means, dbscan, and aggClustering
labels, cluster_pos = analysis.subClustering(geoData, cluster_pos, method=analysis.k_means, avg_size=1000)
geoData['cluster_id'] = labels

# Count points per cluster, leaving out rows with a negative label.
clusterSize = geoData[geoData['cluster_id'] >= 0].groupby('cluster_id').size().reset_index(name='count')

# clusterSize has the columns `cluster_id` and `count`, so the encodings must
# reference `cluster_id`; a `cluster` field does not exist in that frame and
# would leave the x and colour channels without data.
clusterBarChart = alt.Chart(clusterSize).mark_bar().encode(
    x='cluster_id:Q',
    y='count:Q',
    color=alt.Color('cluster_id:N', scale=alt.Scale(scheme='category20')),
)

# Table of the new centres keyed by cluster id (row position -> `cluster_id`).
centersDf = pd.DataFrame(cluster_pos, columns=['lat', 'lng'])
centersDf = centersDf.reset_index().rename(columns={'index': 'cluster_id'})

# Cluster map stacked vertically (`&`) above the bar chart.
vis = visualize.draw_cluster(geoData, centersDf, guangdong_map)
vis = vis & clusterBarChart
vis.save(OUTPUT_DIR + HTML_DIR + 'kmean-phase2.html')

# Save
analysis.exportResult(geoData, cluster_pos, resultFileName='kmeans2.csv', clusterFileName='cluster2.csv')

# Plot
vis

## Hierarchical clustering

In [None]:
# Disabled experiment (every line is commented out): a linear mapping from
# lng/lat to pixel coordinates, written first as pseudo-code and then as a
# Python one-liner that would overwrite `coords` with the mapped values.
# NOTE(review): MAP_WIDTH and MAP_HEIGHT are not defined in the visible cells;
# they must be supplied before the last line is re-enabled.
# int x =  (int) ((MAP_WIDTH/360.0) * (180 + lon))
# int y =  (int) ((MAP_HEIGHT/180.0) * (90 - lat))

# coords = np.array(list(map(lambda pt: [(MAP_HEIGHT/180.0) * (90 - pt[0]), (MAP_WIDTH/360.0) * (180 + pt[1])], coords)))

In [None]:
# Hierarchical clustering of `coords` through analysis.aggClustering; the
# returned labels replace the cluster ids stored on geoData.
labels, cluster_pos = analysis.aggClustering(coords, n_clusters=15)
geoData['cluster_id'] = labels

# Table of the centres keyed by cluster id (row position -> `cluster_id`).
centersDf = (
    pd.DataFrame(cluster_pos, columns=['lat', 'lng'])
    .reset_index()
    .rename(columns={'index': 'cluster_id'})
)
vis = visualize.draw_cluster(geoData, centersDf, guangdong_map)

# Write the labelled points and the centres through the export helper.
analysis.exportResult(
    geoData,
    cluster_pos,
    resultFileName='hierarchical.csv',
    clusterFileName='hierarchical_cluster2.csv',
)

# Show the chart inline.
vis