In [None]:
import numpy as np
import pywt
import pywt.data
from skimage import data, color
from skimage.transform import rescale, resize, downscale_local_mean
import itertools as it
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.gridspec as gridspec
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import os 
from PIL import Image
Image.MAX_IMAGE_PIXELS = 900_000_000
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
from sklearn.datasets import load_iris, load_wine
from sklearn.cluster import DBSCAN
import seaborn as sns
from collections import Counter
import zipfile 
import urllib.request
import random
import functools as fct
import math as mt
import rasterio
import rasterio.features
import rasterio.warp

In [None]:
# Load band 1 of the raster at ../zaf_ppp_2019.tif into memory as `band1`.
# The context manager closes the file handle as soon as the pixels are read,
# so the notebook no longer leaks an open dataset; `dataset` remains bound
# (in its closed state) after the block.
with rasterio.open('../zaf_ppp_2019.tif') as dataset:
    band1 = dataset.read(1)

In [None]:
# Tally how many times each distinct pixel value occurs in the first raster band.
Counter(band1.flatten())

In [None]:
# ACQUIRE COUNTRY-WIDE POPULATION-DENSITY MAP.
# The read-a-CSV-straight-out-of-a-zip approach follows this Analytics Vidhya forum thread:
#         https://discuss.analyticsvidhya.com/t/how-to-read-zip-file-directly-in-python/1659
#
# Step 1: download the zipped CSV to the working directory as sa_pop.zip.
urllib.request.urlretrieve('https://data.humdata.org/dataset/cbfc4206-35c8-42d4-a096-b2dd0aec983d/resource/1b7e9361-651a-4bf2-839b-eaf5c5f45ec1/download/population_zaf_2018-10-01.csv.zip',
                           'sa_pop.zip')
# Step 2: parse the CSV member without extracting it to disk. The context managers
# close both the archive and the member handle once pandas has finished reading.
with zipfile.ZipFile('sa_pop.zip') as sa_pop_zip, \
     sa_pop_zip.open('population_zaf_2018-10-01.csv') as sa_pop_member:
    sa_pop_csv = pd.read_csv(filepath_or_buffer = sa_pop_member)

In [None]:
# Summary statistics of the country-wide table, transposed so each source column is a row.
sa_pop_csv.describe().transpose()

In [None]:
# Restrict the country-wide table to the bounding box Lat in [-26.2, -26.1], Lon in [27.9, 28.1]
# (presumably Johannesburg, going by the variable name) and save the subset as a CSV.
jberg_pop = sa_pop_csv.loc[ sa_pop_csv['Lat'].between(left = -26.2, right = -26.1)
                            & sa_pop_csv['Lon'].between(left = 27.9, right = 28.1)]

# Create the output folder first: to_csv does not create missing directories,
# so a fresh checkout without ./Data would otherwise fail here.
os.makedirs('./Data', exist_ok = True)
jberg_pop.to_csv(path_or_buf = './Data/Jberg_FB_pop_density.csv',
                 index = False)

In [None]:
# Distribution of the Population column inside the bounding box.
# sns.distplot is deprecated and absent from current seaborn releases; this is the
# documented histplot equivalent (density-normalised histogram with a KDE overlay).
sns.histplot(data = jberg_pop,
             x = 'Population',
             bins = 200,
             stat = 'density',
             kde = True,
             kde_kws = dict(cut = 3))

In [None]:
# Keep only the rows whose Population is at least 1.05 times the smallest Population value.
jberg_pop_truncated = jberg_pop[jberg_pop['Population'].ge(1.05 * jberg_pop['Population'].min())]

In [None]:
# Summary statistics of the truncated table, transposed so each source column is a row.
jberg_pop_truncated.describe().transpose()

In [None]:
# Standardise every column of the truncated table; `fit` returns the fitted scaler itself,
# so construction and fitting collapse into one statement.
jberg_pop_scaler = StandardScaler().fit(X = jberg_pop_truncated)

# Re-wrap the scaled array in a DataFrame carrying the original row index and column labels.
jberg_pop_scaled = pd.DataFrame(data = jberg_pop_scaler.transform(X = jberg_pop_truncated),
                                index = jberg_pop_truncated.index,
                                columns = jberg_pop_truncated.columns)
jberg_pop_scaled.describe().transpose()

In [None]:
# Density-based clustering of the standardised table:
# neighbourhood radius 0.1 (in scaled units), at least 50 samples per core neighbourhood.
jberg_pop_dbscan = DBSCAN(min_samples = 50, eps = 0.1)
jberg_pop_dbscan.fit(X = jberg_pop_scaled)

In [None]:
# Distinct cluster labels assigned by DBSCAN.
{*jberg_pop_dbscan.labels_}

In [None]:
# Build `color_map`: one RGBA colour per distinct DBSCAN label.
np.random.seed(25)   # fixes the colour shuffle below so re-runs give the same figure

# Sample as many colours from 'twilight_shifted' as there are labels.
# plt.get_cmap is used because cm.get_cmap is no longer available in recent Matplotlib.
label_set = set(jberg_pop_dbscan.labels_)
color_map = dict(zip(label_set,
                     plt.get_cmap('twilight_shifted', len(label_set)).colors))

# Random permutation of the labels (sampling without replacement).
color_map_index_shuffled = dict(zip(color_map.keys(),
                                    np.random.choice(a = list(color_map.keys()),
                                                     size = len(color_map),
                                                     replace = False)))
# Re-deal the colours according to that permutation. The comprehension is fully
# evaluated before `update` runs, so every colour is used exactly once.
color_map.update({color_map_index_shuffled.get(key) : color_map.get(key)
                     for key in color_map.keys()})

# Replace the last colour component (alpha in an RGBA row) with 0 for labels -1 and 0,
# which hides them when plotted. Only labels that actually occur are touched, so a
# clustering without label -1 or 0 no longer raises on a missing key.
color_map.update({idx : np.concatenate((color_map.get(idx)[:-1], np.array([0])))
                         for idx in color_map.keys() & {-1, 0}})

In [None]:
# Scatter plot of the clustered points, coloured by DBSCAN label.
# Longitude goes on x and latitude on y so the picture is not transposed and agrees
# with the per-cluster regression plots, which also use x = Lon, y = Lat.
plt.figure(figsize = (12,9))
plt.scatter(x = jberg_pop_truncated['Lon'],
            y = jberg_pop_truncated['Lat'],
            # labels_ is positionally aligned with the rows of jberg_pop_truncated;
            # a plain list avoids any dependence on pandas index alignment
            color = pd.Series(data = jberg_pop_dbscan.labels_).map(color_map).tolist(),
            s = 1)
plt.xlabel('Lon')
plt.ylabel('Lat')

In [None]:
# Per-cluster summary table, indexed by DBSCAN label ('clusters'):
#   μ_λ, μ_L        mean of Lat / Lon
#   σ_λ, σ_L        standard deviation of Lat / Lon
#   min{λ}, min{L}  minimum of Lat / Lon
#   Population      summed Population
#   pixel_count     number of rows in the cluster
# A single groupby with named aggregation replaces five identical assign/groupby chains
# merged on their index (one of which carried a no-op Lat/Lon rename); the resulting
# columns and their order are unchanged.
sa_pop_centriods = (
    jberg_pop_truncated
    .assign(clusters = jberg_pop_dbscan.labels_)
    .groupby(by = 'clusters', as_index = True)
    # keyword unpacking is required because these column names are not valid identifiers
    .agg(**{'μ_λ'         : ('Lat', 'mean'),
            'μ_L'         : ('Lon', 'mean'),
            'σ_λ'         : ('Lat', 'std'),
            'σ_L'         : ('Lon', 'std'),
            'min{λ}'      : ('Lat', 'min'),
            'min{L}'      : ('Lon', 'min'),
            'Population'  : ('Population', 'sum'),
            'pixel_count' : ('Population', 'size')})
)

sa_pop_centriods

In [None]:

# Same rows as jberg_pop_truncated, re-indexed by each row's DBSCAN label
# (index name 'clusters') so a whole cluster can be selected with .loc[label].
jberg_plot_clust = jberg_pop_truncated.set_index(keys = pd.Index(data = jberg_pop_dbscan.labels_,
                                                                 name = 'clusters'))
jberg_plot_clust

In [None]:
# Lat/Lon correlation matrix for every cluster except labels -1 and 0.
# The label-indexed coordinate table is built once, and the comprehension walks the
# distinct labels rather than the label of every single point, so each matrix is
# computed exactly once.
_coords_by_cluster = jberg_pop_truncated.assign(clusters = jberg_pop_dbscan.labels_)\
                                        .set_index(keys = 'clusters')\
                                        [['Lat', 'Lon']]

# .loc[[label]] (list selector) always yields a DataFrame, even for a one-row cluster
{pop_clust : _coords_by_cluster.loc[[pop_clust]].corr()
    for pop_clust in np.unique(jberg_pop_dbscan.labels_)
    if pop_clust not in {-1, 0}}

In [None]:
# One LOWESS-smoothed Lat-vs-Lon regression panel per DBSCAN label,
# laid out on the smallest square grid that holds them all.
clusters = dict(enumerate(sorted(set(jberg_pop_dbscan.labels_))))   # panel position -> label
grid_side = int(np.ceil(mt.sqrt(len(clusters))))

# squeeze = False keeps `axes` a 2-D array even for a 1x1 grid, so flattening always works
fig, axes = plt.subplots(grid_side,
                         grid_side,
                         figsize = (20,20),
                         squeeze = False)
plt_axis = dict(enumerate(axes.flatten()))                          # panel position -> Axes

for clust, panel in plt_axis.items():
    if clust not in clusters:
        # the square grid can have more panels than labels; hide the leftovers
        panel.set_visible(False)
        continue
    # .loc[[label]] (list selector) always yields a DataFrame, even for a one-row cluster
    sns.regplot(data = jberg_plot_clust.loc[[clusters.get(clust)]],
                x = 'Lon',
                y = 'Lat',
                lowess = True,
                ax = panel)
    panel.set_title(label = f'Population cluster {clusters.get(clust)}')

fig.subplots_adjust(wspace = 0.45, hspace = 0.25)