# Measure simpson as patterned character

In [2]:
import geopandas as gpd
import momepy as mm
from tqdm import tqdm
import numpy as np
import pandas as pd
import libpysal
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import mapclassify

In [4]:
gdf = pd.read_csv('/Users/martin/Dropbox/Academia/Data/Geo/Prague/Clustering/primary.csv', index_col=0)

In [5]:
spatial_weights = libpysal.io.open('/Users/martin/Dropbox/Academia/Data/Geo/Prague/Clustering/GRqueen3.gal', 'r').read()

 There are 128 disconnected components.


In [6]:
spatial_weights.neighbors = {int(k): [int(i) for i in v] for k, v in spatial_weights.neighbors.items()}

In [20]:
gdf.set_index('uID', inplace=True)

In [7]:
chars = gdf.columns

In [8]:
skewness = pd.DataFrame(index=chars)
for c in chars:
    skewness.loc[c, 'skewness'] =sp.stats.skew(gdf[c])

In [9]:
headtail = list(skewness.loc[skewness.skewness >= 1].index)

some values will need to be inverted to follow heavy-tailed distribution

In [10]:
to_invert = skewness.loc[skewness.skewness <= -1].index

In [11]:
for inv in to_invert:
    gdf[inv + '_r'] = gdf[inv].max() - gdf[inv]

In [12]:
inverted = [x for x in gdf.columns if '_r' in x]

In [13]:
headtail = headtail + inverted

In [14]:
natural = [x for x in chars if x not in headtail]

CALCULATE

In [15]:
def _simpson_di(data):

    """ Given a hash { 'species': count } , returns the Simpson Diversity Index

    >>> simpson_di({'a': 10, 'b': 20, 'c': 30,})
    0.3888888888888889

    https://gist.github.com/martinjc/f227b447791df8c90568
    """

    def p(n, N):
        """ Relative abundance """
        if n == 0:
            return 0
        return float(n) / N

    N = sum(data.values())

    return sum(p(n, N) ** 2 for n in data.values() if n != 0)

In [16]:
import mapclassify.classifiers as classifiers
schemes = {}
for classifier in classifiers.CLASSIFIERS:
    schemes[classifier.lower()] = getattr(classifiers, classifier)

In [30]:
results = {}
for c in headtail + natural:
    results[c] = []

In [18]:
bins = {}
for c in headtail:
    bins[c] = schemes['headtailbreaks'](gdf[c]).bins
for c in natural:
    bins[c] = mapclassify.gadf(gdf[c], method='NaturalBreaks')[1].bins

In [19]:
bins

{'sdcLAL': array([ 66.61716082, 112.87083218, 151.7766402 , 190.77086234,
        225.94778821, 269.46951012, 327.76307633, 405.98475821,
        491.59752097, 574.50552105, 724.15126419, 974.71075036]),
 'sdcAre': array([  2139.30767059,   6712.49285355,  13465.93413415,  21376.06377824,
         31067.84344442,  44757.06985994,  68945.31638161, 100777.28534996,
        161406.74276203, 351346.07527547]),
 'stcSAl': array([ 9.15645956, 19.6652124 , 27.92893622, 33.7194865 , 37.56474895,
        40.09588059, 41.72273041, 42.83359412, 43.61906861, 44.20445932,
        44.60153434, 44.80043776, 44.91631323, 44.96356844, 44.98327849,
        44.99925174]),
 'sicCAR': array([0.19615551, 0.34531623, 0.4835373 , 0.59693789, 0.70089712,
        0.79157379, 0.87642472, 0.95203008, 0.99158667, 1.00025179,
        1.00159576, 1.00488737, 1.0094428 , 1.0136924 ]),
 'sicFAR': array([ 0.67364926,  1.82627846,  2.96522926,  3.75241437,  4.42433922,
         5.13456254,  6.04908791,  7.44735699,  9.6

In [31]:
for index, row in tqdm(gdf.iterrows(), total=gdf.shape[0]):
    neighbours = spatial_weights.neighbors[index].copy()
    neighbours.append(index)
    
    subset = gdf.loc[neighbours]
    for c in headtail + natural:
        values = subset[c]
        sample_bins = classifiers.UserDefined(values, list(bins[c]))
        counts = dict(zip(bins[c], sample_bins.counts))
        results[c].append(_simpson_di(counts))

100%|██████████| 140315/140315 [2:44:58<00:00, 14.18it/s]    


In [34]:
for c in headtail + natural:
    gdf[c + '_simpson'] = results[c]

In [35]:
gdf

Unnamed: 0,uID,bID,stcOri,sdcLAL,sdcAre,sscCCo,sscERI,stcSAl,sicCAR,sicFAR,...,lcnClo_simpson,lddRea_simpson,sdsSPW_simpson,sdsSPO_simpson,sdsSWD_simpson,sssLin_simpson,lskCCo_simpson,lskERI_simpson,ltkOri_simpson,ltkWNB_simpson
0,0,0,7.603973,33.817996,342.233358,0.381010,0.891051,7.787476,0.087768,0.614375,...,1.000000,0.814507,0.505351,1.000000,0.505351,0.505351,1.000000,1.000000,1.000000,1.000000
1,1,1,3.712491,58.703914,1435.882458,0.530512,0.935147,2.259805,0.598660,2.993299,...,0.337604,0.308864,0.567175,0.538089,0.266274,0.765928,0.401662,0.370845,0.267659,0.372230
2,2,4,12.121107,34.814040,433.237769,0.455122,0.940945,14.950339,0.108374,0.108374,...,1.000000,0.415556,0.326667,0.642222,0.440000,1.000000,0.415556,0.460000,0.397778,0.397778
3,3,5,7.700002,45.376335,839.521945,0.519139,0.994977,5.437630,0.096908,0.290723,...,0.541728,0.512099,0.311605,0.306667,0.323457,0.737284,0.368889,0.368889,0.429136,0.429136
4,4,6,29.080840,135.981870,7058.965193,0.486059,1.039719,27.965128,0.017193,0.017193,...,0.268861,0.266804,0.210562,0.314815,0.242112,0.659122,0.283265,0.294239,0.290809,0.331276
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
140310,140457,5488,28.178572,98.965803,3505.778714,0.455748,0.920684,6.799673,0.573294,4.013055,...,0.582766,0.292880,0.353288,0.447800,0.387392,1.000000,0.526531,0.565351,0.591837,0.392290
140311,140458,1139,24.776814,92.897524,5114.085960,0.754518,1.071859,12.900064,0.155292,0.310584,...,0.276008,0.529517,0.213188,0.242147,0.341501,0.529962,0.267988,0.229227,0.259969,0.304077
140312,140459,4960,15.112430,34.559509,520.511963,0.554888,0.948496,1.076186,0.409776,2.458657,...,0.451250,0.405000,0.533750,0.511250,0.371250,1.000000,0.441250,0.463750,0.531250,0.463750
140313,140460,4960,20.447865,19.148696,193.498945,0.671909,1.019082,6.411622,0.421273,2.527640,...,0.401860,0.387397,0.552686,0.509298,0.311983,1.000000,0.410124,0.627066,0.648760,0.451446


In [36]:
sim = [x for x in gdf.columns if '_simpson' in x]

In [42]:
simpson = gdf[sim + ['uID']]

In [45]:
simpson.rename(columns={'sscERI_r_simpson': 'sscERI_simpson',
 'ssbCCo_r_simpson': 'ssbCCo_simpson',
 'ssbERI_r_simpson': 'ssbERI_simpson',
 'mtdDeg_r_simpson': 'mtdDeg_simpson',
 'sssLin_r_simpson': 'sssLin_simpson',
 'lskERI_r_simpson': 'lskERI_simpson',}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [47]:
simpson.to_csv('/Users/martin/Dropbox/Academia/Data/Geo/Prague/Clustering/simpson.csv')