# Measure simpson as context. character

date: 2020-03-07 Amsterdam Data

In [1]:
import geopandas as gpd
import momepy as mm
from tqdm import tqdm
import numpy as np
import pandas as pd
import libpysal
import scipy as sp
import matplotlib.pyplot as plt
import seaborn as sns
import mapclassify

In [2]:
gdf = pd.read_csv('files/AMS/primary.csv', index_col=0)

In [4]:
spatial_weights = libpysal.io.open('files/AMS/AMSqueen3.gal', 'r').read()

 There are 148 disconnected components.


In [5]:
spatial_weights.neighbors = {int(k): [int(i) for i in v] for k, v in spatial_weights.neighbors.items()}

In [6]:
gdf.set_index('uID', inplace=True)

In [7]:
chars = gdf.columns

In [8]:
skewness = pd.DataFrame(index=chars)
for c in chars:
    skewness.loc[c, 'skewness'] =sp.stats.skew(gdf[c])

In [9]:
headtail = list(skewness.loc[skewness.skewness >= 1].index)

some values will need to be inverted to follow heavy-tailed distribution

In [10]:
to_invert = skewness.loc[skewness.skewness <= -1].index

In [11]:
for inv in to_invert:
    gdf[inv + '_r'] = gdf[inv].max() - gdf[inv]

In [12]:
inverted = [x for x in gdf.columns if '_r' in x]

In [13]:
headtail = headtail + inverted

In [14]:
natural = [x for x in chars if x not in headtail]

CALCULATE

In [15]:
def _simpson_di(data):

    """ Given a hash { 'species': count } , returns the Simpson Diversity Index

    >>> simpson_di({'a': 10, 'b': 20, 'c': 30,})
    0.3888888888888889

    https://gist.github.com/martinjc/f227b447791df8c90568
    """

    def p(n, N):
        """ Relative abundance """
        if n == 0:
            return 0
        return float(n) / N

    N = sum(data.values())

    return sum(p(n, N) ** 2 for n in data.values() if n != 0)

In [16]:
import mapclassify.classifiers as classifiers
schemes = {}
for classifier in classifiers.CLASSIFIERS:
    schemes[classifier.lower()] = getattr(classifiers, classifier)

In [17]:
results = {}
for c in headtail + natural:
    results[c] = []

In [19]:
gdf = gdf.fillna(0)

In [20]:
bins = {}
for c in headtail:
    bins[c] = schemes['headtailbreaks'](gdf[c]).bins
for c in natural:
    bins[c] = mapclassify.gadf(gdf[c], method='NaturalBreaks')[1].bins

In [21]:
bins

{'sdcLAL': array([  50.57293911,   94.20643002,  142.45429324,  194.3083024 ,
         242.6956678 ,  308.45951015,  381.6072764 ,  466.34513671,
         552.61452221,  644.52470234,  901.96380416, 1007.69182103,
        1106.5412916 ]),
 'sdcAre': array([  1147.03995301,   6064.20839039,  15109.46387194,  26880.89235727,
         43167.71609999,  69104.0637923 , 103895.48633201, 150466.04030554,
        270187.97495437, 604674.57810591]),
 'sicFAR': array([ 0.69135103,  1.51694854,  2.18323671,  2.73287034,  3.28880657,
         3.93996555,  4.72700242,  5.95048274,  7.8953992 ,  9.97817611,
        11.88041365, 14.83862612, 16.33407881]),
 'mdcAre': array([  9810.24801468,  32617.42838188,  66435.52369261, 113374.7169178 ,
        179936.55032383, 283014.02743918, 458406.39723456, 642416.43697544,
        698993.25454207, 800519.51044902, 844352.31723601, 915316.54404291]),
 'licGDe': array([0.64057525, 1.39313802, 1.9349581 , 2.37723348, 2.85376686,
        3.3998312 , 3.85556914, 

In [22]:
for index, row in tqdm(gdf.iterrows(), total=gdf.shape[0]):
    neighbours = spatial_weights.neighbors[index].copy()
    neighbours.append(index)
    
    subset = gdf.loc[neighbours]
    for c in headtail + natural:
        values = subset[c]
        sample_bins = classifiers.UserDefined(values, list(bins[c]))
        counts = dict(zip(bins[c], sample_bins.counts))
        results[c].append(_simpson_di(counts))

  gadf = 1 - self.adcm / adam
100%|██████████| 252385/252385 [4:36:38<00:00, 15.21it/s]  


In [23]:
for c in headtail + natural:
    gdf[c + '_simpson'] = results[c]

In [24]:
gdf

Unnamed: 0_level_0,stcOri,sdcLAL,sdcAre,sscCCo,sscERI,sicCAR,sicFAR,mtcWNe,mdcAre,licGDe,...,ldsCDL_simpson,xcnSCl_simpson,mtdMDi_simpson,lddNDe_simpson,linWID_simpson,lddRea_simpson,lddARe_simpson,sddAre_simpson,midRea_simpson,midAre_simpson
uID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,27.510060,216.792883,28760.794790,0.779150,1.081927,0.004502,0.004502,0.001577,56629.313623,0.003862,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
1,27.569911,214.663688,27868.518833,0.770029,1.080976,0.003202,0.003202,0.001599,56629.313623,0.003862,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
2,26.780955,142.946654,4725.131782,0.294426,0.944924,0.063007,0.063007,0.008920,43845.834221,0.025565,...,0.591837,1.000000,1.000000,1.000000,1.000000,0.591837,0.591837,0.591837,0.591837,0.591837
3,38.230240,184.595244,15683.586526,0.586023,1.018656,0.052321,0.052321,0.006100,35810.564680,0.042358,...,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000,1.000000
4,9.362836,151.244275,6032.094361,0.335753,0.822057,0.082069,0.082069,0.015487,84209.214520,0.025565,...,0.591837,1.000000,1.000000,1.000000,1.000000,0.591837,0.591837,0.591837,0.591837,0.591837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19452,32.905625,122.129802,678.408432,0.057911,0.799167,0.117692,0.353076,0.011775,3549.480638,0.478379,...,1.000000,0.404844,0.640138,1.000000,1.000000,0.584775,0.508651,0.584775,0.889273,0.889273
19453,32.907696,119.689689,1907.120365,0.169502,0.895196,0.041436,0.124307,0.015075,6763.119547,1.016754,...,1.000000,0.367188,0.695312,1.000000,1.000000,0.531250,0.460938,0.531250,0.781250,0.882812
19454,40.925191,26.664661,178.598830,0.319828,0.983183,0.285785,0.857355,0.079462,3261.843269,0.682345,...,0.547259,0.386578,0.354442,0.957467,0.916824,0.327977,0.444234,0.500945,0.414934,0.519849
19455,42.884230,42.515494,888.019281,0.625515,1.006693,0.332357,1.329426,0.050626,7891.125350,0.682345,...,0.547259,0.386578,0.354442,0.957467,0.916824,0.327977,0.444234,0.500945,0.414934,0.519849


In [25]:
sim = [x for x in gdf.columns if '_simpson' in x]

In [27]:
sim

['sdcLAL_simpson',
 'sdcAre_simpson',
 'sicFAR_simpson',
 'mdcAre_simpson',
 'licGDe_simpson',
 'ltcWRB_simpson',
 'stcSAl_simpson',
 'sdbHei_simpson',
 'sdbAre_simpson',
 'sdbVol_simpson',
 'sdbPer_simpson',
 'sdbCoA_simpson',
 'ssbFoF_simpson',
 'ssbVFR_simpson',
 'ssbCor_simpson',
 'mtbAli_simpson',
 'mtbNDi_simpson',
 'libNCo_simpson',
 'ldbPWL_simpson',
 'ltbIBD_simpson',
 'ltcBuA_simpson',
 'ssbCCM_simpson',
 'ssbCCD_simpson',
 'stbCeA_simpson',
 'stbSAl_simpson',
 'sscERI_r_simpson',
 'ssbERI_r_simpson',
 'stcOri_simpson',
 'sscCCo_simpson',
 'sscERI_simpson',
 'sicCAR_simpson',
 'mtcWNe_simpson',
 'ssbCCo_simpson',
 'ssbSqu_simpson',
 'ssbERI_simpson',
 'ssbElo_simpson',
 'stbOri_simpson',
 'mtbSWR_simpson',
 'ldkAre_simpson',
 'ldkPer_simpson',
 'lskCCo_simpson',
 'lskERI_simpson',
 'lskCWA_simpson',
 'ltkOri_simpson',
 'ltkWNB_simpson',
 'likWBB_simpson',
 'sdsLen_simpson',
 'sdsSPW_simpson',
 'sdsSPH_simpson',
 'sdsSPR_simpson',
 'sdsSPO_simpson',
 'sdsSWD_simpson',
 'sdsSHD

In [28]:
simpson = gdf[sim]

In [29]:
simpson.rename(columns={'sscERI_r_simpson': 'sscERI_simpson',
 'ssbERI_r_simpson': 'ssbERI_simpson',}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(**kwargs)


In [30]:
simpson.to_csv('files/AMS/simpson.csv')

In [31]:
simpson.shape

(252385, 76)