In [None]:
from collections import Counter
from os import path

import geopandas as gpd
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy import stats
from sklearn.metrics import adjusted_rand_score
from statsmodels.stats.diagnostic import normal_ad
from tqdm import tqdm, tqdm_notebook

from evaluate import evaluate, plot_chromosome

# Force matplotlib to plot from notebook
%matplotlib inline
# Increase default plot size
matplotlib.rcParams['figure.figsize'] = (10, 10)
# Change default plotting font
matplotlib.rcParams['font.family'] = 'Dapifer'

In [None]:
# Import the results data.
# The CSV is read with explicit column names: 24 columns named '0'..'23'
# followed by the three fitness measures.
# Build the names as a real list: on Python 3 `map` returns an iterator, and
# `map(...) + list` raises a TypeError.
districts = [str(i) for i in range(24)]
fitness_measures = ['compactness', 'crime_score', 'population_score']
names = districts + fitness_measures
results = pd.read_csv(path.join('..', 'simulation', 'output.csv'), names=names)

# Extract the chromosomes: the leading columns of every row, as plain lists
chromosomes = results.iloc[:, :len(districts)].values.tolist()

In [None]:
# Deduplicate the chromosomes by hashing each one as a tuple, then turn the
# survivors back into lists. See:
# https://stackoverflow.com/questions/3724551/python-uniqueness-for-list-of-lists
unique_chromosomes = list(map(list, {tuple(chromosome) for chromosome in chromosomes}))

# Keep one row per distinct result, retaining only the last three
# (fitness measure) columns
unique_results = results.drop_duplicates().iloc[:, -3:]

In [None]:
# Load the cleaned neighborhood shapefile. The 'neighborho' column is renamed
# to 'neighborhood' to work around a column naming bug.
neighborhoods_path = path.join('..', 'maps', 'clean', 'neighborhoods.shp')
neighborhoods_with_border = gpd.read_file(neighborhoods_path).rename(
    columns={'neighborho': 'neighborhood'}
)

# Drop the rows labelled "Border" and renumber the remaining rows from zero
not_border = neighborhoods_with_border['neighborhood'] != 'Border'
neighborhoods = neighborhoods_with_border.loc[not_border].reset_index(drop=True)

In [None]:
# Evaluate the district assignment stored in the neighborhoods data itself.
# The result is indexed as [0], [1] and [2] in the cells below, alongside the
# compactness, crime and population scores respectively.
baseline_fitness = evaluate(neighborhoods, neighborhoods['district'])

In [None]:
# Generate histogram of the compactness scores
compactness = unique_results['compactness']

# Fit a beta distribution with location and scale forced to 0 and 1 (noted as
# the actual limits of the compactness score), so only the two shape
# parameters are estimated.
a, b, loc, scale = stats.beta.fit(compactness, floc=0, fscale=1)

fit_x = np.linspace(loc, (loc + scale), 1000)
fit_y = stats.beta.pdf(fit_x, a, b, loc, scale)

# `density=True` replaces the `normed` keyword, which newer matplotlib no
# longer accepts; it normalizes the histogram to the same scale as the PDF.
plt.hist(compactness, bins=50, density=True)
plt.plot(fit_x, fit_y)
# Dashed red line: baseline value compared against this score
plt.axvline(x=baseline_fitness[0], color='r', linestyle='dashed')
plt.gca().set_xlim(left=0.02, right=0.1)
plt.xlabel('compactness')
plt.ylabel('density')
plt.show()

# NOTE: the KS test is run against parameters fitted to this same sample, so
# treat the p-value as indicative only.
print('p-value', stats.kstest(compactness, 'beta', (a, b, loc, scale))[1])
print('percentile', stats.percentileofscore(compactness, baseline_fitness[0]))

In [None]:
# Generate histogram of the crime scores
crime = unique_results['crime_score']

# Fit a beta distribution with location and scale forced to 0 and 1, so only
# the two shape parameters are estimated.
# NOTE(review): assumes the crime score is bounded to [0, 1] - confirm.
a, b, loc, scale = stats.beta.fit(crime, floc=0, fscale=1)

fit_x = np.linspace(loc, (loc + scale), 1000)
fit_y = stats.beta.pdf(fit_x, a, b, loc, scale)

# `density=True` replaces the `normed` keyword, which newer matplotlib no
# longer accepts; it normalizes the histogram to the same scale as the PDF.
plt.hist(crime, bins=50, density=True)
plt.plot(fit_x, fit_y)
# Dashed red line: baseline value compared against this score
plt.axvline(x=baseline_fitness[1], color='r', linestyle='dashed')
plt.xlabel('crime_score')
plt.ylabel('density')
plt.show()

# NOTE: the KS test is run against parameters fitted to this same sample, so
# treat the p-value as indicative only.
print('p-value', stats.kstest(crime, 'beta', (a, b, loc, scale))[1])
print('percentile', stats.percentileofscore(crime, baseline_fitness[1]))

In [None]:
# Generate histogram of the population scores
pop = unique_results['population_score']

# Fit a beta distribution with location and scale forced to 0 and 1, so only
# the two shape parameters are estimated.
# NOTE(review): assumes the population score is bounded to [0, 1] - confirm.
a, b, loc, scale = stats.beta.fit(pop, floc=0, fscale=1)

fit_x = np.linspace(loc, (loc + scale), 1000)
fit_y = stats.beta.pdf(fit_x, a, b, loc, scale)

# `density=True` replaces the `normed` keyword, which newer matplotlib no
# longer accepts; it normalizes the histogram to the same scale as the PDF.
plt.hist(pop, bins=50, density=True)
plt.plot(fit_x, fit_y)
# Dashed red line: baseline value compared against this score
plt.axvline(x=baseline_fitness[2], color='r', linestyle='dashed')
plt.xlabel('population_score')
plt.ylabel('density')
plt.show()

# NOTE: the KS test is run against parameters fitted to this same sample, so
# treat the p-value as indicative only.
print('p-value', stats.kstest(pop, 'beta', (a, b, loc, scale))[1])
# Labelled like the compactness and crime cells for consistent output
print('percentile', stats.percentileofscore(pop, baseline_fitness[2]))

In [None]:
# Calculate the adjusted Rand index between the district labels in the
# neighborhoods data and each unique chromosome
rand_scores = [
    adjusted_rand_score(neighborhoods['district'], chromosome)
    for chromosome in tqdm_notebook(unique_chromosomes, desc = "Getting Adj. Rand Scores")
]

# Fit a beta distribution with location -1 and scale 2, i.e. support pinned to
# [-1, 1], so only the two shape parameters are estimated.
# NOTE(review): confirm [-1, 1] is the intended range for the adjusted Rand index.
a, b, loc, scale = stats.beta.fit(rand_scores, floc=-1, fscale=2)

fit_x = np.linspace(loc, (loc + scale), 1000)
fit_y = stats.beta.pdf(fit_x, a, b, loc, scale)

# `density=True` replaces the `normed` keyword, which newer matplotlib no
# longer accepts; it normalizes the histogram to the same scale as the PDF.
plt.hist(rand_scores, bins=50, density=True)
plt.plot(fit_x, fit_y)
plt.xlabel('adjusted Rand index')
plt.ylabel('density')
plt.show()

# NOTE: the KS test is run against parameters fitted to this same sample, so
# treat the p-value as indicative only.
print('p-value', stats.kstest(rand_scores, 'beta', (a, b, loc, scale))[1])

In [None]:
# Tally how often each distinct chromosome occurs (tuples are hashable)
chromosome_tally = Counter(map(tuple, chromosomes))

# Tabulate the tally: one row per chromosome with its occurrence count
chromosome_counts = pd.DataFrame.from_dict(chromosome_tally, orient='index').reset_index()
chromosome_counts.columns = ['chromosome', 'count']

# Isolate the high-count solutions: the first four chromosomes after
# ordering by descending count
ranked_counts = chromosome_counts.sort_values('count', ascending=False)
degenerates = ranked_counts.iloc[:4, 0]

# Use a smaller figure size from here on
matplotlib.rcParams['figure.figsize'] = (5, 5)

# Draw each of the selected chromosomes
for degenerate in degenerates:
    plot_chromosome(neighborhoods, degenerate, dissolve=False)