<h1>Table of Contents<span class="tocSkip"></span></h1>
<div class="toc"><ul class="toc-item"><li><span><a href="#Read-in-data" data-toc-modified-id="Read-in-data-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Read in data</a></span><ul class="toc-item"><li><span><a href="#Get-original-input-stats" data-toc-modified-id="Get-original-input-stats-1.1"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>Get original input stats</a></span></li></ul></li><li><span><a href="#Convert-distance-to-meters" data-toc-modified-id="Convert-distance-to-meters-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Convert distance to meters</a></span><ul class="toc-item"><li><ul class="toc-item"><li><span><a href="#Reduction-of-number-of-fan-markings-to-finals" data-toc-modified-id="Reduction-of-number-of-fan-markings-to-finals-2.0.1"><span class="toc-item-num">2.0.1&nbsp;&nbsp;</span>Reduction of number of fan markings to finals</a></span></li></ul></li></ul></li><li><span><a href="#Length-stats" data-toc-modified-id="Length-stats-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Length stats</a></span><ul class="toc-item"><li><span><a href="#Blotch-sizes" data-toc-modified-id="Blotch-sizes-3.1"><span class="toc-item-num">3.1&nbsp;&nbsp;</span>Blotch sizes</a></span></li><li><span><a href="#Longest-fans" data-toc-modified-id="Longest-fans-3.2"><span class="toc-item-num">3.2&nbsp;&nbsp;</span>Longest fans</a></span></li></ul></li><li><span><a href="#Regional" data-toc-modified-id="Regional-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>Regional</a></span></li></ul></div>

In [None]:
%matplotlib nbagg

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats
import seaborn as sns
from planet4 import io, stats, markings
from planet4.catalog_production import ReleaseManager

# Read in data

In [None]:
# Release manager for catalog version 'v1.0b4'; the fan and blotch catalog
# files are read through it below.
rm = ReleaseManager('v1.0b4')

In [None]:
# planet4 database manager; its `dbname` is what the dask HDF read further
# down opens.
db = io.DBManager()

In [None]:
# Show the database's `n_image_names` attribute
# (presumably the number of distinct image names -- TODO confirm).
db.n_image_names

In [None]:
# Show which database file is in use; this value is passed to
# `dd.read_hdf` in the input-stats cell.
db.dbname

In [None]:
# Load the final blotch and fan catalogs of this release (blotches first,
# then fans).
blotches, fans = rm.read_blotch_file(), rm.read_fan_file()

## Get original input stats

In [None]:
import dask.dataframe as dd

# Lazily open the raw classification table and split it by marking type.
data = dd.read_hdf(db.dbname, 'df')
fan_input = data[data.marking == 'fan']
blotch_input = data[data.marking == 'blotch']

# A cell only displays its last expression, so a bare
# `fan_input.compute().shape` in the middle of the cell would be computed
# and then thrown away. Return both shapes together instead.
fan_input.compute().shape, blotch_input.compute().shape

# Convert distance to meters

In [None]:
# Scale the marking sizes by each row's `map_scale` to obtain the `*_m`
# columns (presumably map_scale is in metres per pixel -- TODO confirm).
fans['distance_m'] = fans['distance'] * fans['map_scale']

for radius_col in ('radius_1', 'radius_2'):
    blotches[f'{radius_col}_m'] = blotches[radius_col] * blotches['map_scale']

### Reduction of number of fan markings to finals

In [None]:
# Hard-coded number of fan markings that went into the pipeline
# (presumably the row count of `fan_input` above -- TODO confirm).
n_fan_in = 2_792_963

In [None]:
# Number of fans in the final catalog.
len(fans)

In [None]:
# Ratio of final catalog fans to input fan markings.
len(fans) / n_fan_in

In [None]:
# Number of blotches in the final catalog.
len(blotches)

# Length stats

Percentage of fan markings below 100 m:

In [None]:
# `import scipy` alone does not guarantee that the `stats` submodule is
# loaded; import it explicitly so this cell works on a fresh kernel.
import scipy.stats

# Percentile rank of a 100 m fan length among all fan lengths.
scipy.stats.percentileofscore(fans.distance_m, 100)

Cumulative histogram of fan lengths

In [None]:
def add_percentage_line(ax, meters, column):
    """Mark the percentile rank of ``meters`` within ``column`` on ``ax``.

    Parameters
    ----------
    ax : axes-like object
        Must provide ``axhline``, ``axvline`` and ``text``.
    meters : number
        Score whose percentile rank is looked up; also the x position of the
        vertical line and of the text label.
    column : array-like
        Values the score is ranked against.

    The percentile is divided by 100 so that it lines up with a y axis that
    runs from 0 to 1; the same fraction is written next to the crossing
    point with two decimals.
    """
    fraction = scipy.stats.percentileofscore(column, meters) / 100
    ax.axhline(fraction)
    ax.axvline(meters)
    ax.text(meters, fraction, f"{fraction:0.2f}")

In [None]:
# Close every open matplotlib figure before creating new ones.
plt.close('all')

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
# Normalisation is requested through seaborn's own `norm_hist` flag instead
# of a raw `normed` entry in `hist_kws`: newer matplotlib releases no longer
# accept `normed`, while seaborn translates `norm_hist` into whichever
# keyword the installed matplotlib expects.
sns.distplot(fans.distance_m, bins=500, kde=False, norm_hist=True,
             hist_kws={'cumulative': True},
             axlabel='Fan length [m]', ax=ax)
ax.set_title("Cumulative normalized histogram for fan lengths")
ax.set_ylabel("Fraction of fans with given length")
# Mark the cumulative fraction reached at 100 m and at 50 m.
add_percentage_line(ax, 100, fans.distance_m)
add_percentage_line(ax, 50, fans.distance_m)

General fan stats, in numbers

In [None]:
# Summary statistics of the fan lengths in metres.
fans['distance_m'].describe()

In words, the mean length of fans is {{f"{fans.distance_m.describe()['mean']:.1f}"}} m, while the median is
{{f"{fans.distance_m.describe()['50%']:.1f}"}} m.

## Blotch sizes

In [None]:
# Histogram of both blotch radii in the catalog's native units,
# one colour per radius column.
plt.figure()
cols = ['radius_1', 'radius_2']
radius_bins = np.arange(2.0, 50.)
sns.distplot(blotches[cols], kde=False, bins=radius_bins,
             color=['r', 'g'], label=cols)
plt.legend()

In [None]:
# Same histogram as for the unscaled radii, but using the metre-scaled
# radius columns.
plt.figure()
cols = ['radius_1_m', 'radius_2_m']
radius_bins = np.arange(2.0, 50.)
sns.distplot(blotches[cols], kde=False, bins=radius_bins,
             color=['r', 'g'], label=cols)
plt.legend()

In [None]:
fig, ax = plt.subplots(figsize=(8, 4))
# The plotted column is `radius_2_m`, so the axis labels name radius_2
# (they previously said radius_1, contradicting the data shown).
# `norm_hist=True` replaces the raw `normed` hist kwarg, which newer
# matplotlib releases no longer accept.
sns.distplot(blotches.radius_2_m, bins=500, kde=False, norm_hist=True,
             hist_kws={'cumulative': True},
             axlabel='Blotch radius_2 [m]', ax=ax)
ax.set_title("Cumulative normalized histogram for blotch lengths")
ax.set_ylabel("Fraction of blotches with given radius_2")
# Mark the cumulative fraction reached at 30 m and at 10 m.
add_percentage_line(ax, 30, blotches.radius_2_m)
add_percentage_line(ax, 10, blotches.radius_2_m)

In [None]:
# Import the submodule explicitly: a bare `import scipy` does not guarantee
# that `scipy.stats` is available.
import scipy.stats

# Percentile rank of a 30 m radius among all metre-scaled radius_2 values.
scipy.stats.percentileofscore(blotches.radius_2_m, 30)

In [None]:
# Close all figures opened in the length-stats section.
plt.close('all')

## Longest fans

In [None]:
# Fans longer than 350 m, shortest first, restricted to the columns needed
# to locate them.
long_fan_cols = ['distance_m', 'distance', 'obsid', 'image_x', 'image_y',
                 'image_id', 'x_tile', 'y_tile']
fans.loc[fans.distance_m > 350, long_fan_cols].sort_values(by='distance_m')

In [None]:
# Unique user names found in the marking data of image "APF0000dtk".
users1 = markings.ImageID("APF0000dtk").data.user_name.unique()

In [None]:
# Unique user names found in the marking data of image "de3"
# (presumably a shortened image id -- TODO confirm).
users2 = markings.ImageID("de3").data.user_name.unique()

In [None]:
# Users that appear in both images, in the order they occur in `users1`.
same = [user for user in users1 if user in users2]

In [None]:
# Display the users common to both images.
same

In [None]:
# Number of distinct users for the second image, for comparison with the
# overlap shown above.
len(users2)

In [None]:
from planet4 import plotting

In [None]:
# Plot the pipeline stages for image id 'q45', reading from this release's
# catalog folder and addressing the data by image id rather than by obsid.
plotting.plot_image_id_pipeline('q45', datapath=rm.catalog, via_obsid=False, figsize=(12,8))

# Regional 

In [None]:
from planet4 import stats
from planet4 import region_data

In [None]:
# Add the season information to both catalogs (fans first, then blotches);
# the `season` column is queried further down.
for catalog in (fans, blotches):
    stats.define_season_column(catalog)

In [None]:
# Names of the region classes in `region_data` that get labelled below.
regions = 'Manhattan2 Giza Inca Ithaca'.split()

In [None]:
# Tag every fan and blotch whose obsid belongs to one of the regions with
# that region's name in a `roi` column.
for reg in regions:
    roi = getattr(region_data, reg)()
    # The 'Manhattan2' region class is stored under the plain label 'Manhattan'.
    roi_label = 'Manhattan' if reg == 'Manhattan2' else reg
    for marking in (fans, blotches):
        in_region = marking.obsid.isin(roi.all_obsids)
        marking.loc[in_region, 'roi'] = roi_label

In [None]:
# Fans per region, including the ones that belong to no region (NaN).
fans['roi'].value_counts(dropna=False)

In [None]:
# Keep only the markings that were assigned to a region.
fans_rois = fans.dropna(subset=['roi'])
blotches_rois = blotches.dropna(subset=['roi'])

In [None]:
# Sanity check: the region-only subset should contain no NaN label.
fans_rois['roi'].value_counts(dropna=False)

In [None]:
# Median fan length in metres for season 2.
fans.loc[fans.season == 2, 'distance_m'].median()

In [None]:
# Median fan length in metres for season 3.
fans.loc[fans.season == 3, 'distance_m'].median()

In [None]:
import seaborn as sns
# Switch seaborn's global colour palette to 'Set1' for the plots that follow.
sns.set_palette('Set1')

In [None]:
# Display the fans that were assigned to a region.
fans_rois

In [None]:
def my_plot(x, **kwargs):
    """Draw a histogram plus KDE of ``x`` and clamp the x axis to [-10, 150].

    Written for use with ``FacetGrid.map``: ``x`` is the data to plot and any
    extra keyword arguments are forwarded unchanged to ``sns.distplot``.
    The axis limits are applied to whatever axes is current after plotting.
    """
    sns.distplot(x, kde=True, **kwargs)
    current_axes = plt.gca()
    current_axes.set_xlim(-10, 150)

In [None]:
# One panel per region, one colour per season; each panel is drawn by
# `my_plot`, which also fixes the x range.
grid_options = dict(col="roi", hue='season', size=2, aspect=1.1, legend_out=False)
g = sns.FacetGrid(fans_rois, **grid_options)
g.map(my_plot, 'distance_m')
g.add_legend()

In [None]:
# Same grid as the `my_plot` variant, but calling seaborn's distplot
# directly, i.e. without the fixed x range.
g = sns.FacetGrid(
    fans_rois,
    col="roi",
    hue='season',
    size=2,
    aspect=1.1,
    legend_out=False,
)
g.map(sns.distplot, "distance_m", kde=True)
g.add_legend()

In [None]:
# Print the median fan length per region for the season2 and season3
# obsid lists stored on each region class.
for region_name in ['Manhattan2', 'Giza', 'Ithaca']:
    print(region_name)
    region_cls = getattr(region_data, region_name)
    for season_attr in ('season2', 'season3'):
        print(season_attr)
        in_season = fans.obsid.isin(getattr(region_cls, season_attr))
        print(fans.loc[in_season, 'distance_m'].median())

In [None]:
import numpy as np
import scipy.integrate
import seaborn as sns
import matplotlib.pyplot as plt

# Newer SciPy names the cumulative trapezoid rule `cumulative_trapezoid`;
# older releases only provide `cumtrapz`. Use whichever is available.
cumulative_trapezoid = getattr(scipy.integrate, 'cumulative_trapezoid', None)
if cumulative_trapezoid is None:
    cumulative_trapezoid = scipy.integrate.cumtrapz

plt.figure()

sns.set_palette("hls", 1)
# Seeded generator so the demo figure is the same on every run.
data = np.random.RandomState(42).randn(30)
p = sns.kdeplot(data, shade=True)

# x/y coordinates of the first line drawn on the axes.
x, y = p.get_lines()[0].get_data()

# Argument order is (y, x); `initial=0` prepends a zero so the result has
# the same length as x.
cdf = cumulative_trapezoid(y, x, initial=0)

# Grid point where the integrated curve is closest to 0.5.
nearest_05 = np.abs(cdf - 0.5).argmin()

x_median = x[nearest_05]
y_median = y[nearest_05]

# Vertical marker from the axis up to the curve at that point.
plt.vlines(x_median, 0, y_median)

In [None]:
import numpy as np
import scipy.integrate
import seaborn as sns
import matplotlib.pyplot as plt

# Newer SciPy names the cumulative trapezoid rule `cumulative_trapezoid`;
# older releases only provide `cumtrapz`. Use whichever is available.
cumulative_trapezoid = getattr(scipy.integrate, 'cumulative_trapezoid', None)
if cumulative_trapezoid is None:
    cumulative_trapezoid = scipy.integrate.cumtrapz

plt.figure()

sns.set_palette("hls", 1)
# Seeded generator so the demo figure is the same on every run.
data = np.random.RandomState(42).randn(30)
p = sns.kdeplot(data, shade=True)

# x/y coordinates of the first line drawn on the axes.
x, y = p.get_lines()[0].get_data()

# Argument order is (y, x); `initial=0` prepends a zero so the result has
# the same length as x.
cdf = cumulative_trapezoid(y, x, initial=0)

# Grid point where the integrated curve is closest to 0.5.
nearest_05 = np.abs(cdf - 0.5).argmin()

x_median = x[nearest_05]
y_median = y[nearest_05]

# Vertical marker from the axis up to the curve at that point.
plt.vlines(x_median, 0, y_median)

In [None]:
# NOTE(review): `x` holds the x-coordinates of the plotted KDE line, not the
# random sample itself, so this is the median of the plotting grid. If the
# intent was to compare `x_median` with the sample median, the sample array
# should be passed instead -- confirm.
np.median(x)

In [None]:
# 50th percentile of the same `x` array, i.e. its median computed via
# np.percentile. NOTE(review): `x` is the KDE line's x-data, not the sample.
np.percentile(x, 50)