In [1]:
# Enable the autoreload extension so edits to imported modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
from __future__ import print_function, division
import planet4 as p4
import pandas as pd
from planet4 import markings
from planet4.get_data import get_current_database_fname
from planet4 import clustering
import os
from os.path import join as pjoin
# Home directory of the current user. expanduser('~') yields $HOME when that
# variable is set, and still resolves a directory on platforms where it is
# not defined, instead of raising KeyError.
HOME = os.path.expanduser('~')

In [3]:
dbfile = get_current_database_fname()
# Open read-only: this notebook only queries the database. The default append
# mode would silently create a new, empty file if `dbfile` pointed to a path
# that does not exist; mode='r' raises instead.
store = pd.HDFStore(dbfile, mode='r')
store

<class 'pandas.io.pytables.HDFStore'>
File path: /Users/klay6683/data/planet4/2015-03-08_planet_four_classifications_queryable.h5
/df            frame_table  (typ->appendable,nrows->12110427,ncols->22,indexers->[index],dc->[classification_id,image_id,image_name,user_name,marking,acquisition_date,local_mars_time])

In [4]:
# Read just the 'image_name' column of the 'df' table from the store.
image_names = store.select_column('df', 'image_name')

In [5]:
# Number of entries currently held in image_names.
image_names.size

12110427

In [6]:
# Number of distinct image names. pd.unique accepts both a Series and a plain
# array, so this cell still works when re-run after image_names has been
# replaced by its unique values in the next cell.
pd.unique(image_names).size

407

In [7]:
# Reduce image_names to its distinct values. pd.unique is used instead of the
# Series method so that re-running this cell (when image_names is already an
# array of unique names) does not fail with AttributeError.
image_names = pd.unique(image_names)

In [8]:
# Create the parallel client from which the engine views below are built.
# NOTE(review): newer IPython releases provide this API in the separate
# `ipyparallel` package - confirm which one the target environment ships.
from IPython.parallel import Client
client = Client()

In [9]:
# dview: direct view, used below to push names into the engines' namespaces.
# lview: load-balanced view, used below to map the per-image processing.
dview = client.direct_view()
lview = client.load_balanced_view()

In [10]:
%%px
import pandas as pd
from planet4 import clustering, markings
from os.path import join as pjoin
import os
# Home directory on each engine. expanduser('~') yields $HOME when that
# variable is set, and still resolves a directory where it is not defined,
# instead of raising KeyError.
HOME = os.path.expanduser('~')

In [23]:
def do_clustering(p4img, fans):
    """Cluster one kind of marking of a single image id.

    Parameters
    ----------
    p4img : object
        Must provide ``get_fans()``, ``get_blotches()`` and an ``imgid``
        attribute (the caller passes ``markings.ImageID`` instances).
    fans : bool
        Truthy to cluster the fan markings, falsy to cluster the blotch
        markings. The value is also forwarded to
        ``clustering.perform_dbscan`` as its ``fans`` keyword.

    Returns
    -------
    pandas.DataFrame or None
        A frame built from the ``.data`` of every item returned by
        ``clustering.perform_dbscan``, with an added ``image_id`` column set
        to ``p4img.imgid``. ``None`` if ``perform_dbscan`` returned ``None``.
    """
    # Pick the input markings first so the clustering call appears only once.
    marking_data = p4img.get_fans() if fans else p4img.get_blotches()
    clusters = clustering.perform_dbscan(marking_data, fans=fans)
    if clusters is None:
        return None
    frame = pd.DataFrame([entry.data for entry in clusters])
    frame['image_id'] = p4img.imgid
    return frame
    
def process_image_name(image_name):
    """Cluster the fans and blotches of one image_name and save them to disk.

    All rows of the database table ``df`` whose ``image_name`` matches are
    loaded, every unique ``image_id`` among them is clustered with
    ``do_clustering`` (once for blotches, once for fans), and the combined
    results are written to
    ``<HOME>/data/planet4/reduced/<image_name>_reduced_blotches.hdf`` and
    ``..._reduced_fans.hdf`` under the key ``'df'``.

    The function relies on these names being present as globals where it
    runs: ``dbfile``, ``do_clustering``, ``HOME``, ``pjoin``, ``os``, ``pd``
    and ``markings``.

    Parameters
    ----------
    image_name : str
        Value of the ``image_name`` column to process.

    Returns
    -------
    str
        ``image_name + ' already done.'`` if both output files already
        exist (nothing is recomputed), otherwise ``image_name``.
    """
    # Imported here because this function is executed through the parallel
    # view, and `sys` is not among the names set up on the engines.
    import sys

    dirname = pjoin(HOME, 'data/planet4/reduced')
    blotchfname = pjoin(dirname, image_name + '_reduced_blotches.hdf')
    fanfname = pjoin(dirname, image_name + '_reduced_fans.hdf')
    if os.path.exists(blotchfname) and os.path.exists(fanfname):
        return image_name + ' already done.'

    # Make sure the output directory exists. Several engines may reach this
    # point at the same time, so losing the creation race is not an error.
    if not os.path.isdir(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            if not os.path.isdir(dirname):
                raise

    # Quote the value so it is always parsed as a string literal and never
    # looked up as a variable name by the query parser.
    data = pd.read_hdf(dbfile, 'df',
                       where="image_name == '{}'".format(image_name))
    img_ids = data.image_id.unique()
    print("Found {} unique P4 image_ids.".format(img_ids.shape[0]))
    sys.stdout.flush()

    blotches = []
    fans = []
    for img_id in img_ids:
        p4img = markings.ImageID(img_id)
        blotches.append(do_clustering(p4img, fans=False))
        fans.append(do_clustering(p4img, fans=True))

    for frames, fname in ((blotches, blotchfname), (fans, fanfname)):
        # do_clustering returns None when nothing could be clustered.
        # pd.concat raises ValueError if every entry is None or the list is
        # empty, so filter first and fall back to an empty frame. The empty
        # file still marks this image_name as processed for the check above.
        frames = [frame for frame in frames if frame is not None]
        if frames:
            combined = pd.concat(frames, ignore_index=True)
        else:
            combined = pd.DataFrame()
        combined.to_hdf(fname, 'df')
    return image_name

In [24]:
# Send the names that process_image_name looks up as globals to the engines.
# block=True waits until the push has completed, so tasks submitted in the
# following cell cannot run before `do_clustering` and `dbfile` exist there.
# The trailing semicolon suppresses the echoed return value.
dview.push({'do_clustering': do_clustering,
            'dbfile': dbfile}, block=True);

<AsyncResult: finished>

In [28]:
# Submit process_image_name for every entry of image_names through the
# load-balanced view; the returned handle is iterated and polled below.
result = lview.map_async(process_image_name, image_names)

In [26]:
# Print the return value of each task (the image name, or the
# '... already done.' message) while iterating over the results.
for res in result:
    print(res)

ESP_011544_0985 already done.
ESP_021684_0985 already done.
ESP_011697_0980 already done.
ESP_020322_0930 already done.
ESP_021455_0935 already done.
ESP_020214_0935 already done.
ESP_012008_0975 already done.
ESP_020930_0980 already done.
ESP_020357_0950 already done.
ESP_012884_0935 already done.
ESP_021497_0980 already done.
ESP_011900_0985 already done.
ESP_012291_0980 already done.
ESP_021494_0945 already done.
ESP_021454_0925 already done.
ESP_021522_0930 already done.
ESP_020339_0985 already done.
ESP_012604_0965 already done.
ESP_012254_1065 already done.
ESP_021605_0985 already done.
ESP_011350_0945 already done.
ESP_020376_0980 already done.
ESP_021526_0985 already done.
ESP_021460_0985 already done.
ESP_012076_0945 already done.
ESP_012344_0950 already done.
ESP_011729_0985 already done.
ESP_012063_0945 already done.
ESP_012212_0950 already done.
ESP_012256_0985 already done.
ESP_020598_0935 already done.
ESP_011565_0930 already done.
ESP_012693_0950 already done.
ESP_012838

KeyboardInterrupt: 

In [29]:
import time  # not needed by this cell any more; kept in case other cells use it
# Report the percentage of finished tasks until the whole run is done.
while not result.ready():
    print(100*result.progress/len(image_names))
    # Wait for at most 30 s, but return as soon as the run completes instead
    # of always sleeping through the full interval.
    result.wait(30)

54.79115479115479
54.79115479115479
55.52825552825553
56.26535626535627
57.24815724815725
57.73955773955774
58.47665847665848
59.21375921375921
59.95085995085995
60.93366093366093
61.91646191646192
62.65356265356265
62.8992628992629
63.88206388206388
65.1105651105651
66.0933660933661
66.83046683046683
67.81326781326781
68.55036855036855
69.77886977886978
70.51597051597052
71.4987714987715
72.72727272727273
74.2014742014742
74.93857493857494
76.16707616707616
76.90417690417691
77.64127764127764
78.37837837837837
79.36117936117937
80.58968058968058
81.32678132678133
82.55528255528256
83.78378378378379
85.25798525798525
86.73218673218673
87.71498771498771
89.43488943488944
90.17199017199017
91.89189189189189
94.5945945945946
95.08599508599508
96.56019656019656
97.54299754299754
99.01719901719902


In [23]:
# List the directory that process_image_name writes its output files to.
ls ~/data/planet4/reduced

ESP_011544_0985_reduced_blotches.hdf  ESP_021684_0985_reduced_blotches.hdf
ESP_011544_0985_reduced_fans.hdf      ESP_021684_0985_reduced_fans.hdf


In [37]:
# Load the reduced blotches of one image name. The path is built from HOME,
# the same way process_image_name builds it, instead of hard-coding one
# user's absolute home directory.
df = pd.read_hdf(pjoin(HOME, 'data/planet4/reduced',
                       'ESP_011544_0985_reduced_blotches.hdf'), 'df')

In [38]:
# Keep only the rows whose image_id is 'APF00002sx'.
blotches= df[df.image_id=='APF00002sx']

In [29]:
# `reload` is a builtin only on Python 2; on Python 3.4+ it lives in
# importlib. Import it when available so this cell runs on both.
try:
    from importlib import reload
except ImportError:
    pass  # Python 2: the builtin reload is used
reload(markings)

<module 'planet4.markings' from '/Users/klay6683/Dropbox/src/P4_sandbox/planet4/markings.py'>

In [39]:
# ImageID object for the same image_id that was used to filter `blotches`.
p4id = markings.ImageID('APF00002sx')

In [35]:
# Select matplotlib's Qt backend for the plot below.
%matplotlib qt

In [41]:
# Plot the reduced blotches selected above via the ImageID object.
p4id.plot_blotches(blotches=blotches)

In [14]:
def cluster_p4_id(image_id):
    """Cluster the blotch markings of a single image id.

    Parameters
    ----------
    image_id : str
        Identifier passed to ``markings.ImageID``.

    Returns
    -------
    pandas.DataFrame or None
        A frame built from the ``.data`` of every item returned by
        ``clustering.perform_dbscan``, with an added ``image_id`` column.
        ``None`` if ``perform_dbscan`` returned ``None``.
    """
    imgid = markings.ImageID(image_id)
    # NOTE(review): do_clustering passes fans=False explicitly for blotches;
    # this call relies on perform_dbscan's default - confirm it matches.
    reduced_blotches = clustering.perform_dbscan(imgid.get_blotches())
    # perform_dbscan can return None (do_clustering guards against it too);
    # iterating over None would raise TypeError.
    if reduced_blotches is None:
        return None
    df = pd.DataFrame([b.data for b in reduced_blotches])
    df['image_id'] = image_id
    return df